Example #1
0
def save_member_rels(source_company_id, members, SOURCE):
    """Link already-saved source members to a source company.

    For each member dict (must contain "key" and "position"), look up the
    source_member row for (SOURCE, key); if it exists and no
    source_company_member_rel row links it to source_company_id yet, insert
    one with the position classified by name_helper.position_check.

    Members whose source_member row is missing are silently skipped.
    """
    conn = db.connect_torndb()
    try:
        for m in members:
            member_key = m["key"]
            source_member = conn.get(
                "select * from source_member where source=%s and sourceId=%s order by id limit 1",
                SOURCE, member_key)
            if source_member is None:
                # Member was never saved for this source; nothing to link.
                continue

            source_member_id = source_member["id"]
            source_company_member_rel = conn.get(
                "select * from source_company_member_rel where \
                    sourceCompanyId=%s and sourceMemberId=%s", source_company_id,
                source_member_id)

            # Classify the position text into a numeric relation type.
            # Renamed from `type` to stop shadowing the builtin.
            rel_type = name_helper.position_check(m["position"])
            logger.info("position %s, type %s", m["position"], rel_type)
            if source_company_member_rel is None:
                conn.insert(
                    "insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                            position,type,createTime,modifyTime) \
                            values(%s,%s,%s,%s, now(),now())", source_company_id,
                    source_member_id, m["position"], rel_type)
    finally:
        # Always release the connection, even when a query raises.
        conn.close()
def parseMember_save(source_company_id, item, download_crawler):
    """Parse team members from a crawled 36kr company page and persist them.

    `item` must carry "key" (the company's source id) and "content"
    (raw HTML bytes, utf-8). Each `.startups-member` node yields one
    source_member plus a source_company_member_rel, both saved through
    parser_db_util.save_member_standard.

    On a save failure the error is logged with its traceback and the
    process exits (fail-fast, preserving the original behavior).
    """
    logger.info("parseMember_save")

    companyKey = item["key"]
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    members = d('.startups-member')
    for m in members:
        name = d(m)('.media-heading').text()
        logger.info(name)

        desc = d(m)('.desc').text()
        position = d(m)('.title').text()

        logo = 'http:' + d(m)(".media-object").attr('src').replace(
            '@!logom', '')
        # Placeholder avatars contain "default" (or, on some pages, the
        # misspelled "deafult") in their URL -- treat those as "no photo".
        if logo.find('deafult') >= 0 or logo.find('default') >= 0:
            logo = None

        if logo:
            # Normalize the scheme so downstream fetching is uniform.
            logo = logo.replace("https://", "http://")

        sourceId = d(m)('.media-body a').attr('href')
        if sourceId is not None:
            sourceId = str(companyKey) + '_' + sourceId.split(
                'person/')[-1].strip()
        else:
            # No personal page link: derive a stable id from the name.
            sourceId = str(
                companyKey) + '_' + kr36_company_parser_2.get_company_code(
                    name)

        source_member = {
            "source": SOURCE,
            "sourceId": sourceId,
            "name": name,
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position[:50],
            "description": desc,
            "education": None,
            "work": None
        }
        ptype = name_helper.position_check(position)

        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position[:50],
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
        except Exception as ex:  # `as` syntax works on py2.6+ and py3
            # Log the full traceback (the old form logged the bare
            # Exception class, which carries no information).
            logger.exception(ex)
            exit()
Example #3
0
def save_member(r, SOURCE, download_crawler):
    """Insert or update one source_member row and link it to its company.

    `r` is a flat record:
    (member_key, name, weibo, introduction, education, work, location,
     role, pictureUrl, company_key, position).

    The member photo is only downloaded when the stored row has no photo
    yet. If company_key resolves to a known source_company and no
    source_company_member_rel exists, one is inserted with the position
    classified by name_helper.position_check.
    """
    member_key, name, weibo, introduction, education, work, location, role, pictureUrl, company_key, position = r
    conn = db.connect_torndb()
    try:
        source_member = conn.get(
            "select * from source_member where source=%s and sourceId=%s order by id limit 1",
            SOURCE, member_key)
        logo_id = None
        # Fetch the photo only when none is stored (`is None`, not `== None`).
        if (source_member is None or source_member["photo"] is None
                or source_member["photo"] == ""):
            if pictureUrl is not None and pictureUrl != "":
                (logo_id, w,
                 h) = parser_mysql_util.get_logo_id_new(pictureUrl,
                                                        download_crawler, SOURCE,
                                                        member_key, "member")
        else:
            logo_id = source_member["photo"]

        if source_member is None:
            sql = "insert source_member(name,photo,weibo,location,role,description,\
            education,work,source,sourceId,createTime,modifyTime,processStatus) \
            values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),0)"

            source_member_id = conn.insert(sql, name, logo_id, weibo, location,
                                           role, introduction, education, work,
                                           SOURCE, member_key)
        else:
            source_member_id = source_member["id"]
            sql = "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\
            education=%s,work=%s,modifyTime=now(),processStatus=0 where id=%s"

            conn.update(sql, name, logo_id, weibo, location, role, introduction,
                        education, work, source_member_id)

        if company_key is not None:
            source_company = conn.get(
                "select * from source_company where source=%s and sourceId=%s",
                SOURCE, company_key)
            if source_company is not None:
                source_company_id = source_company["id"]
                source_company_member_rel = conn.get(
                    "select * from source_company_member_rel where \
                        sourceCompanyId=%s and sourceMemberId=%s",
                    source_company_id, source_member_id)
                if source_company_member_rel is None:
                    # `rel_type` instead of `type`: don't shadow the builtin.
                    rel_type = name_helper.position_check(position)
                    logger.info("position %s, type %s", position, rel_type)
                    conn.insert(
                        "insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                                position,type,createTime,modifyTime) \
                                values(%s,%s,%s,%s, now(),now())",
                        source_company_id, source_member_id, position, rel_type)
    finally:
        # Always release the connection, even when a query raises.
        conn.close()
def parseMember_save(source_company_id, item, download_crawler):
    """Parse team members from a crawled company page and save each one.

    `item` must carry "key" and "content" (page HTML). Each
    `ul.team-list > li` entry becomes a source_member plus a
    source_company_member_rel saved via parser_db_util.save_member_standard.
    Per-member failures are logged and skipped.

    Returns None (also when `item` is None).
    """
    if item is None:
        return None

    company_key = item["key"]
    # Renamed from `html` so the module-level `html` import used elsewhere
    # in this file is not shadowed.
    content = item["content"]
    d = pq(content)

    logger.info("*** member ****")
    lis = d('ul.team-list> li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('div.per-name> a').text().strip()
            member_key = l('div.per-name> a').attr("href").split("/")[-1]
            position = l('div.per-position').text().strip()
            logo = l('a.avatar> img').attr("src")
            desc = l('div.per-des').text().strip()

            logger.info(
                "member_key: %s, member_name: %s, position: %s, desc: %s" % (member_key, member_name, position, desc))
            source_member = {
                "source": SOURCE,
                "sourceId": str(member_key),
                "name": member_name,
                "photo_url": logo,
                "weibo": None,
                "location": 0,
                "role": position,
                "description": desc,
                "education": None,
                "work": None
            }

            ptype = name_helper.position_check(position)

            source_company_member_rel = {
                "sourceCompanyId": source_company_id,
                "position": position,
                "joinDate": None,
                "leaveDate": None,
                "type": ptype
            }

            parser_db_util.save_member_standard(source_member, download_crawler, source_company_member_rel)
        except Exception as ex:  # `as` syntax works on py2.6+ and py3
            # Skip this member on any parse/save error; keep processing the rest.
            logger.exception(ex)
def parseMember_save(source_company_id, item, download_crawler):
    """Parse members from a pre-parsed JSON payload and save each one.

    Reads item["content"]["member"]["data"]["members"]; entries without a
    "name" key are skipped. Save failures are logged (previously they were
    swallowed by a bare except) and do not stop the loop.
    """
    logger.info("parseMember_save")
    members = item["content"]["member"]["data"]["members"]
    for m in members:
        # `in` instead of the py2-only, removed dict.has_key().
        if "name" not in m:
            continue

        logger.info(m["name"])

        desc = m.get("intro")
        position = m.get("position", "")

        logo = m.get("avatar")
        if logo:
            # Normalize the scheme so downstream fetching is uniform.
            logo = logo.replace("https://", "http://")
        source_member = {
            "source": SOURCE,
            "sourceId": str(m["id"]),
            "name": m["name"],
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position,
            "description": desc,
            "education": None,
            "work": None
        }
        ptype = name_helper.position_check(position)

        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
        except Exception as ex:
            # Best-effort per member: log instead of silently discarding,
            # and no longer catch SystemExit/KeyboardInterrupt.
            logger.exception(ex)
def parse_member(item):
    """Build a list of member dicts (with mongo _memberId) from a crawled page.

    `item` carries "key" and "content" (page HTML). For each
    `ul.list-prodcase > li` entry the member key is taken from the
    person/<id> href; members without a mongo memberId are skipped.

    NOTE(review): `members` is accumulated but no `return members` appears
    in this excerpt -- as shown the function falls through and returns
    None, while the item-is-None guard returns []. Confirm against the
    original source.
    """
    if item is None:
        return []

    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)

    members = []
    # members
    logger.info("*** member ****")
    lis = d('ul.list-prodcase> li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('h4> a> b> span.c').text().strip()
            position = l('h4> a> b> span.c-gray').text().strip()
            # NOTE(review): `str` shadows the builtin; rename when touched.
            str = l('h4> a').attr("href").strip()
            (member_key, ) = util.re_get_result(r'person/(\d*?)$', str)
            logger.info("member_key: %s, member_name: %s, position: %s" %
                        (member_key, member_name, position))
            memberId = parser_mongo_util.find_mongo_memberId(
                SOURCE, member_key)
            if memberId is None:
                # No canonical member record for this source key; skip.
                continue
            # NOTE(review): `type` shadows the builtin; rename when touched.
            type = name_helper.position_check(position)
            member = {
                "_memberId": memberId,
                "name": member_name,
                "position": position,
                "type": type
            }
            members.append(member)
        except Exception, ex:
            # Per-member errors are logged and skipped.
            logger.exception(ex)
Example #7
0
# Logger setup for the Member_data_clean job (stream=True also logs to stdout).
loghelper.init_logger("Member_data_clean", stream=True)
logger = loghelper.get_logger("Member_data_clean")

if __name__ == "__main__":
    conn = db.connect_torndb()
    start = 0
    # Pass 1: (re)classify company_member_rel rows whose type is missing or
    # not one of the recognized codes (5010/5020/5030/5040).
    while True:
        membertitles = list(
            conn.query(
                "select * from company_member_rel where (active is null or active='Y') and (type not in (5010,5020,5030,5040) or type is null) order by id limit %s, 1000",
                start))
        if len(membertitles) == 0:
            break
        for title in membertitles:
            # NOTE(review): `type` shadows the builtin; rename when touched.
            type = name_helper.position_check(title["position"])
            logger.info("%s->%s", title["position"], type)
            conn.update(
                "update company_member_rel set type=%s,modifyTime=now(),modifyUser=139 where id=%s",
                type, title["id"])

        # NOTE(review): `start` is never advanced -- the loop terminates
        # only because updated rows drop out of the WHERE filter; any row
        # whose position_check result stays outside (5010..5040) would
        # loop forever. TODO confirm.

    # Pass 2: same cleanup for source_company_member_rel (loop body
    # continues beyond this excerpt).
    while True:
        membertitles = list(
            conn.query(
                "select * from source_company_member_rel where type not in (5010,5020,5030,5040) or type is null order by id limit %s, 1000",
                start))
        if len(membertitles) == 0:
            break
        for title in membertitles:
Example #8
0
def parseMember_save(source_company_id, item):
    """Parse managers from a crawled company page and save each one.

    `item` carries "key" and "content" (page HTML). Each
    `.manager_list > li` entry becomes a source_member (sourceId =
    "<companyKey>_<rank>") plus a source_company_member_rel saved via
    parser_db_util.save_member_standard. Per-member failures are logged
    and skipped.

    NOTE(review): `download_crawler` is not a parameter here -- it must be
    a module-level name; verify against the rest of the file.
    """
    if item is None:
        return
    logger.info("*** member ***")
    html = item["content"]
    d = pq(html)

    lis = d('.manager_list > li')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('img').attr('src')
                # Give protocol-relative URLs an explicit scheme.
                # NOTE(review): startswith("http") already covers "https",
                # so the second test is redundant; raises AttributeError
                # when attr('src') is None -- caught by the except below.
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                # Synthetic stable id: the page exposes no member id.
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p.item_manager_name > span').text()
                member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p.item_manager_title').text()

                member_desc = mem('div.item_manager_content').text()

                # Only weibo.com profile links are kept as the weibo field.
                weibo = None
                if member_link is not None:
                    if 'weibo.com' in member_link:
                        weibo = member_link

                source_member = {
                    'name': member_name,
                    'photo_url': logo_url,
                    'weibo': weibo,
                    'location': None,
                    'role': member_position,
                    'description': member_desc,
                    'education': None,
                    'work': None,
                    'source': SOURCE,
                    'sourceId': member_key,
                }
                # Classify the position text into a numeric relation type.
                ptype = name_helper.position_check(member_position)
                source_company_member_rel = {
                    'sourceCompanyId': source_company_id,
                    'position': member_position,
                    'joinDate': None,
                    'leaveDate': None,
                    'type': ptype
                }
                logger.info(
                    json.dumps(source_member,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))

                parser_db_util.save_member_standard(source_member,
                                                    download_crawler,
                                                    source_company_member_rel)

            except Exception, ex:
                # Per-member errors are logged and skipped.
                logger.exception(ex)