Example #1

# All seven excerpts below omit their module headers; they assume imports
# along the lines of:
#   from pyquery import PyQuery as pq
#   from lxml import html
#   import json
#   import logging
#   logger = logging.getLogger(__name__)
# plus project-local helpers (parser_db_util, name_helper, util, SOURCE,
# kr36_company_parser_2, type_map) that are not shown here.

def parseMember_save(source_company_id, item, download_crawler):
    logger.info("parseMember_save")

    companyKey = item["key"]
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    members = d('.startups-member')
    for m in members:
        name = d(m)('.media-heading').text()
        logger.info(name)

        desc = d(m)('.desc').text()
        position = d(m)('.title').text()

        logo = 'http:' + d(m)(".media-object").attr('src').replace(
            '@!logom', '')
        # Drop placeholder avatars; the code guards against both "default"
        # and a misspelled "deafult" variant in the image path.
        if 'deafult' in logo or 'default' in logo:
            logo = None

        if logo:
            logo = logo.replace("https://", "http://")

        sourceId = d(m)('.media-body a').attr('href')
        if sourceId is not None:
            sourceId = str(companyKey) + '_' + sourceId.split(
                'person/')[-1].strip()
        else:
            sourceId = str(
                companyKey) + '_' + kr36_company_parser_2.get_company_code(
                    name)

        source_member = {
            "source": SOURCE,
            "sourceId": sourceId,
            "name": name,
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position[:50],
            "description": desc,
            "education": None,
            "work": None
        }
        ptype = name_helper.position_check(position)

        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position[:50],
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
            # logger.info(source_member)
            # logger.info(source_company_member_rel)
        except Exception as ex:
            logger.exception(ex)
            exit()  # fail fast: abort the crawl on a save error
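
A minimal driver sketch for this variant: only the item shape ({"key": ..., "content": raw bytes}) comes from the code above; the requests call and the 36kr URL pattern are assumptions for illustration.

# Hypothetical driver; the URL pattern below is an assumption, only the
# item fields ("key", "content") come from the snippet above.
import requests

def fetch_member_item(company_key):
    url = "https://36kr.com/company/%s" % company_key  # assumed pattern
    resp = requests.get(url, timeout=10)
    # Keep raw bytes: parseMember_save decodes item['content'] itself.
    return {"key": company_key, "content": resp.content}

parseMember_save(source_company_id=1,
                 item=fetch_member_item(123456),
                 download_crawler=None)
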
Example #2
def parseMember_save(source_company_id, item, download_crawler):
    if item is None:
        return None

    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)

    members = []
    # members
    logger.info("*** member ****")
    lis = d('ul.team-list> li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('div.per-name> a').text().strip()
            member_key = l('div.per-name> a').attr("href").split("/")[-1]
            position = l('div.per-position').text().strip()
            logo = l('a.avatar> img').attr("src")
            desc = l('div.per-des').text().strip()

            logger.info(
                "member_key: %s, member_name: %s, position: %s, desc: %s" % (member_key, member_name, position, desc))
            source_member = {
                "source": SOURCE,
                "sourceId": str(member_key),
                "name": member_name,
                "photo_url": logo,
                "weibo": None,
                "location": 0,
                "role": position,
                "description": desc,
                "education": None,
                "work": None
            }
            # member = {
            #     "key":member_key,
            #     "name":member_name,
            #     "position":position
            # }

            ptype = name_helper.position_check(position)

            source_company_member_rel = {
                "sourceCompanyId": source_company_id,
                "position": position,
                "joinDate": None,
                "leaveDate": None,
                "type": ptype
            }

            parser_db_util.save_member_standard(source_member, download_crawler, source_company_member_rel)
            # members.append(member)
        except Exception as ex:
            logger.exception(ex)
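
The selectors above are easy to check in isolation. A self-contained pyquery sketch against a hand-written fragment; the markup is invented, only the class names come from the snippet:

from pyquery import PyQuery as pq

sample = """
<div>
  <ul class="team-list">
    <li>
      <a class="avatar"><img src="//img.example.com/a.png"/></a>
      <div class="per-name"><a href="/person/42">Alice</a></div>
      <div class="per-position">CEO</div>
      <div class="per-des">Founder.</div>
    </li>
  </ul>
</div>
"""
l = pq(pq(sample)('ul.team-list> li')[0])
print(l('div.per-name> a').text())                       # Alice
print(l('div.per-name> a').attr("href").split("/")[-1])  # 42
print(l('div.per-position').text())                      # CEO
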
Example #3
def parseMember_save(source_company_id, item, download_crawler):
    company_key = item["sourceId"]
    logger.info("parseMember_save")
    if "jqkaBrief" not in item:
        return
    m = {"name": item["jqkaBrief"]["chairman"], "job": "董事长"}
    try:
        if m["name"] is None or m["name"].strip() == "":
            return

        position = m.get("job", "")

        if position.find("董事长") == -1:
            return

        logger.info("%s-%s", m["name"], position)
        source_member = {
            "source": SOURCE,
            "sourceId": str(company_key) + '_' + get_company_code(m["name"]),
            "name": m["name"],
            "photo_url": None,
            "weibo": None,
            "location": 0,
            "role": position,
            "description": None,
            "education": None,
            "work": None
        }
        # ptype = name_helper.position_check(position)
        ptype = 5010

        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        # try:
        logger.info(
            json.dumps(source_member,
                       ensure_ascii=False,
                       cls=util.CJsonEncoder))
        logger.info(
            json.dumps(source_company_member_rel,
                       ensure_ascii=False,
                       cls=util.CJsonEncoder))

        parser_db_util.save_member_standard(source_member, download_crawler,
                                            source_company_member_rel)
    except Exception as ex:
        logger.exception(ex)
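
For reference, the smallest input this variant accepts; the key names (sourceId, jqkaBrief, chairman) come from the code above, the values are invented:

# Invented sample input; only the key names are real.
item = {
    "sourceId": 600000,
    "jqkaBrief": {"chairman": u"张三"},
}
parseMember_save(source_company_id=1, item=item, download_crawler=None)
# Saves one member with role "董事长" (chairman) and hard-coded type 5010.
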
Example #4

def parseMember_save(source_company_id, item, download_crawler):
    name = item['name']
    logger.info('parseMember_save:%s' % name)
    members = item['content']['member']['current_employees']

    for m in members:
        # logger.info('*******%s'%m)
        # Default to an empty dict so the .get() lookups below cannot
        # fail on a missing person_identifier.
        person = m.get('person_identifier', {})
        name = person.get('value', '')
        # logger.info('name:%s', name)
        uuid = person.get('uuid', '')
        desc = None
        position = m.get('title', '')
        # logger.info('position:%s', position)
        logo = person.get('image_id', '')
        if logo:
            logo = 'https://crunchbase-production-res.cloudinary.com/image/upload/c_thumb,h_200,w_200,f_auto,g_faces,z_0.7,b_white,q_auto:eco/%s' % logo
        # logger.info('logo:%s', logo)
        source_member = {
            "source": SOURCE,
            "sourceId": uuid,
            "name": name,
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position,
            "description": desc,
            "education": None,
            "work": None
        }
        ptype = name_helper.crunchbase_position_check(position)
        # logger.info('ptype:%s',ptype)

        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        logger.info(json.dumps(source_member, ensure_ascii=False, indent=2))
        logger.info(
            json.dumps(source_company_member_rel, ensure_ascii=False,
                       indent=2))

        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
        except Exception as ex:
            logger.exception(ex)
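
The Crunchbase payload this variant walks, reduced to the key paths the code actually reads; every value here is invented:

# Invented sample; only the key paths (content.member.current_employees,
# person_identifier.value/uuid/image_id, title) come from the code above.
item = {
    "name": "Example Inc.",
    "content": {"member": {"current_employees": [{
        "title": "Co-Founder & CEO",
        "person_identifier": {
            "value": "Jane Doe",
            "uuid": "00000000-0000-0000-0000-000000000000",
            "image_id": "v1500000000/sample.png",
        },
    }]}},
}
parseMember_save(source_company_id=1, item=item, download_crawler=None)
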
Example #5
def parseMember_save(source_company_id, rel_type, members, download_crawler):
    logger.info("parseMember_save")
    for m in members:
        if "name" not in m:
            continue

        logger.info(m["name"])

        desc = m.get("intro")
        member_type = type_map.get(m.get("type"), "")
        position = m.get("position", "")
        if len(position) > 20:
            if desc is None:
                desc = position
            else:
                desc += '\n' + position
            position = member_type
        else:
            position = member_type + position

        logo = m.get("avatar")
        if logo:
            logo = logo.replace("https://", "http://")
        source_member = {
            "source": SOURCE,
            "sourceId": str(m["id"]),
            "name": m["name"],
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": None,
            "description": desc,
            "education": None,
            "work": None
        }

        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": type
        }
        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
        except Exception as ex:
            logger.exception(ex)
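
The 20-character rule above is easiest to see on a concrete value. A worked sketch with an invented type_map (the real one is module-level and not shown):

# Invented inputs; type_map and the member dict are placeholders.
type_map = {1: u"高管:"}
m = {"intro": None, "type": 1,
     "position": u"负责公司整体战略规划与投资者关系的资深管理人员"}  # 23 chars

desc = m.get("intro")
member_type = type_map.get(m.get("type"), "")
position = m.get("position", "")
if len(position) > 20:
    # An over-long "position" is really a description: fold it into desc
    # and fall back to the bare member-type prefix.
    desc = position if desc is None else desc + u'\n' + position
    position = member_type
else:
    position = member_type + position
print(position)  # u"高管:" -- the 23-char title moved into desc
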
Example #6
def parseMember_save(source_company_id, item, download_crawler):
    logger.info("parseMember_save")
    members = item["content"]["member"]["data"]["members"]
    for m in members:
        if "name" not in m:
            continue

        logger.info(m["name"])

        desc = m.get("intro")
        position = m.get("position", "")

        logo = m.get("avatar")
        if logo:
            logo = logo.replace("https://", "http://")
        source_member = {
            "source": SOURCE,
            "sourceId": str(m["id"]),
            "name": m["name"],
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position,
            "description": desc,
            "education": None,
            "work": None
        }
        ptype = name_helper.position_check(position)

        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
            # logger.info(source_member)
            # logger.info(source_company_member_rel)
        except Exception as ex:
            logger.exception(ex)
Example #7
def parseMember_save(source_company_id, item, download_crawler):
    if item is None:
        return
    logger.info("*** member ***")
    html = item["content"]
    d = pq(html)

    lis = d('.manager_list > li')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('img').attr('src')
                # Protocol-relative src values ("//host/...") need an explicit
                # scheme; startswith("http") already covers "https".
                if not logo_url.startswith("http"):
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p.item_manager_name > span').text()
                member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p.item_manager_title').text()

                member_desc = mem('div.item_manager_content').text()

                # print member_position
                # print member_name
                # print member_desc

                weibo = None
                if member_link and 'weibo.com' in member_link:
                    weibo = member_link

                source_member = {
                    'name': member_name,
                    'photo_url': logo_url,
                    'weibo': weibo,
                    'location': None,
                    'role': member_position,
                    'description': member_desc,
                    'education': None,
                    'work': None,
                    'source': SOURCE,
                    'sourceId': member_key,
                }
                ptype = name_helper.position_check(member_position)
                source_company_member_rel = {
                    'sourceCompanyId': source_company_id,
                    'position': member_position,
                    'joinDate': None,
                    'leaveDate': None,
                    'type': ptype
                }
                logger.info(
                    json.dumps(source_member,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))

                parser_db_util.save_member_standard(source_member,
                                                    download_crawler,
                                                    source_company_member_rel)

            except Exception as ex:
                logger.exception(ex)
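
All seven variants funnel into the same two helpers, whose implementations never appear in these snippets. The stubs below are inferred from the call sites alone and are hypothetical:

# Hypothetical stubs; signatures inferred purely from the calls above.

def position_check(position):
    # name_helper.position_check: maps a free-text title to an integer
    # relation-type code. 5010 (chairman) is the only code visible in
    # these examples (hard-coded in Example #3); everything else is guessed.
    if position and u"董事长" in position:
        return 5010
    return 0

def save_member_standard(source_member, download_crawler, source_company_member_rel):
    # parser_db_util.save_member_standard: persists one member plus the
    # company-member relation. Expected keys, per the dicts built above:
    #   source_member: source, sourceId, name, photo_url, weibo, location,
    #                  role, description, education, work
    #   source_company_member_rel: sourceCompanyId, position, joinDate,
    #                              leaveDate, type
    raise NotImplementedError("stub only; see parser_db_util in the project")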