Beispiel #1
0
def get_android_domain(app_market, app_id):
    """Resolve the identifier stored as an Android artifact's domain.

    For markets 16010 and 16020 the app is looked up through
    ``parser_mongo_util.find_android_market`` and the ``apkname`` of the
    returned record is used; ``None`` is returned when the lookup yields
    nothing. For every other market ``app_id`` is returned unchanged.
    """
    if app_market not in (16010, 16020):
        return app_id

    record = parser_mongo_util.find_android_market(app_market, app_id)
    return record["apkname"] if record else None
Beispiel #2
0
def parse_company(item):
    """Parse a zhipin.com company page into a source-company dict.

    ``item`` is read for ``key`` (company id, also used to build the source
    URL), ``content`` (page HTML as UTF-8 encoded bytes) and ``url`` (used
    to avoid registering the page itself as a website artifact).

    Returns:
        ``None`` when ``item`` is ``None``; ``{"status": "No_Name"}`` when no
        usable short/full name is found or the name check fails; otherwise a
        dict describing the company, including ``artifacts`` and ``members``.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))

    name = d('h1.name').text().strip()

    fullName = d('div.company-business> h4').text()
    if fullName.find("来源") >= 0:
        # Keep only the last space-separated token of the heading.
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)

    if not name or not fullName:
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    # Prefer the shorter of the two as the display name.
    if len(name) > len(fullName):
        name = fullName
    if name.strip() == "":
        name = fullName

    _, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }

    # attr() yields None when the <img> or its src is missing; the original
    # code crashed on that case.  Only absolute URLs or "default" logos are
    # kept, everything else is dropped.
    logo = d('div.company-logo> img').attr('src')
    if logo is not None:
        if not (logo.startswith("http") or logo.find("default") >= 0):
            logo = None

    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)

    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:',"").replace("收起","").replace("展开","").replace("&nbsp;","").strip()

    field = ''
    stage = ''
    headCount = ''
    address = ''
    try:
        # The summary line is only trusted when it has exactly three
        # space-separated parts: stage, head count, field.
        summary_parts = d('div.info-primary> p').text().strip().split(" ")
        if len(summary_parts) == 3:
            stage, headCount, field = summary_parts
    except Exception as e:
        # Best effort: keep the empty defaults, but leave a trace.
        logger.info("summary parse failed for %s: %s", company_key, e)

    headCount = headCount.replace("人", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): the range bounds stay strings here while the
            # other branches produce ints/None -- kept as in the original,
            # confirm what consumers expect.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None
            max_staff = None

    # NOTE(review): the original also mapped the stage label to a numeric
    # code, but that value was never emitted (the result hard-codes
    # "stage": 0).  Only the funding type depends on the label.
    funding_type = 8010 if stage == '不需要融资' else 0

    artifacts = []
    for linkp in d('div.company-products> ul> li> div.text> div.name> a'):
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)
        if website is None:
            # Presumably url_normalize returns None for unusable links
            # (TODO confirm); nothing can be registered for such a link.
            continue

        artifact_type, app_market, app_id = url_helper.get_market(website)
        if artifact_type == 4010:
            if item["url"] == website or website.find("zhipin") != -1:
                continue
            flag, domain = url_helper.get_domain(website)
            if flag is None:
                continue
            if flag is False:
                domain = None
        elif artifact_type == 4040:
            domain = app_id
            if domain is None:
                continue
        elif artifact_type == 4050:
            if app_market == 16010 or app_market == 16020:
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                domain = android_app["apkname"] if android_app else None
            else:
                domain = app_id
            if domain is None:
                continue
        else:
            # Includes 4020/4030: the original never produced an artifact
            # for those types either (its domain was always None).
            continue

        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": None,
            "link": website,
            "domain": domain
        })

    # parse members
    members = []
    for li in d('div.manager-list> div> ul >li> div'):
        mem = pq(li)
        try:
            photo_url = mem('div.info-user> img').attr('src')
            if photo_url is None:
                # Entries without a photo were skipped by the original too
                # (through a swallowed AttributeError).
                continue
            if not photo_url.startswith("http"):
                photo_url = "http:" + photo_url

            members.append({'name': mem('p> span.name').text(),
                            'photo_url': photo_url,
                            'weibo': None,
                            'location': None,
                            'role': mem('p> span.job-title').text(),
                            'description': mem('div.item_manager_content').text(),
                            'education': None,
                            'work': None
                            })
        except Exception as e:
            # Best effort: one broken entry must not lose the whole company.
            logger.info("member parse failed for %s: %s", company_key, e)

    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html", "")
    else:
        sourceId2 = None

    source_company = {
                      "name": name,
                      "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
                      "description": desc,
                      "brief": brief,
                      "round": None,
                      "roundDesc": None,
                      "companyStatus": 2010,
                      'fundingType': funding_type,
                      "locationId": 0,
                      "address": address,
                      "phone": None,
                      "establishDate": None,
                      "logo": logo,
                      "source": SOURCE,
                      "sourceId": company_key,
                      "sourceId2": sourceId2,
                      "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
                      "field": field,
                      "headCountMin": min_staff,
                      "headCountMax": max_staff,
                      "artifacts": artifacts,
                      "members": members,
                      "status": 1,
                      "stage": 0,
                      }

    return source_company
def parse_artifact(source_company_id, item):
    """Build the artifact list for a company from its ``website`` field.

    ``item`` is read for ``name`` and for
    ``content.company_base.properties`` (optional ``short_description``) and
    ``content.company_base.overview_fields2`` (optional ``website.value``).
    ``source_company_id`` is accepted for interface compatibility and is
    not used.

    Returns:
        A list with at most one artifact dict (``type``, ``name``,
        ``description``, ``link``, ``domain``).  The list is empty when there
        is no website, when it points at twitter/linkedin/facebook, or when
        no domain/app id can be resolved for it.
    """
    name = item['name']
    logger.info('parse_artifact:%s' % name)

    artifacts = []
    company_base = item['content']['company_base']

    # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
    properties = company_base['properties']
    desc = ''
    if 'short_description' in properties:
        desc = properties['short_description']

    overview = company_base['overview_fields2']
    if 'website' not in overview:
        return artifacts

    website = url_helper.url_normalize(overview['website']['value'])
    if website is None:
        return artifacts
    # Social network profiles are not treated as company websites.
    if any(social in website for social in ('twitter', 'linkedin', 'facebook')):
        return artifacts

    artifact_type, app_market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is None:
            return artifacts
        if flag is False:
            domain = None
    elif artifact_type == 4040:
        domain = app_id
        if domain is None:
            return artifacts
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            domain = android_app["apkname"] if android_app else None
        else:
            domain = app_id
        if domain is None:
            return artifacts
    else:
        # Includes 4020/4030: the original never produced an artifact for
        # those types either (its domain was always None).
        return artifacts

    artifacts.append({
        "type": artifact_type,
        "name": name,
        "description": desc,
        "link": website,
        "domain": domain
    })
    return artifacts
Beispiel #4
0
def parse_company(item):
    """Parse a lagou.com company page into a source-company dict.

    ``item`` is read for ``key`` (company id, also used to build the source
    URL), ``content`` (page HTML as UTF-8 encoded bytes) and ``url`` (used
    to avoid registering the page itself as a website artifact).

    Returns:
        ``None`` when ``item`` is ``None``; ``{"status": "No_Name"}`` when the
        page is a placeholder, no usable name is found or the name check
        fails; otherwise a dict describing the company, including
        ``artifacts`` and ``members``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = name_helper.company_name_normalize(title_link.attr('title'))
    if name is None or fullName is None or (name.find("拉勾") >= 0
                                            and company_key != "147"):
        return {
            "status": "No_Name",
        }
    # Prefer the shorter of the two as the display name.
    if len(name) > len(fullName):
        name = fullName
    if name.strip() == "":
        name = fullName

    _, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    # attr() yields None when the <img> or its src is missing; the original
    # code crashed on that case.
    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None:
        if not logo.startswith("http"):
            # Presumably a protocol-relative URL ("//host/...") -- TODO confirm.
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()

    desc = None
    if not (u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10):
        desc_html = d('.company_intro_text > .company_content').html()
        # html() yields None when the content node is missing.
        if desc_html is not None:
            desc_html = desc_html.replace(
                '<span class="text_over">展开</span>', '')
            desc = BeautifulSoup(desc_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        # Cut the unit suffix when present.  The original used index(),
        # whose ValueError on a missing suffix aborted this whole block and
        # silently dropped location and address.
        suffix_at = headCount.find(u'人')
        if suffix_at >= 0:
            headCount = headCount[0:suffix_at]
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception as e:
        # Best effort: keep whatever was parsed so far, but leave a trace.
        logger.info("basic info parse failed for %s: %s", company_key, e)

    headCount = headCount.replace("people", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): the range bounds stay strings here while the
            # other branches produce ints/None -- kept as in the original,
            # confirm what consumers expect.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None
            max_staff = None

    # NOTE(review): the original also mapped the stage label to a numeric
    # code, but that value was never emitted.  Only the funding type depends
    # on the label.
    funding_type = 8010 if stage == '不需要融资' else 0

    location_id = 0
    location_new = parser_mongo_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    domain = None
    keep = False
    if artifact_type == 4010:
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                keep = True
    elif artifact_type == 4040:
        domain = app_id
        keep = domain is not None
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        keep = domain is not None
    # 4020/4030 never produced an artifact in the original either (its
    # domain was always None), so they fall through with keep == False.
    if keep:
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    # parse members
    members = []
    for li in d('.manager_list > li'):
        mem = pq(li)
        try:
            photo_url = mem('img').attr('src')
            if photo_url is None:
                # Entries without a photo were skipped by the original too
                # (through a swallowed AttributeError).
                continue
            if not photo_url.startswith("http"):
                photo_url = "http:" + photo_url

            member_link = mem('p.item_manager_name > a').attr('href')
            weibo = None
            if member_link is not None and 'weibo.com' in member_link:
                weibo = member_link

            members.append({
                'name': mem('p.item_manager_name > span').text(),
                'photo_url': photo_url,
                'weibo': weibo,
                'location': None,
                'role': mem('p.item_manager_title').text(),
                'description': mem('div.item_manager_content').text(),
                'education': None,
                'work': None
            })
        except Exception as e:
            # Best effort: one broken entry must not lose the whole company.
            logger.info("member parse failed for %s: %s", company_key, e)

    source_company = {
        "name": name,
        "fullName":
        fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(location_id),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.lagou.com/gongsi/%s.html" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1
    }

    return source_company
def parse_artifact(item):
    """Extract website/app artifacts from the product list of a page.

    ``item["content"]`` is the page HTML.  Only product entries tagged as
    website or ``app`` are considered.

    Returns:
        A list of dicts with ``type``, ``name``, ``desc``, ``link`` and
        ``domain``; an empty list when ``item`` is ``None``.
    """
    if item is None:
        return []

    artifacts = []
    d = pq(item["content"])

    # artifact
    logger.info("*** artifact ***")
    for li in d('ul.list-prod> li> div.on-edit-hide'):
        entry = pq(li)
        strtype = entry('h4> span.tag').text().strip()
        if strtype != u"网站" and strtype != "app":
            continue

        # attr() yields None when the anchor or its href is missing; the
        # original crashed on that case.
        link = (entry('h4> b> a').attr("href") or "").strip()
        if link == "":
            continue

        # One classification is enough: the original repeated this call on
        # the very same link for website entries that were not type 4010.
        # Assumes get_market has no side effects -- TODO confirm.
        artifact_type, app_market, app_id = url_helper.get_market(link)
        domain = None
        if artifact_type == 4010 and strtype == u"网站":
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is None:
                continue
            if flag is False:
                domain = None
        else:
            if artifact_type == 4040:
                domain = app_id
            elif artifact_type == 4050:
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_mongo_util.find_android_market(
                        app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id
            # Types 4020/4030 are kept even without a domain; anything else
            # (including an "app" entry whose link is a plain website) needs
            # one.
            if domain is None and artifact_type != 4030 and artifact_type != 4020:
                continue

        name = entry('h4> b').text().strip()
        desc = entry('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s" %
                    (artifact_type, name, link, desc))
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": domain
        })

    logger.info("")
    return artifacts
def parse_base(item):
    """Parse the basic company information from a company detail page.

    ``item`` is read for ``key`` (stored as ``sourceId``) and ``content``
    (page HTML).

    Returns:
        ``None`` when ``item`` is ``None`` or when neither a short name nor
        a full company name can be found; otherwise a dict with names,
        description, funding round, status, location id, establish date,
        logo, field/sub field, tags and the website ``artifacts``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # The title reads "<product>/<company short name>"; child elements of
    # the <b> node are removed before its text is read.
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace(
        "公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', text)
    if result is not None:
        (year, month) = result
        # Fall back to January for a missing or out-of-range month.
        try:
            if not 1 <= int(month) <= 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        # The pattern also matches empty/short digit runs, which strptime
        # rejects; treat such dates as unknown instead of crashing.
        try:
            establish_date = datetime.datetime.strptime(
                "%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            establish_date = None
    logger.info("establish date: %s" % establish_date)

    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        # Prefer the city, fall back to the province.
        for place in (city, province):
            found = parser_mongo_util.get_location(place)
            if found is not None:
                locationId = found["locationId"]
                break

    if locationId == 0:
        # Last resort: derive the location from the company name.
        loc1, loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            found = parser_mongo_util.get_location(loc1)
            if found is not None:
                locationId = found["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % text)
    # NOTE(review): both labels map to 8020, as in the original -- confirm
    # that the second one is not meant to use a different code.
    if text == "融资需求 · 需要融资":
        funding_type = 8020
    elif text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s" % desc)

    # logo
    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # attr() yields None when the link is missing; the original crashed on
    # that case.
    website = (d('div.link-line> a.weblink').attr("href") or "").strip()
    if website == "http://%e6%9a%82%e6%97%a0":
        # Placeholder link used by the page when no website is known.
        website = ""
    website = url_helper.url_normalize(website)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    domain = None
    keep = False
    if artifact_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is not None:
            if flag is False:
                domain = None
            keep = True
    elif artifact_type == 4040:
        domain = app_id
        keep = domain is not None
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        keep = domain is not None
    if keep:
        artifacts.append({
            "type": artifact_type,
            "name": product_name,
            "desc": desc,
            "link": website,
            "domain": domain
        })

    # funding round status
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "name": product_name,
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }