Ejemplo n.º 1
0
def find_company_by_full_name(companyId, full_name):
    """Find another active company that carries the given full name.

    The name is normalized with ``name_helper.company_name_normalize`` and
    matched first against ``company.fullName``.  Only when the module-level
    flag ``caflag`` is ``True`` are ``company_alias`` rows of type 12010
    checked as well.  ``companyId`` itself is excluded from both lookups and
    rows whose ``active`` column is ``'N'`` are ignored.

    Args:
        companyId: id of the company to exclude from the search.
        full_name: raw company full name; ``None`` or blank yields ``None``.

    Returns:
        The id of the matching company with the highest id, or ``None``
        when nothing matches.
    """
    logger.info("find_company_by_full_name")
    if full_name is None or full_name.strip() == "":
        return None

    full_name = name_helper.company_name_normalize(full_name)
    conn = db.connect_torndb()
    # A single connection serves both lookups; ``finally`` guarantees it is
    # closed even when a query raises.
    try:
        company = conn.get(
            "select * from company where fullName=%s and (active is null or active !='N') and id!=%s order by id desc limit 1",
            full_name, companyId)
        if company is not None:
            logger.info("find_company_by_full_name 1")
            return company["id"]

        # Alias lookup is optional; caflag is only read here, so no
        # ``global`` declaration is needed.
        if caflag is True:
            company_alias = conn.get(
                "select a.* from company_alias a join company c on c.id=a.companyId where (c.active is null or c.active !='N') \
                                    and a.type=12010 and a.name=%s and c.id!=%s order by c.id desc limit 1",
                full_name, companyId)
            if company_alias is not None:
                logger.info("find_company_by_full_name 2")
                return company_alias["companyId"]
    finally:
        conn.close()
    return None
Ejemplo n.º 2
0
def find_company_by_full_name(full_name):
    """Look up an active alias row by normalized full name.

    ``corporate_alias`` is searched first and ``company_alias`` second; rows
    whose ``active`` column is ``'N'`` are ignored.

    Args:
        full_name: raw full name; ``None``, empty or whitespace-only input
            yields ``None`` without touching the database.

    Returns:
        The ``id`` column of the first matching row, or ``None``.
        NOTE(review): this is the id of the alias row itself, not a
        corporateId/companyId -- confirm that callers expect that.
    """
    # Blank names are rejected the same way the two-argument variant does.
    if full_name is None or full_name.strip() == "":
        return None

    full_name = name_helper.company_name_normalize(full_name)
    conn = db.connect_torndb()
    # One connection for both lookups, always closed via ``finally``.
    try:
        corporate = conn.get(
            "select * from corporate_alias where name=%s and (active is null or active !='N') limit 1",
            full_name)
        if corporate is not None:
            logger.info("find_corporate_by_full_name 1")
            return corporate["id"]

        company = conn.get(
            "select * from company_alias where name=%s and (active is null or active !='N') limit 1",
            full_name)
        if company is not None:
            logger.info("find_company_by_full_name 1")
            return company["id"]
    finally:
        conn.close()

    return None
Ejemplo n.º 3
0
def parse_company(item):
    """Classify a crawled company page by the name found in its header.

    Args:
        item: mapping whose ``"content"`` entry holds the page HTML, or
            ``None``.

    Returns:
        ``None`` when ``item`` is ``None``; otherwise a dict whose
        ``"status"`` is

        * ``"No_Name"``     -- short name or full name could not be read,
        * ``"Sub_company"`` -- the full name contains "分公司" (the dict then
          also carries the normalized name under ``"name"``),
        * ``"good"``        -- anything else.
    """
    if item is None:
        return None

    d = pq(item["content"])

    # The previous version also read item["key"] and the logo URL; neither
    # influenced the result, and the logo handling raised AttributeError on
    # pages without a logo image, so both were dropped.
    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    fullName = title_link.attr('title')
    if name is None or fullName is None:
        return {"status": "No_Name"}

    # Normalize only once a title is known to exist.
    fullName = name_helper.company_name_normalize(fullName)
    if fullName is None:
        return {"status": "No_Name"}

    if fullName.find("分公司") >= 0:
        return {"status": "Sub_company", "name": fullName}

    return {"status": "good"}
Ejemplo n.º 4
0
def find_company_by_name(names):
    """Collect ids of active companies whose fullName or name matches.

    Every entry of ``names`` is normalized with
    ``name_helper.company_name_normalize`` and looked up first in
    ``company.fullName`` and then in ``company.name``.  Rows whose ``active``
    column is ``'N'`` are ignored.

    Args:
        names: iterable of raw company names.

    Returns:
        List of company ids without duplicates, in discovery order
        (each query is ordered by id descending).
    """
    queries = (
        "select * from company where fullName=%s and (active is null or active !='N') order by id desc",
        "select * from company where name=%s and (active is null or active !='N') order by id desc",
    )
    companyIds = []
    conn = None
    try:
        for name in names:
            name = name_helper.company_name_normalize(name)
            # Open the connection lazily and reuse it for every name; the
            # old code opened one per name and never closed any of them.
            if conn is None:
                conn = db.connect_torndb()
            for sql in queries:
                for company in conn.query(sql, name):
                    if company["id"] not in companyIds:
                        companyIds.append(company["id"])
    finally:
        if conn is not None:
            conn.close()
    return companyIds
Ejemplo n.º 5
0
def process(company):
    """Run the multi-step lookup for one company, retrying with new proxies.

    Written as a generator: every ``yield`` hands a call result
    (``get_proxy``, ``gen.sleep`` or one of the ``stepN`` helpers) to the
    driver and resumes with the value sent back in.
    NOTE(review): presumably driven as a Tornado coroutine -- the decorator
    is not visible here, confirm.

    Each pass of the loop obtains a proxy and walks step1..step7.  A step
    reporting failure (``False``, or ``-1`` from step3) starts a new pass
    with another proxy.  The loop ends when step3 returns ``0`` (company not
    found) or once step7 has run.

    Args:
        company: mapping with at least a ``"name"`` entry.
    """
    normalized = name_helper.company_name_normalize(company["name"])
    company_name = normalized.replace("'", "")

    while True:
        proxy = yield get_proxy()
        delay = random.randrange(3, 10)
        yield gen.sleep(delay)

        # step1: search the company page to obtain a TYCID (only needed
        # when the proxy does not carry one yet)
        if proxy.get("TYCID") is None:
            succeeded = yield step1(proxy, company_name)
            if succeeded is False:
                continue

        # step2: TYCID available, visit /tongji/companyname.json
        succeeded = yield step2(proxy, company_name)
        if succeeded is False:
            continue

        # step3: search to obtain tyc_company_id
        tyc_company_id = yield step3(proxy, company_name)
        if tyc_company_id == -1:
            continue
        if tyc_company_id == 0:
            # not found
            update_check_time(company, exist=False)
            return

        delay = random.randrange(1, 3)
        yield gen.sleep(delay)

        # step4..step6 share one signature; the first failure abandons this
        # pass and the while loop starts over with a new proxy.
        for detail_step in (step4, step5, step6):
            succeeded = yield detail_step(proxy, company_name, tyc_company_id)
            if succeeded is False:
                break
        else:
            update_check_time(company, exist=True)

            # step7
            yield step7(proxy, company_name, tyc_company_id)

            release_proxy(proxy)
            proxy_success(proxy)
            yield gen.sleep(1)
            return
Ejemplo n.º 6
0
def process(company, proxy):
    """Run the blocking multi-step lookup for one company through ``proxy``.

    Args:
        company: mapping with at least a ``"name"`` entry.
        proxy: mapping describing the proxy; its optional ``"TYCID"`` entry
            decides whether step1 has to run.

    Returns:
        ``False`` as soon as a step reports failure (``False``, or ``-1``
        from step3); ``True`` when the company was not found (step3 returned
        ``0``) or when every step ran.
    """
    company_name = name_helper.company_name_normalize(
        company["name"]).replace("'", "")

    pause = random.randrange(3, 10)
    time.sleep(pause)

    # step1: search the company page to obtain a TYCID (skipped when the
    # proxy already carries one)
    if proxy.get("TYCID") is None:
        if step1(proxy, company_name) is False:
            return False

    # step2: TYCID available, visit /tongji/companyname.json
    if step2(proxy, company_name) is False:
        return False

    # step3: search to obtain tyc_company_id
    tyc_company_id = step3(proxy, company_name)
    if tyc_company_id == -1:
        return False
    if tyc_company_id == 0:
        # not found
        update_check_time(company, exist=False)
        return True

    pause = random.randrange(1, 3)
    time.sleep(pause)

    # step4..step6 take identical arguments; stop at the first failure
    for detail_step in (step4, step5, step6):
        if detail_step(proxy, company_name, tyc_company_id) is False:
            return False
    update_check_time(company, exist=True)

    # step7
    step7(proxy, company_name, tyc_company_id)

    time.sleep(1)
    return True
Ejemplo n.º 7
0
def find_from_gongshang(name):
    """Queue the companies related to ``name`` in its gongshang record.

    The name is normalized and must pass ``name_helper.name_check`` as both
    Chinese and a company name.  From the matching document in
    ``mongo.info.gongshang`` every investor of type "企业投资" and every
    entry under ``"invests"`` is passed to ``add_2_company_list``.

    Args:
        name: raw company name.

    Returns:
        None.
    """
    name = name_helper.company_name_normalize(name)
    if name is None:
        return
    chinese, company = name_helper.name_check(name)
    if chinese is not True or company is not True:
        return

    gs = mongo.info.gongshang.find_one({"name": name})
    if gs is None:
        return

    # Both lists are treated as optional: dict.get replaces the
    # Python-2-only has_key() and a missing "investors" key no longer raises.
    for investor in gs.get("investors") or []:
        if investor["type"] == u"企业投资":
            logger.info("gongshang name: %s", investor["name"])
            add_2_company_list(investor["name"])
    for invest in gs.get("invests") or []:
        add_2_company_list(invest["name"])
Ejemplo n.º 8
0
def add_2_company_list(name):
    """Insert ``name`` into ``mongo.info.company_idx`` unless already there.

    The name is normalized first and is only stored when
    ``name_helper.name_check`` reports it as both Chinese and a company
    name.  Existing entries are detected through the ``name_md5`` field.

    Args:
        name: raw company name.

    Returns:
        None.
    """
    normalized = name_helper.company_name_normalize(name)
    if normalized is None:
        return

    chinese, company = name_helper.name_check(normalized)
    if chinese is not True or company is not True:
        return

    logger.info("fullname: %s", normalized)
    digest = util.md5str(normalized)
    if mongo.info.company_idx.find_one({"name_md5": digest}) is not None:
        return

    mongo.info.company_idx.insert_one({
        "name": normalized,
        "name_md5": digest,
        "createTime": datetime.datetime.utcnow()
    })
Ejemplo n.º 9
0
def parser(item):
    """Extract investor details from a crawled investor page.

    Args:
        item: mapping with ``"key"`` (investor key) and ``"content"`` (page
            HTML), or ``None``.

    Returns:
        ``None`` when ``item`` is ``None`` or no investor name could be
        derived; otherwise the tuple ``(investor_key, investor_name, logo,
        website, stageStr, fieldsStr, desc)``.  ``logo`` and ``website`` may
        be ``None``; the two tag strings are comma separated.
    """
    if item is None:
        return None

    investor_key = item["key"]
    d = pq(item["content"])

    investor_name = name_helper.company_name_normalize(
        d('div.picinfo> p> span.title').text())
    # Check for None *before* logging: the old code concatenated the name
    # into the log message first and raised TypeError on a missing name.
    if investor_name is None:
        logger.info("No investor name!!!")
        return None
    logger.info("investor_name: %s", investor_name)

    logo = d('div.pic> img').attr("src")
    if logo is not None:
        logo = logo.strip()
    logger.info("Investor Logo: %s", logo)

    website = d('span.links >a[target="_black"]').attr("href")
    if website is None or website.strip() == "暂无":
        website = None

    website = url_helper.url_normalize(website)
    flag, _ = url_helper.get_domain(website)
    if flag is None:
        website = None
    logger.info("Investor website: %s", website)

    # ``or ""`` keeps the chained string calls safe should text() hand back
    # None for an absent element.
    stageStr = (d('div.pad.block> div.list-tags.yellow').text() or "").replace(
        " ", ",").strip()
    logger.info("Investor rounds: %s", stageStr)

    fieldsStr = (d('div.pad.block> div.list-tags.darkblue').text() or "").replace(
        " ", ",").strip()
    logger.info("Investor fields: %s", fieldsStr)

    desc = (d('div.des').text() or "").strip()
    logger.info("Investor desc: %s", desc)

    return investor_key, investor_name, logo, website, stageStr, fieldsStr, desc
Ejemplo n.º 10
0
def save_beian_company_names(items, source, sourceId):
    """Store organizer names of beian records as source company names.

    Records flagged ``whoisExpire == 'Y'`` and records whose
    ``organizerType`` is not "企业" are skipped.  For the rest the organizer
    name is normalized and saved through ``save_mongo_source_company_name``
    unless ``collection_source_company`` already holds that name for the
    given source/sourceId pair.

    Args:
        items: iterable of beian record dicts.
        source: source identifier used for lookup and saving.
        sourceId: id of the record within ``source``.

    Returns:
        None.
    """
    for item in items:
        # dict.get replaces the Python-2-only has_key() + index pair.
        if item.get("whoisExpire") == 'Y':
            continue

        if item["organizerType"] != "企业":
            continue

        company_name = name_helper.company_name_normalize(item["organizer"])

        existing = collection_source_company.find_one({
            "source": source,
            "sourceId": sourceId,
            "source_company_name.name": company_name,
        })
        if existing is not None:
            continue

        scndata = {
            "name": company_name,
            "chinese": 'Y',
            "type": 12010,
            "extended": 'Y',
        }
        save_mongo_source_company_name(source, sourceId, scndata)
Ejemplo n.º 11
0
def find_company_by_fullname(full_name):
    """Return ids of active companies whose fullName or name matches.

    ``full_name`` is normalized with ``name_helper.company_name_normalize``
    and compared against ``company.fullName`` first and ``company.name``
    second.  Rows whose ``active`` column is ``'N'`` are ignored.

    Args:
        full_name: raw company name.

    Returns:
        List of company ids without duplicates, in discovery order
        (each query is ordered by id descending).
    """
    queries = (
        "select * from company where fullName=%s and (active is null or active !='N') order by id desc",
        "select * from company where name=%s and (active is null or active !='N') order by id desc",
    )
    fcompanyIds = []
    full_name = name_helper.company_name_normalize(full_name)
    conn = db.connect_torndb()
    # ``finally`` releases the connection even when a query raises.
    try:
        for sql in queries:
            for company in conn.query(sql, full_name):
                if company["id"] not in fcompanyIds:
                    fcompanyIds.append(company["id"])
    finally:
        conn.close()
    return fcompanyIds
Ejemplo n.º 12
0
def save_company_name(app, item_of_name, source, sourceId):
    """Save the company name found in ``app[item_of_name]`` for a source.

    Nothing happens when the value is ``None``/blank or when
    ``collection_source_company`` already holds the normalized name for the
    given source/sourceId pair.  Otherwise a name record of type 12010 is
    handed to ``save_mongo_source_company_name``; its ``"chinese"`` flag
    comes from ``name_helper.name_check`` on the raw (un-normalized) value.

    Args:
        app: mapping holding the raw name.
        item_of_name: key under which the name is stored in ``app``.
        source: source identifier.
        sourceId: id of the record within ``source``.

    Returns:
        None.
    """
    raw_name = app[item_of_name]
    if raw_name is None or raw_name.strip() == "":
        return

    company_name = name_helper.company_name_normalize(raw_name)

    lookup = {
        "source": source,
        "sourceId": sourceId,
        "source_company_name.name": company_name,
    }
    if collection_source_company.find_one(lookup) is not None:
        return

    chinese, _ = name_helper.name_check(raw_name)
    save_mongo_source_company_name(source, sourceId, {
        "name": company_name,
        "chinese": "Y" if chinese is True else "N",
        "type": 12010,
        "extended": 'Y',
    })
Ejemplo n.º 13
0
def find_companies_by_full_name_corporate(full_names):
    """Resolve full names to company ids through corporate aliases.

    Each non-empty name is normalized and matched against active
    ``corporate_alias`` rows of active corporates.  For every alias found,
    one active company with the same ``corporateId`` is fetched and its id
    collected.

    Args:
        full_names: iterable of raw full names; ``None`` and ``""`` entries
            are skipped.

    Returns:
        List of company ids without duplicates, in discovery order.
    """
    companyIds = []
    conn = None
    try:
        for full_name in full_names:
            if full_name is None or full_name == "":
                continue

            full_name = name_helper.company_name_normalize(full_name)

            # One lazily opened connection is shared by all names and is
            # released in ``finally`` even when a query raises.
            if conn is None:
                conn = db.connect_torndb()
            corporate_aliases = conn.query("select a.* from corporate_alias a join corporate c on c.id=a.corporateId where "
                                           "(c.active is null or c.active !='N') and (a.active is null or a.active !='N') "
                                           "and a.name=%s",
                                           full_name)
            for ca in corporate_aliases:
                company = conn.get("select * from company where corporateId=%s and (active is null or active!='N') limit 1",
                                   ca["corporateId"])
                if company is None:
                    continue
                logger.info("find_company_by_full_name %s: %s", full_name, company["id"])
                if company["id"] not in companyIds:
                    companyIds.append(company["id"])
    finally:
        if conn is not None:
            conn.close()
    return companyIds
Ejemplo n.º 14
0
def parse_company(item):
    """Convert a crawled company record into a source-company dict.

    Args:
        item: mapping with ``"key"`` (source id) and ``"content"``; the
            company data is read from
            ``item["content"]["company_base"]["data"]``.

    Returns:
        Dict describing the company.  Entries this parser does not populate
        (``round``, ``address``, ``phone``, ``field``, most ``*Desc``
        entries, ...) are ``None``; ``locationId`` is ``0`` when no
        location could be resolved.

    Raises:
        KeyError: when a mandatory entry (``industryTag``, ``logo``,
            ``fullName``, ``name``, ``brief``, ``scale``) is missing.
    """
    logger.info("parse_company")
    company_key = item["key"]

    # company basic info
    c = item["content"]["company_base"]["data"]

    tags_str = ",".join([tag["name"] for tag in c["industryTag"]])

    logo = c["logo"]
    if logo:
        logo = logo.replace("https://", "http://")

    # startDate is divided by 1000 before time.localtime(), i.e. it is
    # treated as a millisecond timestamp; years up to 1980 are discarded.
    establish_date = None
    if "startDate" in c:
        started = time.localtime(c["startDate"] / 1000)
        if started.tm_year > 1980:
            establish_date = datetime.datetime(started.tm_year,
                                               started.tm_mon,
                                               started.tm_mday)

    # Resolve the location from address2 first; address1 is the fallback
    # while no non-zero location id has been found.
    location_id = 0
    for address in (c.get("address2"), c.get("address1")):
        if address is None:
            continue
        city = kr36_cities.get(str(address), None)
        if city is not None:
            location = parser_db_util.get_location(formCityName(city))
            if location is not None:
                location_id = location["locationId"]
        if location_id != 0:
            break

    # Drop underscores and anything that follows the last "公司".
    fullName = c["fullName"].replace("_", "")
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = name_helper.company_name_normalize(fullName)

    # Only the product and market descriptions are taken over; missing
    # sub-keys are tolerated instead of raising KeyError.
    productDesc = None
    marketDesc = None
    introduce = c.get("companyIntroduce") or {}
    product_service = introduce.get("productService")
    if product_service is not None and product_service.strip() != "":
        productDesc = product_service
    user_market = introduce.get("userMarket")
    if user_market is not None and user_market.strip() != "":
        marketDesc = user_market

    desc = ""
    if c.get("intro") is not None:
        desc = c["intro"].strip()

    min_staff, max_staff = _kr36_head_count(c["scale"])

    return {
        "name": c["name"],
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": productDesc,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": marketDesc,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": c["brief"],
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": min_staff,
        "headCountMax": max_staff
    }


def _kr36_head_count(scale):
    """Split a head-count label into ``(min_staff, max_staff)`` integers.

    "人" is removed first.  A blank label gives ``(None, None)``, "少于15"
    gives ``(1, 15)``, ``"a-b"`` gives ``(a, b)`` and a single number gives
    ``(number, None)``.  Parts that are not integers stay ``None``.
    """
    headCount = scale.replace("人", "")
    if headCount.strip() == "":
        return None, None
    if headCount == "少于15":
        return 1, 15

    min_staff = None
    max_staff = None
    staffarr = headCount.split('-')
    try:
        if len(staffarr) > 1:
            min_staff = int(staffarr[0])
            max_staff = int(staffarr[1])
        else:
            min_staff = int(staffarr[0].strip())
    except ValueError:
        # Best effort: keep whatever was parsed before the bad part.
        pass
    return min_staff, max_staff
Ejemplo n.º 15
0
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    Args:
        item: mapping with ``"key"`` (source id), ``"content"`` (UTF-8
            encoded page HTML) and ``"url"``, or ``None``.

    Returns:
        * ``None`` when ``item`` is ``None``;
        * ``{"status": "No_Name"}`` when the short or full name is missing
          or the full name fails ``name_helper.name_check`` as a company;
        * otherwise the company dict including ``"artifacts"`` and
          ``"members"`` lists.

    NOTE(review): the funding round label found on the page only influences
    ``fundingType``; ``"round"`` stays ``None`` and ``"stage"`` stays ``0``
    exactly as before -- confirm whether the round code should be returned.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    logger.info(company_key)
    d = pq((html.fromstring(item["content"].decode("utf-8"))))

    # ``or ""`` keeps the string calls safe should text() return None.
    name = (d('h1.name').text() or "").strip()

    fullName = d('div.company-business> h4').text() or ""
    if fullName.find("来源") >= 0:
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)

    if name == "" or fullName is None or fullName == "":
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }

    # Keep only logos that are absolute URLs or contain "default"; a
    # missing <img> no longer raises AttributeError.
    logo = d('div.company-logo> img').attr('src')
    if logo is not None and not (logo.startswith("http")
                                 or logo.find("default") >= 0):
        logo = None

    desc_text = d('div.job-sec> div.text').text() or ""
    logger.info("desc: %s", desc_text)
    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:',"").replace("收起","").replace("展开","").replace("&nbsp;","").strip()

    # The summary line is used only when it splits into exactly three
    # space separated parts: funding round, head count, field.
    field = ''
    stage = ''
    headCount = ''
    summary = (d('div.info-primary> p').text() or "").strip().split(" ")
    if len(summary) == 3:
        stage, headCount, field = summary

    min_staff, max_staff = _zhipin_head_count(headCount)

    funding_type = 0
    if stage == '不需要融资':
        funding_type = 8010

    artifacts = _zhipin_artifacts(d, item, name)
    members = _zhipin_members(d)

    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html", "")
    else:
        sourceId2 = None

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": None,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": 0,
        "address": '',
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceId2": sourceId2,
        "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }

    return source_company


def _zhipin_head_count(headCount):
    """Turn a head-count label into ``(min_staff, max_staff)`` integers.

    "人" is removed first.  "少于15" gives ``(1, 15)``, ``"a-b"`` gives
    ``(a, b)``, a single number gives ``(number, None)`` and anything that
    is not numeric gives ``(None, None)``.  Range bounds are converted to
    int so both shapes of label yield the same value type.
    """
    headCount = headCount.replace("人", "")
    if headCount == "少于15":
        return 1, 15

    staffarr = headCount.split('-')
    try:
        min_staff = int(staffarr[0].strip())
        max_staff = None
        if len(staffarr) > 1:
            max_staff = int(staffarr[1].strip())
    except ValueError:
        return None, None
    return min_staff, max_staff


def _zhipin_artifacts(d, item, name):
    """Collect product links of the page as artifact dicts."""
    artifacts = []
    for linkp in d('div.company-products> ul> li> div.text> div.name> a'):
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)
        if website is None:
            # Nothing usable to classify or store.
            continue

        market_type, app_market, app_id = url_helper.get_market(website)
        if market_type == 4010:
            # Skip the crawled page itself and any zhipin link.
            if item["url"] == website or website.find("zhipin") != -1:
                continue
            flag, domain = url_helper.get_domain(website)
            if flag is None:
                continue
            if flag is False:
                domain = None
        elif market_type == 4040:
            domain = app_id
            if domain is None:
                continue
        elif market_type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is None:
                continue
        else:
            # Types 4020/4030 never produced an artifact before either
            # (their domain was always None); other types are ignored.
            continue

        artifacts.append({
            "type": market_type,
            "name": name,
            "description": None,
            "link": website,
            "domain": domain
        })
    return artifacts


def _zhipin_members(d):
    """Collect the manager entries of the page as member dicts."""
    members = []
    for li in d('div.manager-list> div> ul >li> div'):
        mem = pq(li)
        try:
            logo_url = mem('div.info-user> img').attr('src')
            if logo_url is None:
                # Entries without a photo were skipped before as well (via
                # a swallowed AttributeError); now the skip is explicit.
                continue
            if not logo_url.startswith("http"):
                logo_url = "http:" + logo_url

            members.append({
                'name': mem('p> span.name').text(),
                'photo_url': logo_url,
                'weibo': None,
                'location': None,
                'role': mem('p> span.job-title').text(),
                'description': mem('div.item_manager_content').text(),
                'education': None,
                'work': None
            })
        except Exception as e:
            # Best effort: one broken entry must not abort the whole page.
            logger.info("skip member: %s", e)
    return members
Ejemplo n.º 16
0
def process(url, key, content):
    """Parse one app detail page, store the app and advance ``LATEST``.

    Pages that do not contain "360安全中心" are ignored.  For the rest an
    item dict is built from the embedded ``detail`` JavaScript object and
    the HTML, logged as JSON, handed to ``android.save`` and
    ``android.merge``, and the module-level ``LATEST`` is raised to ``key``
    when ``key`` is larger.

    Args:
        url: link of the page; stored as ``"link"``.
        key: numeric key of the app; stored as ``"key_int"`` and, as a
            string, as ``"key"``.
        content: raw page content (UTF-8 encoded).

    Returns:
        None.
    """
    global LATEST
    if content.find('360安全中心') == -1:
        return

    # Raw string: same pattern as before, without invalid escape sequences.
    r = r"var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(r, content)
    # The unpacking raises when the detail blob is not found.
    (b, ) = result
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    app_type = base["type"]
    package = base["pname"].strip()

    d = pq(html.fromstring(content.decode("utf-8")))

    # Description: text before "【基本信息】" in div.breif, falling back to
    # div#html-brief.  Best effort -- failures leave what was read so far.
    desc = ""
    try:
        desc = d('div.breif').text().strip()
        desc = desc.split("【基本信息】")[0].strip()
    except Exception:
        pass
    if desc == "":
        try:
            desc = d('div#html-brief').text().strip()
        except Exception:
            pass

    # The base-info cells are read by position: 0 author, 1 update date,
    # 2 version, 3 compatibility, 4 language.
    cells = d('div.base-info> table> tbody> tr> td')

    author = cells.eq(0).contents()[1].strip()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    # NOTE(review): the author is unconditionally reset, so the stored
    # "author" is always None.  This looks like a missing ``else:`` --
    # behaviour kept as is, please confirm the intent.
    author = None

    modify_date_str = cells.eq(1).contents()[1].strip()
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")

    versionname = None
    try:
        versionname = cells.eq(2).contents()[1].strip()
        if versionname.startswith("V"):
            versionname = versionname.replace("V", "")
    except Exception:
        pass

    compatibility = cells.eq(3).contents()[1].strip()
    language = cells.eq(4).contents()[1].strip()
    if language == "其他" and hz.is_chinese_string(desc):
        language = "中文"

    icon = d('div#app-info-panel> div> dl> dt >img').attr("src").strip()

    screenshots = []
    try:
        screenshots = d('div#scrollbar').attr("data-snaps").split(",")
    except Exception:
        pass

    commentbyeditor = None
    r = "<p><strong>【小编点评】</strong>(.*?)</p>"
    result = util.re_get_result(r, content)
    if result:
        (commentbyeditor, ) = result

    updates = None
    r = "<br/><b>【更新内容】</b><br/>(.*?)</div>"
    result = util.re_get_result(r, content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()

    tags = d("div.app-tags> a").text().replace(" ", ",")

    size = None
    r = "'size':'(.*?)'"
    result = util.re_get_result(r, content)
    if result:
        (size, ) = result
        size = int(size)

    # Download counter such as "3万": the unit suffix selects a multiplier.
    # (The score that used to be parsed here was never used and is gone.)
    downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace(
        "次", "").replace("+", "").strip()
    download = None
    try:
        for suffix, factor in (("千", 1000), ("万", 10000),
                               ("亿", 10000 * 10000)):
            if downloadstr.endswith(suffix):
                download = float(downloadstr.replace(suffix, "")) * factor
                break
        else:
            download = int(downloadstr)
    except Exception:
        traceback.print_exc()

    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app_type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)

    if LATEST < key:
        LATEST = key
Ejemplo n.º 17
0
def process(crawler, app, content):
    """Parse an app detail page, refresh its download count and persist it.

    Args:
        crawler: object whose ``crawl(url)`` returns a dict holding a
            ``'get'`` status and, on success, the body under ``'content'``.
        app: stored app record; ``key_int``, ``link``, ``apkname``,
            ``version``, ``size`` and ``type`` are read from it.
        content: HTML of the app detail page.

    Returns:
        None. The function returns early (saving nothing) when the page is
        the "bad URL" error page, when the first-level category is "游戏",
        or when the mosug response carries no suggestion list. Otherwise the
        assembled item is passed to ``android.save`` and ``android.merge``.
    """
    if content.find('请检查您所输入的URL地址是否有误') != -1:
        return

    key = app["key_int"]
    url = app["link"]

    d = pq(content)
    cate = d('div.nav> span >a').eq(1).text().strip()
    if cate == "游戏":
        return

    sub_cate = d('div.nav> span >a').eq(2).text().strip()
    name = d('h1.app-name> span').text().strip()
    downloadstr = d("span.download-num").eq(0).text().replace("下载次数:","").replace("+","").strip()
    # The label is free text scraped from the page; an empty or unexpected
    # value must not abort the whole record, so fall back to None.
    try:
        if downloadstr.endswith("千"):
            download = float(downloadstr.replace("千","")) * 1000
        elif downloadstr.endswith("万"):
            download = float(downloadstr.replace("万","")) * 10000
        elif downloadstr.endswith("亿"):
            download = float(downloadstr.replace("亿","")) * 10000 * 10000
        else:
            download = int(downloadstr)
    except ValueError:
        logger.info("cannot parse download count: %r", downloadstr)
        download = None
    logger.info("%s-%s, %s, %s", cate, sub_cate, name, download)

    mosug_url = "http://m.baidu.com/mosug?wd=%s&type=soft" % urllib.quote(name.encode("utf-8"))
    # Retry until the crawler reports success.
    while True:
        result = crawler.crawl(mosug_url)
        if result['get'] == 'success':
            mosug_content = result["content"]
            break

    data = json.loads(mosug_content)
    suggestions = data["result"].get("s")
    if suggestions is None:
        return

    # Prefer the exact download number of the suggestion whose docid matches
    # this app over the rounded figure scraped from the page.
    for dt in suggestions:
        if dt.get("package") is None:
            continue
        if long(dt["docid"]) == key:
            download = int(dt["download_num"])
            break

    # screenshot
    screenshots = [pq(img).attr("src") for img in d('img.imagefix')]

    # content
    desc = d('p.content').text()

    icon = d('div.app-pic> img').attr("src")
    author = d('div.origin-wrap> span> span').eq(1).text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    commentbyeditor = d('span.head-content').text()

    item = {
        "link": url,
        "apkname": app["apkname"],
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": None,
        "language": None,
        "tags": sub_cate,
        "version": app["version"],
        "updates": None,
        "size": app["size"],
        "compatibility": None,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app["type"],
        "key": str(key),
        "key_int": key,
        "download": download
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
Ejemplo n.º 18
0
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    Args:
        item: crawled record or None; ``key``, ``content`` (page HTML that is
            decoded as UTF-8) and ``url`` are read from it.

    Returns:
        None when ``item`` is None. ``{"status": "No_Name"}`` when the page
        is the "under construction" placeholder, when no usable name is
        found, or when ``name_helper.name_check`` does not flag the full
        name as a company. Otherwise the populated dict with ``"status": 1``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    # logo_id processed in parser_db_util

    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }
    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = title_link.attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    # The <img> may be missing or carry no src attribute; keep logo as None
    # in that case instead of failing on None.startswith.
    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None:
        if not logo.startswith("http"):
            # Protocol-relative URL ("//host/path").
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()

    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10:
        desc = None
    else:
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is None:
            desc = None
        else:
            desc_html = desc_html.replace(
                '<span class="text_over">展开</span>', '')
            # Reduce the HTML fragment to plain text.
            desc = BeautifulSoup(desc_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    basic_item = '#basic_container > .item_content >ul > li:eq(%d) > span'
    try:
        field = d(basic_item % 0).text()
        stage = d(basic_item % 1).text()
        headCount = d(basic_item % 2).text()
        # Cut at the u'人' suffix when present. A missing suffix must not
        # abort the block, otherwise location and address are never read.
        suffix_at = headCount.find(u'人')
        if suffix_at >= 0:
            headCount = headCount[0:suffix_at]
        location = d(basic_item % 3).text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception as e:
        # Best effort: fields not reached keep their empty defaults.
        logger.info("basic info parse failed: %s", e)

    headCount = headCount.replace("people", "")

    # NOTE(review): a "min-max" range yields the two bounds as strings,
    # while a single value is converted to int - confirm consumers accept both.
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except ValueError:
                min_staff = None
                max_staff = None

    # NOTE(review): only funding_type reaches the returned dict; the numeric
    # stage code computed here is not used further in this function.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []

    def add_artifact(type_code, domain):
        # All artifacts of this page share name, description and link.
        artifacts.append({
            "type": type_code,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    artifact_type, app_market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                add_artifact(4010, domain if flag is not False else None)
    elif artifact_type == 4040:
        if app_id is not None:
            add_artifact(4040, app_id)
    elif artifact_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            add_artifact(4050, domain)
    # Types 4020 and 4030 never had a domain assigned, so no artifact is
    # recorded for them.

    source_company = {
        "name": name,
        "fullName":
        fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }

    return source_company
Ejemplo n.º 19
0
    # lines = []
    cnt = 0
    tot = 0
    pb = 0
    for line in lines:
        # logger.info(line)
        names = [name.strip() for name in line.strip().split("+++")]
        if len(names) != 4:
            logger.info(line)
            exit()
        tot += 1
        shortname = names[0]
        fullName = names[1]
        brief = names[2]
        website = names[3]
        fullName = name_helper.company_name_normalize(fullName)
        # if len(brief) < 100:
        #     logger.info(brief)
        # if len(brief) == 0:
        #     brief = None
        #     logger.info("none")
        # if website is not None and website.strip() != "":
        #     logger.info(website)

        # logger.info("name:%s, fullName:%s", shortname, fullName)
        # company_ids = find_company.find_companies_by_full_name_corporate([fullName])
        #
        # if len(company_ids) != 0:
        #     logger.info("found : %s, %s", fullName, company_ids)
        #     cnt += 1
        # insert(shortname,fullName,brief,website)
Ejemplo n.º 20
0
def parse_company(item):
    """Parse a crawled company page into a source-company dict with members.

    Args:
        item: crawled record or None; ``key`` and ``content`` (page HTML that
            is decoded as UTF-8) are read from it.

    Returns:
        None when ``item`` is None. ``{"status": "No_Name"}`` when the page
        title yields no name or the normalized name is empty. Otherwise the
        populated dict with ``"status": 1``, an empty ``artifacts`` list and
        the parsed ``members``.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))

    # The name is the first whitespace-separated word of the <h1>. An empty
    # title must end up as "No_Name" rather than an IndexError.
    title_words = d('h1').text().split()
    if not title_words:
        logger.info("here1: %s", "")
        return {
            "status": "No_Name",
        }
    name = title_words[0].strip()

    fullName = name_helper.company_name_normalize(name)

    if not name or not fullName:
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName

    # The result is currently unused: the company-name check is not enforced
    # for this source.
    chinese, companycheck = name_helper.name_check(fullName)

    # Keep the logo only when it is an absolute http(s) URL or contains
    # "default"; a missing src attribute simply leaves it as None.
    logo = d('.bigELogo').attr('src')
    if logo is not None and not (logo.startswith("http")
                                 or logo.find("default") >= 0):
        logo = None

    brief = None
    desc_text = d('.profile').text()
    logger.info("desc: %s", desc_text)

    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace(
            "展开", "").replace("&nbsp;", "").strip()

    field = d('.comp-industry').text().strip()
    # The head count is the last word of the second intro item; an empty item
    # yields "" and therefore no head count.
    head_count_words = d('.new-compintro li:nth-child(2)').text().split()
    headCount = head_count_words[-1] if head_count_words else ""
    address = d('.new-compintro li:nth-child(3)').text().replace('公司地址:',
                                                                 '').strip()
    headCount = headCount.replace("人", "")

    # NOTE(review): a "min-max" range yields the two bounds as strings,
    # while a single value is converted to int - confirm consumers accept both.
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except ValueError:
                min_staff = None
                max_staff = None

    # Funding-stage mapping and artifact (website / app link) extraction are
    # disabled for this source: "stage" is always reported as 0 and
    # "artifacts" stays empty.
    artifacts = []

    # parser member
    members = []
    for li in d('div.executive dl'):
        mem = pq(li)
        try:
            logo_url = mem('img').attr('src')
            if logo_url is None:
                # Entries without a photo are skipped, as before.
                continue
            if not logo_url.startswith("http"):
                # Protocol-relative URL ("//host/path").
                logo_url = "http:" + logo_url

            members.append({
                'name': mem('p:nth-child(2)').text(),
                'photo_url': logo_url,
                'weibo': None,
                'location': None,
                'role': mem('p:nth-child(3)').text(),
                'description': mem('dd').text(),
                'education': None,
                'work': None
            })
        except Exception as e:
            # Best effort: one malformed entry must not drop the company.
            logger.info("member parse failed: %s", e)

    source_company = {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': None,
        "locationId": 0,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.liepin.com/company/%s/" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }

    return source_company
Ejemplo n.º 21
0
def parse_company(item):
    """Parse a crawled company page into a company dict.

    Args:
        item: crawled record; ``content`` (page HTML that is decoded as
            UTF-8) and ``key`` are read from it.

    Returns:
        The company dict, or ``0`` when the page name is not recognised as a
        company by ``name_helper.name_check`` and no website link was found.
        When the name is not a company but a website exists, ``fullName`` is
        None and the normalized name is returned under ``fakeName`` instead.
    """
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # company basic info: de-duplicated tags, original order preserved
    tags = []
    for tag in d('.word_list').text().split():
        if tag not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)

    logo = d('.peoimg img').attr('src')
    if logo:
        logo = logo.replace("https://", "http://")

    establish_date = None
    time_content = d('.time_content li:last-child')
    # ">= 0" so that a label that starts with the keyword also counts.
    if d(time_content)('.upword').text().find('成立') >= 0:
        date_text = d(time_content)('.time_up').text()
        try:
            establish_date = datetime.datetime.strptime(date_text, '%Y-%m-%d')
        except ValueError:
            # Unexpected date format: leave the establish date unknown.
            establish_date = None

    companyName = d('.company_div h5').text()
    city = name_helper.get_location_from_company_name(companyName)[0]
    location_id = 0
    if city is not None:
        location = parser_db_util.get_location(city)
        if location is not None:
            location_id = location["locationId"]

    fullName = companyName.replace("_", "")
    fullName = name_helper.company_name_normalize(fullName)

    desc = d('#intro_srocll p').text()
    website = ''
    product_lines = []
    for p in d('.procont_lis p'):
        paragraph = d(p)
        href = paragraph('a').attr('href')
        # A paragraph mentioning the website label and carrying a link
        # supplies the website; every other paragraph is product text.
        if paragraph.text().find('官网') >= 0 and href is not None:
            website = href
            continue
        product_lines.append(paragraph.text() + '\n')
    productDesc = ''.join(product_lines)

    if desc == '' or desc is None:
        desc = productDesc

    # Keep only the part of the headline before the first separator.
    # NOTE(review): the two ':' splits are identical as written; one of them
    # may have been meant for a different colon character - confirm.
    shortName = d('.peo_center h4').text().split(':')[0].split(':')[0].split(
        '——')[0].split(',')[0].split('|')[0]

    companyResult = {}
    if name_helper.name_check(companyName)[1] == True:
        if name_helper.name_check(shortName)[0] == False:
            # English name: keep the short name as it is.
            pass
        else:
            # Fall back to the company name when the short name shares at
            # most two characters with it.
            shared = sum(1 for s in shortName if s in companyName)
            if shared <= 2:
                shortName = companyName
    else:
        if not website:
            return 0
        companyResult['fakeName'] = fullName
        fullName = None

    companyResult.update({
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    })

    return companyResult
Ejemplo n.º 22
0
def parse_company(item):
    """Build a company dict from an already-parsed JSON company payload.

    Args:
        item: crawled record; ``key`` and
            ``content["company_base"]["data"]`` (with its ``company`` and
            ``tags`` entries) are read from it.

    Returns:
        ``{"status": "INIT"}`` when the company status is "INIT"; otherwise
        the populated company dict, whose ``status`` is copied from the
        payload.
    """
    logger.info("parse_company")
    company_key = item["key"]

    # company basic info
    base = item["content"]["company_base"]["data"]
    c = base["company"]
    # check if page is under development or is completed(CREATED)
    if c["status"] == "INIT":
        return {
            "status": c["status"],
        }

    def stripped(key, default=None):
        # Stripped text of an optional field; absent or null -> default.
        value = c.get(key)
        if value is None:
            return default
        return value.strip()

    tags_str = ",".join([tag["name"] for tag in base["tags"]])

    logo = c["logo"]
    if logo:
        logo = logo.replace("https://", "http://")

    establish_date = None
    start_ms = c.get("startDate")
    if start_ms is not None:
        # startDate is divided by 1000 before time.localtime, i.e. it is
        # treated as milliseconds; years up to 1980 are discarded.
        started = time.localtime(start_ms / 1000)
        if started.tm_year > 1980:
            establish_date = datetime.datetime(
                started.tm_year, started.tm_mon, started.tm_mday)

    # address2 is tried first; address1 is only consulted while no location
    # has been resolved.
    location_id = 0
    for address_code in (c.get("address2"), c.get("address1")):
        if location_id != 0:
            break
        if address_code is None:
            continue
        city = kr36_cities.get(str(address_code), None)
        if city is None:
            continue
        location = parser_mongo_util.get_location(formCityName(city))
        if location is not None:
            location_id = location["locationId"]

    fullName = c["fullName"]
    fullName = fullName.replace("_", "")
    # Drop anything that follows the last u"公司" in the name.
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = name_helper.company_name_normalize(fullName)

    return {
        "status": c["status"],
        "name": c["name"],
        "fullName": fullName,
        "description": stripped("intro", ""),
        "productDesc": stripped("projectAdvantage"),
        "modelDesc": stripped("projectPlan"),
        "operationDesc": stripped("dataLights"),
        "teamDesc": stripped("story"),
        "marketDesc": None,
        "compititorDesc": stripped("competitor"),
        "advantageDesc": None,
        "planDesc": None,
        "brief": c["brief"],
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": c.get("industry"),
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None
    }
Ejemplo n.º 23
0
def _resolve_artifact_domain(artifact, normalize_link=False):
    """Return the domain to use for a website artifact, or None to skip it.

    Uses artifact["domain"] when it is set; otherwise derives the domain from
    artifact["link"] via url_helper.get_domain (optionally normalizing the
    link first). Returns None when get_domain reports a None/False flag or
    when the resulting domain is None or blank.
    """
    domain = artifact["domain"]
    if domain is None:
        link = artifact["link"]
        if normalize_link:
            link = url_helper.url_normalize(link)
        (flag, domain) = url_helper.get_domain(link)
        if flag is None or flag is False:
            return None
    if domain is None or domain.strip() == "":
        return None
    return domain


def _save_beian_items(items, source, sourceId):
    """Persist artifacts, company names and main beian numbers found in beian items."""
    save_beian_artifacts(items, source, sourceId)  # website/homepage -> source_artifact
    save_beian_company_names(items, source, sourceId)  # organizer -> source_company_name
    save_beian_mainbeianhaos(items, source, sourceId)  # mainBeianhao -> source_mainbeianhao


def _expand_itunes_apps_by_domain(apps, source, sourceId):
    """Save iTunes apps matched by domain and, when unambiguous, their seller name."""
    lens_company_names = count_company_names(apps, "sellerName")
    company_name_status = check_source_company_name(source, sourceId)

    for app in apps:
        # Only add the app when its trackId is not already among the artifacts.
        if not find_itunesId(app["trackId"], source, sourceId):
            save_itunes_artifact(app, source, sourceId)

        # A seller name is only adopted when none was recorded yet and all
        # matched apps agree on a single seller name.
        if company_name_status or lens_company_names != 1:
            continue

        chinese, is_company = name_helper.name_check(app["sellerName"])
        if chinese and is_company:
            save_company_name(app, "sellerName", source, sourceId)
            company_name_status = True

        english, is_company = name_helper.english_name_check(app["sellerName"])
        if english and is_company:
            save_company_name(app, "sellerName", source, sourceId)
            company_name_status = True


def _expand_android_apps_by_domain(apps, source, sourceId):
    """Save Android apps matched by domain and, when unambiguous, their author name."""
    lens_company_names = count_company_names(apps, "author")
    company_name_status = check_source_company_name(source, sourceId)

    for app in apps:
        # Only add the app when its apkname is not already among the artifacts.
        if not find_androidAppname(app["apkname"], source, sourceId):
            save_android_artifact(app, source, sourceId)

        if company_name_status or lens_company_names != 1:
            continue

        chinese, is_company = name_helper.name_check(app["author"])
        if is_company:
            save_company_name(app, "author", source, sourceId)
            company_name_status = True


def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler, test=False):
    """Iteratively expand a source company with related names, websites and apps.

    After clearing previously expanded data, the source company's full name is
    registered as a company name (type 12010) if missing. Then, for at most
    five rounds, the pending company names, main beian numbers and artifacts
    returned by find_mongo_data are used to:

      * A: look up beian (ICP filing) records by company name, domain and
           main beian number;
      * B: refresh iTunes artifacts (type 4040) and find more iTunes apps by
           company name and by website domain;
      * C: refresh Android artifacts (type 4050) and find more Android apps
           by company name and by website domain;
      * clean website artifacts (type 4010): fetch meta info, mark
        unreachable sites and handle redirects;
      * mark every processed item as expanded.

    The loop ends when nothing is pending or the round limit is reached; in
    both cases the source company document gets "scexpanded": True.

    :param source: source identifier of the source company document.
    :param sourceId: id of the company within that source.
    :param beian_links_crawler: object providing query_by_company_name,
        query_by_domain and query_by_main_beianhao.
    :param icp_chinaz_crawler: object providing query_by_domain and
        query_by_main_beianhao.
    :param screenshot_crawler: not used by the active code path
        (screenshotting is currently disabled).
    :param test: when true, no beian crawler is queried; lookups that are not
        already cached in Mongo yield empty results.
    """
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")

    expand_clean(source, sourceId)
    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})

    # Make sure the (normalized) full name is part of the company-name list.
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is not None and company_fullname.strip() != "":
        company_fullname = name_helper.company_name_normalize(company_fullname)

        scnames = sourcecompany["source_company_name"]
        check_fullname = any(scname["name"] == company_fullname for scname in scnames)
        if check_fullname is False:
            (chinese, _) = name_helper.name_check(company_fullname)
            scname_data = {
                "name": company_fullname,
                "chinese": "Y" if chinese is True else "N",
                "type": 12010,
            }
            save_mongo_source_company_name(source, sourceId, scname_data)

    expand_round = 1

    while True:
        # Hard stop so that expansion cannot loop forever (rounds 1..5 run).
        if expand_round >= 6:
            collection_source_company.update_one(
                {"_id": sourcecompany["_id"]},
                {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)

        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))

        # Check if there is anything new that still needs expansion
        if len(source_company_names) == 0 and len(artifacts) == 0 and len(main_beianhaos) == 0:
            collection_source_company.update_one(
                {"_id": sourcecompany["_id"]},
                {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, expand_round)

        # Step A/1: beian lookup by company name
        logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
        for source_company_name in source_company_names:
            # Only check chinese company names
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            if source_company_name["chinese"] is None:
                # name_check returns a boolean; convert it to the stored
                # "Y"/"N" form so the comparison below works for it too.
                (is_chinese, _) = name_helper.name_check(source_company_name["name"])
                chinese = "Y" if is_chinese is True else "N"
            else:
                chinese = source_company_name["chinese"]

            if chinese != "Y":
                continue

            check_name = list(collection_beian.find({"organizer": source_company_name["name"]}))
            # One company name may own multiple beian numbers (e.g. today.ai /
            # teambition.com); if any record is already cached in Mongo.beian
            # (by organizer) the cached records are used as-is.
            if len(check_name) == 0:
                if test:
                    items_beianlinks = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_company_name(source_company_name["name"])
                    save_collection_beian(collection_beian, items_beianlinks)  # insert infos into Mongo.beian
            else:
                items_beianlinks = check_name

            # Discovers more artifacts (websites), company names and main beian numbers
            _save_beian_items(items_beianlinks, source, sourceId)

        # Step A/2: beian lookup by domain
        logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
        for artifact in artifacts:
            # Only website artifacts are checked
            if artifact["type"] != 4010:
                continue

            domain = _resolve_artifact_domain(artifact, normalize_link=True)
            if domain is None:
                continue

            check_domain = list(collection_beian.find({"domain": domain}))

            if len(check_domain) == 0:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_domain(domain)
                    items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)

                    save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
            else:
                items_merge = check_domain

            # filter by the checked domain to avoid the sinaapp.cn case
            items_merge = filter_domain(items_merge, domain)

            # Discovers more artifacts (websites), company names and main beian numbers
            _save_beian_items(items_merge, source, sourceId)

        # Step A/3: beian lookup by main beian number
        logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
        for main_beianhao in main_beianhaos:
            mainBeianhao = main_beianhao["mainBeianhao"]
            check_mainBeianhao = collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao})

            if check_mainBeianhao is None:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                    items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)

                    save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
                # Remember the main beian number only when the lookup found something
                if len(items_merge) > 0:
                    items_main_beianhao = [{"mainBeianhao": mainBeianhao}]
                    save_collection_mainBeianhao(collection_main_beianhao, items_main_beianhao)  # insert mainBeianhao into Mongo.main_beianhao
            else:
                items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))

            # Discovers more artifacts (websites) and company names
            _save_beian_items(items_merge, source, sourceId)

        # iTunes expansion
        # Step B/1: look up existing iTunes artifacts
        logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)

        itunes_company_enames = {}
        app_by_name = {}

        for artifact in artifacts:
            if artifact["type"] != 4040:
                continue
            # Get trackid
            trackid = None
            if artifact["domain"] is None:
                (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
                if apptype != 4040:
                    continue
            else:
                try:
                    trackid = int(artifact["domain"])
                except (TypeError, ValueError):
                    # Non-numeric domain: leave trackid as None so the
                    # artifact is marked inactive below.
                    pass

            if trackid is None:
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            app = collection_itunes.find_one({"trackId": trackid})
            if app is None:
                # mark it as not active
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            copy_from_itunes(app, artifact, source, sourceId)  # exists: copy from mongo.itunes
            if "offline" in app and app["offline"] is True:
                set_artifact_active(artifact, "Offline", source, sourceId)
            else:
                set_artifact_active(artifact, "Y", source, sourceId)

            english, is_company = name_helper.english_name_check(app["sellerName"])
            if english and is_company:
                # Key on the actual seller name so that several different
                # names can be told apart from a single one below.
                itunes_company_enames[app["sellerName"]] = 1
                app_by_name = app

        # save the english name only when exactly one distinct name was seen
        if len(itunes_company_enames) == 1:
            company_name = collection_source_company.find_one(
                {"source": source, "sourceId": sourceId,
                 "source_company_name": {"$elemMatch": {"type": 12010, "chinese": "N"}}})

            if company_name is None:
                save_company_name(app_by_name, "sellerName", source, sourceId)

        # Step B/2: find more iTunes artifacts by company name
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
        for source_company_name in source_company_names:
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_itunes_sellers = list(collection_itunes.find({"sellerName": source_company_name["name"]}))
            if len(check_itunes_sellers) == 0:
                continue

            lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
            artifact_status = check_source_artifact(source, sourceId)

            for app in check_itunes_sellers:
                # Skip apps whose trackId already exists among this company's artifacts
                if find_itunesId(app["trackId"], source, sourceId):
                    continue

                save_itunes_artifact(app, source, sourceId)

                # The seller url is adopted as a website only when all apps
                # share one domain and check_source_artifact reported nothing yet.
                # (supportUrl expansion was disabled: it produced incorrect results.)
                if "sellerUrl" in app and not artifact_status and lens_domain == 1:
                    artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)
                    if artifact_id is not None:
                        artifact_status = True

        # Step B/3: find more iTunes artifacts by domain
        logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue

            domain = _resolve_artifact_domain(artifact)
            if domain is None:
                continue

            if domain in itunesDomainEx:
                continue

            check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
            if len(check_itunes_sellerDomains) > 0:
                _expand_itunes_apps_by_domain(check_itunes_sellerDomains, source, sourceId)

            check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
            # Very large result sets are ignored (threshold of 100 apps)
            if len(check_itunes_supportDomains) > 0 and len(check_itunes_supportDomains) < 100:
                _expand_itunes_apps_by_domain(check_itunes_supportDomains, source, sourceId)

        # Android expansion
        # Step C/1: look up existing android artifacts
        logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4050:
                continue
            # Get apkname
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                if apptype != 4050:
                    continue

                # For markets 16010 / 16020 (baidu and 360 per the original
                # note) the apkname is resolved through the android market collection
                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]

            if apkname is None:
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            app = collection_android.find_one({"apkname": apkname})
            if app is None:
                # mark it as not active
                set_artifact_active(artifact, "N", source, sourceId)
            else:
                copy_from_android(app, artifact, source, sourceId)  # exists: copy from mongo.android
                set_artifact_active(artifact, "Y", source, sourceId)

        # Step C/2: find more android artifacts by company name
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
        for source_company_name in source_company_names:
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_android_authors = list(collection_android.find({"author": source_company_name["name"]}))
            if not (len(check_android_authors) > 0 and len(check_android_authors) < 200):
                continue

            lens_domain = count_domains(check_android_authors, "website")
            artifact_status = check_source_artifact(source, sourceId)

            for app in check_android_authors:
                # Skip apps whose apkname already exists among the artifacts
                if find_androidAppname(app["apkname"], source, sourceId):
                    continue

                save_android_artifact(app, source, sourceId)

                if not artifact_status and lens_domain == 1:
                    artifact_id = save_androidWebsite_artifact(app, source, sourceId)
                    if artifact_id is not None:
                        artifact_status = True

        # Step C/3: find more android artifacts by domain
        logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue

            domain = _resolve_artifact_domain(artifact)
            if domain is None:
                continue

            check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
            if len(check_android_websiteDomains) > 0:
                _expand_android_apps_by_domain(check_android_websiteDomains, source, sourceId)

            check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
            # add threshold to avoid case: domain: com.wowotuan
            if len(check_android_apknameDomains) > 0 and len(check_android_apknameDomains) < 100:
                _expand_android_apps_by_domain(check_android_apknameDomains, source, sourceId)

        # Former names: TODO

        # Clean website artifacts:
        # fetch meta info, mark unreachable websites, handle redirected websites
        logger.info("source: %s, sourceId: %s website meta", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["link"] is None or artifact["link"].strip() == "":
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            url = artifact["link"].strip()
            meta = collection_website.find_one({"url": url})
            if meta is None or meta["httpcode"] == 404:
                # Not cached (or cached as 404): fetch the meta info again
                meta = website.get_meta_info(url)
                if meta:
                    # Screenshotting of the saved website is currently disabled.
                    save_collection_website(collection_website, meta)
                else:
                    # Record the failed fetch as a 404 entry
                    meta = {
                        "url": artifact["link"],
                        "httpcode": 404
                    }
                    save_collection_website(collection_website, meta)
                    set_artifact_active(artifact, "N", source, sourceId)

            if meta:
                if meta["httpcode"] == 200:
                    redirect_url = meta.get("redirect_url")
                    if artifact["link"] != redirect_url:
                        # A redirect occurred
                        # NOTE(review): meta without "redirect_url" raises KeyError
                        # here, as in the original code - confirm metas always carry it.
                        url = url_helper.url_normalize(meta["redirect_url"])
                        (flag_new, domain_new) = url_helper.get_domain(url)

                        meta_new = {
                            "url": url,
                            "domain": domain_new if flag_new is True else None,
                            "redirect_url": url,
                            "title": meta["title"],
                            "tags": meta["tags"],
                            "description": meta["description"],
                            "httpcode": 200
                        }

                        # Screenshotting of the redirect target is currently disabled.
                        save_collection_website(collection_website, meta_new)

                        flag, domain = url_helper.get_domain(artifact["link"])
                        if domain_new != domain:
                            # redirected away from the original domain
                            set_artifact_active(artifact, "Redirect", source, sourceId)
                        elif flag is True:
                            # the original link is a 'good' address
                            set_artifact_active(artifact, "Y", source, sourceId)
                        elif flag_new is True:
                            # the redirect target is a 'good' address
                            set_artifact_active(artifact, "Redirect", source, sourceId)
                            save_website_artifact(meta_new, source, sourceId)
                        else:
                            set_artifact_active(artifact, "Y", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                elif meta["httpcode"] == 404:
                    set_artifact_active(artifact, "N", source, sourceId)

        # Mark everything handled in this round as expanded
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)

        expand_round += 1
Ejemplo n.º 24
0
        # tot += 1
        shortname = names[1]
        fullName = names[2]
        if fullName is None or fullName.strip() == "":
            fullName = names[3]
        if fullName is None or fullName.strip() == "":
            logger.info(line)
            # logger.info("er1")
            continue
            # exit()
        # fullNames = [name_helper.company_name_normalize(unicode(name[2])), name_helper.company_name_normalize(unicode(name[3]))]
        fullNames = []
        for fn in [names[2], names[3]]:
            if fn is not None and fn.strip() != "":
                fullNames.append(
                    name_helper.company_name_normalize(unicode(fn)))
        fullName = name_helper.company_name_normalize(unicode(fullName))

        roundstr = names[4]
        inv = names[5]
        fdate = names[6]
        investor = names[7]
        if investor is not None:
            investor = investor.split("/")[0]
        if investor is None or investor.strip() == "":
            if names[8] is not None:
                investor = names[8].split("/")[0]

        if investor is None:
            logger.info(line)
            logger.info("er2")
Ejemplo n.º 25
0
def _parse_file_size(raw):
    """Convert the page's ``fileSize`` meta value into a number of bytes.

    ``raw`` is either a plain integer string or a number followed by a
    ``KB``/``MB`` suffix. Returns None when the value is missing or cannot
    be interpreted.
    """
    if raw is None:
        return None
    try:
        return int(raw)
    except (TypeError, ValueError):
        pass
    # KB is checked before MB, matching the historical precedence.
    for unit, factor in (("KB", 1024), ("MB", 1024 * 1024)):
        if raw.find(unit) >= 0:
            number = raw.replace(unit, "").strip()
            try:
                return int(float(number) * factor)
            except ValueError:
                return None
    return None


def _parse_download_count(raw):
    """Convert the ``interactionCount`` attribute into an integer.

    ``raw`` is expected to look like ``<label>:<count>`` where ``<count>`` is
    either a plain integer or a decimal followed by a Chinese magnitude
    suffix (x10^4 or x10^8). Returns None, after logging, when the count is
    missing or cannot be interpreted.
    """
    if raw is None:
        logger.info("********download :%s cannot get", raw)
        return None
    pieces = raw.split(":")
    if len(pieces) < 2:
        logger.info("********download :%s cannot get", raw)
        return None

    dnum = pieces[1]
    try:
        return int(dnum)
    except ValueError:
        pass

    if dnum.find("万") >= 0:
        number = dnum.replace("万", "").strip()
        try:
            return int(float(number) * 10000)
        except ValueError:
            pass
    elif dnum.find("亿") >= 0:
        number = dnum.replace("亿", "").strip()
        try:
            return int(float(number) * 10000 * 10000)
        except ValueError:
            pass

    logger.info("********download :%s cannot get", dnum)
    return None


def process(crawler, url, apkname, content):
    """Parse one app detail page and persist the extracted metadata.

    When ``has_content`` accepts the page, the HTML is parsed, an ``item``
    dict is built from it, handed to ``android.save`` / ``android.merge``,
    and the ``collection_android`` record for ``apkname`` is flagged as
    processed and found. Otherwise the record is flagged as processed and
    not found.

    :param crawler: unused here; kept for interface compatibility.
    :param url: page URL, stored as the item's ``link``.
    :param apkname: package name, used as the item's key and lookup filter.
    :param content: raw page body (decoded as UTF-8, ignoring bad bytes).
    :raises ValueError: if the page's date text does not match the expected
        ``%Y..%m..%d..`` format (propagated from ``strptime``).
    """
    if not has_content(content, apkname):
        logger.info("App: %s has no content", apkname)
        collection_android.update_one(
            {"apkname": apkname},
            {"$set": {"wandoujiaprocessed": True, "wandoujiafound": False}})
        return

    logger.info("hereherehere")
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))

    name = d('span.title').text()
    icon = d('div.app-icon> img').attr("src")
    brief = d('p.tagline').text()
    commentbyeditor = d('div.editorComment> div').text()
    screenshots = [pq(img).attr("src") for img in d('div.overview> img')]
    desc = d('div.desc-info> div').text()
    updates = d('div.change-info> div').text()

    size = _parse_file_size(d('meta[itemprop="fileSize"]').attr("content"))
    tags = d('dd.tag-box >a').text().replace(" ", ",")

    datestr = d('time#baidu_time').text()
    updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")

    author = d('span.dev-sites').text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)

    # Best effort: a missing or malformed developer link must not abort the page.
    try:
        website = url_helper.url_normalize(d('a.dev-sites').attr("href"))
    except Exception:
        website = None

    # The markup after the Android requirement differs depending on whether
    # the page carries the permissions link, so pick the pattern accordingly.
    compatibility = None
    if content.find("查看权限要求") == -1:
        r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
    else:
        r1 = "content=\"Android\">(.*?)<div>.*"
    result1 = util.re_get_result(r1, content)
    if result1:
        (compatibility,) = result1
        # NOTE: "\\s" is a literal backslash followed by "s" (not a
        # whitespace class); kept so the cleaned value is unchanged.
        compatibility = (compatibility.replace("\n", "").replace("\r", "")
                         .replace("\\s", "").replace(" ", ""))

    versionname = None
    r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
    result2 = util.re_get_result(r2, content)
    if result2:
        (versionname,) = result2
        versionname = (versionname.replace("\n", "").replace("\r", "")
                       .replace("\\s", "").replace("&nbsp;", "").strip())

    # Keep only the first whitespace-separated token and drop a leading "V"
    # prefix; any other "V" inside the version string is preserved.
    if versionname:
        tokens = versionname.split()
        if tokens:
            versionname = tokens[0]
            if versionname.startswith("V"):
                versionname = versionname[1:]

    download = _parse_download_count(
        d("i[itemprop='interactionCount']").attr("content"))

    item = {
        "link": url,
        "apkname": apkname,
        "appmarket": APPMARKET,
        "name": name,
        "brief": brief,
        "website": website,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": updatedate,
        "language": None,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": None,
        "key": apkname,
        "key_int": None,
        "download": download,
    }

    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
    collection_android.update_one(
        {"apkname": apkname},
        {"$set": {"wandoujiaprocessed": True, "wandoujiafound": True}})