Ejemplo n.º 1
0
def insert(shortname, name, brief, website):
    """Save a source company, its two names and (optionally) its website.

    The company is stored through ``parser_db_util.save_company_yitai`` with
    an md5 of the cleaned ``name`` as its source id.  When ``website``
    normalizes to a link that ``url_helper.get_market`` classifies as type
    4010, one artifact is saved for it, unless the link contains ``sse.com``
    anywhere after its first character.

    :param shortname: short company name (saved with name type 12020).
    :param name: full company name (saved with name type 12010).
    :param brief: short description passed through to the company record.
    :param website: raw website string; may be None or blank.
    :return: None
    """
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId,
                                            brief)
    logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, name, 12010)
    parser_db_util.save_source_company_name(sid, shortname, 12020)

    if website is None or website.strip() == "":
        return
    website = url_helper.url_normalize(website)
    if website is None or website == "":
        return

    # The original tested ``website.find("https://")`` as a bare truth value,
    # which is truthy for -1 (not found) and falsy for 0 (found at start).
    # Test the scheme prefix explicitly instead.
    if not website.startswith(("http://", "https://")):
        website = "http://" + website

    link_type, market, app_id = url_helper.get_market(website)
    if link_type != 4010:
        return
    if website.find('sse.com') > 0:
        # Links pointing at sse.com are deliberately not stored.
        return

    artifact = {
        "sourceCompanyId": sid,
        "name": shortname,
        "description": None,
        "link": website,
        "domain": app_id,
        "type": link_type
    }
    parser_db_util.save_artifacts_standard(sid, [artifact])
Ejemplo n.º 2
0
def find_company_by_artifact(website):
    """Look up the company that owns ``website``.

    Only links that ``url_helper.get_market`` classifies as type 4010 with a
    non-None domain are searched.  The artifact table is matched first on the
    exact link and then on the domain; companies whose ``active`` flag is
    'N' are ignored.

    :param website: website link to search for.
    :return: the ``companyId`` of the first matching artifact, or None.
    """
    link_type, market, website_domain = url_helper.get_market(website)
    if link_type != 4010 or website_domain is None:
        return None

    conn = db.connect_torndb()
    try:
        # One connection serves both queries; ``finally`` guarantees it is
        # released even when a query raises.
        artifact = conn.get(
            "select a.* from artifact a join company c on c.id=a.companyId "
            "where (c.active is null or c.active !='N') and a.type=%s and a.link=%s limit 1",
            4010, website)
        if artifact is not None:
            logger.info("find_company_by_artifact 1, %s, %s", artifact["type"],
                        artifact["link"])
            return artifact["companyId"]

        artifact = conn.get(
            "select a.* from artifact a join company c on c.id=a.companyId "
            "where (c.active is null or c.active !='N') and a.type=%s and a.domain=%s limit 1",
            4010, website_domain)
        if artifact is not None:
            logger.info("find_company_by_artifact 2, %s, %s", artifact["type"],
                        artifact["domain"])
            return artifact["companyId"]
    finally:
        conn.close()
    return None
Ejemplo n.º 3
0
def find_itunesId(itunesId, companyId):
    """Tell whether ``itunesId`` already exists among a company's artifacts.

    Loads the type-4040 artifacts of ``companyId`` and compares each one's
    track id with ``itunesId``.  The track id comes from the ``domain``
    column when set (converted to int), otherwise from
    ``url_helper.get_market`` applied to the artifact link.

    :param itunesId: track id to look for.
    :param companyId: id of the company whose artifacts are scanned.
    :return: True if a matching artifact exists, else False.
    """
    conn = db.connect_torndb()
    try:
        artifacts = conn.query(
            "select * from artifact where companyId=%s and type=4040", companyId)
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Check if itunesId already exists in artifacts
    for artifact in artifacts:
        trackid = None
        if artifact["domain"] is None:
            (apptype, appmarket,
             trackid) = url_helper.get_market(artifact["link"])
            if apptype != 4040:
                continue
        else:
            try:
                trackid = int(artifact["domain"])
            except (TypeError, ValueError):
                # Non-numeric domain: leave trackid as None (best effort).
                pass

        if trackid == itunesId:
            return True
    return False
Ejemplo n.º 4
0
def find_androidAppname(androidApk, source, sourceId):
    """Report whether ``androidApk`` is already among a source company's artifacts.

    A None or blank ``androidApk`` is reported as found (True).  Otherwise the
    "source_artifact" entries fetched through ``find_mongo_data`` are scanned:
    only type-4050 entries count, and each one's apk name is its ``domain``
    or, when that is None, a value derived from its link.

    :param androidApk: apk name to look for.
    :param source: source identifier forwarded to ``find_mongo_data``.
    :param sourceId: source company id forwarded to ``find_mongo_data``.
    :return: True when found (or when there is nothing to look for), else False.
    """
    if androidApk is None or not androidApk.strip():
        return True

    source_artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId, nonexpand=False)
    for entry in source_artifacts:
        if entry["type"] != 4050:
            continue

        candidate = entry["domain"]
        if candidate is None:
            link_type, market, key = url_helper.get_market(entry["link"])
            if link_type != 4050:
                continue
            if market in (16010, 16020):
                # Per the original comment these two markets are baidu and
                # 360; their apk name is looked up in the android market
                # collection by key.
                record = collection_android_market.find_one({"appmarket": market, "key_int": key})
                candidate = record["apkname"] if record else None
            else:
                candidate = key

        if candidate == androidApk:
            return True
    return False
Ejemplo n.º 5
0
def aggregate(item, tt=1):
    """Try to match a crawled item to existing companies.

    ``item["data"]`` must hold the ``btb`` and ``basic`` sub-documents.  The
    returned flag tells how (or whether) a match was made:

    * 0 - ``btb["bz_code"]`` is None or blank; nothing was attempted
    * 2 - matched by full name (``tt == 1`` only)
    * 3 - matched by the artifact built from ``basic["gw_link"]`` (``tt == 1``)
    * 4 - matched by short name / bz_code reference (``tt == 1``), or by the
      artifact built from ``btb["gw"]`` (``tt != 1``)
    * 5 - no match

    :param item: crawled document.
    :param tt: matching mode; 1 runs the full cascade, anything else only
        checks the ``btb["gw"]`` artifact.
    :return: ``(flag, company_ids)``; ``company_ids`` is only populated when
        ``tt == 1`` and a match was found.
    """
    btb = item["data"]["btb"]
    basic = item["data"]["basic"]
    bz_code = btb["bz_code"]
    if bz_code is None or bz_code.strip() == "":
        return 0, []

    fullName = basic["company"]
    shortName = basic["product"]

    website = basic["gw_link"]
    link_type, market, app_id = url_helper.get_market(website)
    artifact = {"link": website, "domain": app_id, "type": link_type}

    website1 = btb["gw"]
    link_type, market, app_id = url_helper.get_market(website1)
    # The original stored ``website`` here, pairing the link of one site
    # with the type/domain of another.
    artifact1 = {"link": website1, "domain": app_id, "type": link_type}

    if tt == 1:
        # Each lookup runs once (the original ran every successful lookup
        # twice) and later lookups are still skipped after a hit.
        found = find_companies_by_full_name_corporate([fullName])
        if len(found) > 0:
            logger.info("%s found fullName by %s", bz_code, fullName)
            return 2, found

        found = find_companies_by_artifacts([artifact])
        if len(found) > 0:
            logger.info("%s found artifact by %s", bz_code, website)
            return 3, found

        found = find_reference([shortName, bz_code])
        if len(found) > 0:
            logger.info("%s found shortName by %s", bz_code, shortName)
            return 4, found

        return 5, []

    # NOTE(review): in this mode the matched ids were never returned by the
    # original either; kept as is - confirm callers do not need them.
    if len(find_companies_by_artifacts([artifact1])) > 0:
        logger.info("%s found artifact by %s", bz_code, website1)
        return 4, []
    return 5, []
Ejemplo n.º 6
0
def update_domain_website():
    """Fill in the missing ``domain`` column of investors from their website.

    Selects investors whose ``active`` is 'Y' or null, whose ``domain`` is
    null and whose ``website`` is set, derives a domain with
    ``url_helper.get_market`` and writes it back when one is returned.
    """
    conn = db.connect_torndb()
    try:
        investors = conn.query("select * from investor where (active ='Y' or active is null) and "
                               "domain is null and website is not null")
        for investor in investors:
            (linktype, appmarket, domain) = url_helper.get_market(investor["website"])
            if domain is not None:
                conn.update("update investor set domain=%s where id=%s",
                            domain, investor["id"])
    finally:
        # Release the connection even when a query or update raises.
        conn.close()
Ejemplo n.º 7
0
def update_domain_artifact():
    """Fill in the missing ``domain`` of artifacts.

    Reads artifacts whose ``active`` is 'Y' or null and whose ``domain`` is
    null (through the proxy connection).  For types 4010, 4040 and 4050 the
    domain comes from ``url_helper.get_market``; for type 4020 the non-blank
    link itself is stored as the domain.  Writing is delegated to
    ``update_domain``.
    """
    # conn = db.connect_torndb()
    conn = db.connect_torndb_proxy()
    try:
        arts = conn.query(
            "select * from artifact where (active ='Y' or active is null) and domain is null"
        )
        for art in arts:
            if art["type"] in [4010, 4040, 4050]:
                (linktype, appmarket, domain) = url_helper.get_market(art["link"])
                if domain is not None:
                    update_domain(domain, art["id"])
            if art["type"] in [4020]:
                if art["link"] is not None and art["link"].strip() != "":
                    update_domain(art["link"], art["id"])
    finally:
        # Release the connection even when a lookup or update raises.
        conn.close()
Ejemplo n.º 8
0
def parse_artifact(source_company_id, r):
    """Build the website artifact list for one record.

    :param source_company_id: id stored as ``sourceCompanyId`` on the artifact.
    :param r: record with at least the ``website`` and ``name`` keys.
    :return: a list with one artifact dict when ``r['website']`` is non-blank
        and ``url_helper.get_market`` classifies it as type 4010; otherwise
        an empty list.
    """
    website = r['website']
    # Validate before use: the original tested ``is not None`` only after
    # calling ``.strip()``, so the test could never protect against None.
    if website is None or website.strip() == '':
        return []

    link_type, market, app_id = url_helper.get_market(website)
    if link_type != 4010:
        return []

    return [{
        "sourceCompanyId": source_company_id,
        "name": r["name"],
        "description": None,
        "link": website,
        "domain": app_id,
        "type": link_type
    }]
Ejemplo n.º 9
0
def find_androidAppname(androidApk, companyId):
    """Tell whether ``androidApk`` already exists among a company's artifacts.

    A None or blank ``androidApk`` is reported as found (True) without
    touching any database.  Otherwise the type-4050 artifacts of
    ``companyId`` are scanned; each one's apk name is its ``domain`` or,
    when that is None, a value derived from its link.

    :param androidApk: apk name to look for.
    :param companyId: id of the company whose artifacts are scanned.
    :return: True when found (or when there is nothing to look for), else False.
    """
    # Validate first so no connection is opened for a trivial answer.
    if androidApk is None or androidApk.strip() == "":
        return True

    conn = db.connect_torndb()
    try:
        artifacts = conn.query(
            "select * from artifact where companyId=%s and type=4050", companyId)
    finally:
        conn.close()

    mongo = db.connect_mongo()
    try:
        collection_android_market = mongo.market.android_market

        # Check if apkname already exists in artifacts
        for artifact in artifacts:
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket,
                 appid) = url_helper.get_market(artifact["link"])
                if apptype != 4050:
                    continue
                # Get apkname of baidu and 360 from android market
                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({
                        "appmarket": appmarket,
                        "key_int": appid
                    })
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]

            if apkname == androidApk:
                return True
        return False
    finally:
        # Single exit point for closing mongo, also on exceptions.
        mongo.close()
Ejemplo n.º 10
0
def parse_base(item):
    """Turn a crawled item into a source-company dict.

    ``item`` supplies ``key`` (used as ``sourceId``) and ``content`` with the
    ``website``, ``name``, ``desc`` and ``score`` fields.  The normalized
    website becomes an artifact when ``url_helper.get_market`` classifies it
    as type 4010, or as type 4040/4050 with a truthy app id.

    :param item: crawled item, or None.
    :return: the company dict, or None when ``item`` is None.
    """
    if item is None:
        return None

    source_id = item["key"]
    content = item["content"]

    link = url_helper.url_normalize(content["website"])
    link_type, app_market, app_id = url_helper.get_market(link)

    artifacts = []
    is_website = link_type == 4010
    if is_website or (link_type in (4040, 4050) and app_id):
        artifact = {}
        artifact["type"] = link_type
        artifact["name"] = content["name"]
        artifact["desc"] = content["desc"]
        artifact["link"] = link
        artifact["domain"] = app_id
        artifacts.append(artifact)

    company = {
        "shortName": content["name"],
        "fullName": None,
        "productName": content["name"],
        "description": None,
        "brief": content["desc"],
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": source_id,
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": content["score"],
        "artifacts": artifacts,
    }
    return company
Ejemplo n.º 11
0
def parse_artifact(source_company_id,item):
    """Build the artifact list from ``item["baseinfo"]["website"]``.

    The website is normalized and given an ``http://`` prefix when it has no
    scheme.  A type-4010 link becomes a website artifact unless it contains
    ``neeq`` after its first character.  A type-4040/4050 link with an app id
    becomes an app artifact when ``get_android_domain`` resolves a domain.

    :param source_company_id: id stored as ``sourceCompanyId`` on the artifact.
    :param item: crawled item with a ``baseinfo`` dict (``name``, ``website``).
    :return: list of artifact dicts (possibly empty).
    """
    logger.info("parse_artifact")
    c = item["baseinfo"]
    artifacts = []

    # ``or ""`` also covers a ``website`` key that is present but None,
    # which ``c.get("website", "")`` does not.
    website = (c.get("website") or "").strip()
    website = url_helper.url_normalize(website)
    if website is None or website == "":
        return artifacts

    # The original tested ``website.find("https://")`` as a bare truth value
    # (-1 is truthy, 0 is falsy); test the scheme prefix explicitly.
    if not website.startswith(("http://", "https://")):
        website = "http://" + website

    link_type, market, app_id = url_helper.get_market(website)
    if link_type == 4010:
        if website.find('neeq') > 0:
            # Links pointing at neeq are deliberately not stored.
            pass
        else:
            artifacts.append({
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": website,
                "domain": app_id,
                "type": link_type
            })
    elif link_type in (4040, 4050) and app_id is not None:
        domain = get_android_domain(market, app_id)
        if domain is not None:
            artifacts.append({
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": website,
                "domain": domain,
                "type": link_type
            })

    return artifacts
Ejemplo n.º 12
0
def find_itunesId(itunesId, source, sourceId):
    """Tell whether ``itunesId`` already exists among a source company's artifacts.

    Scans the "source_artifact" entries fetched through ``find_mongo_data``;
    only type-4040 entries count.  The track id comes from ``domain`` when set
    (converted to int), otherwise from ``url_helper.get_market`` applied to
    the link.

    :param itunesId: track id to look for.
    :param source: source identifier forwarded to ``find_mongo_data``.
    :param sourceId: source company id forwarded to ``find_mongo_data``.
    :return: True if a matching artifact exists, else False.
    """
    artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId, nonexpand=False)
    # Check if itunesId already exists in artifacts
    for artifact in artifacts:
        if artifact["type"] != 4040:
            continue

        # Get trackid
        trackid = None
        if artifact["domain"] is None:
            (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
            if apptype != 4040:
                continue
        else:
            try:
                trackid = int(artifact["domain"])
            except (TypeError, ValueError):
                # Non-numeric domain: leave trackid as None (best effort).
                pass

        if trackid == itunesId:
            return True
    return False
Ejemplo n.º 13
0
def process():
    """Parse every pending item and persist company, score and artifacts.

    For each item returned by ``parser_db_util.find_process`` the company is
    parsed with ``parse_base`` and saved, its score is stored, its artifacts
    are re-classified with ``url_helper.get_market`` (dropping unclassified
    links and 4040/4050 links without an app id) and saved, and the item is
    marked as processed.  Items that ``parse_base`` rejects are skipped and
    are not marked as processed.
    """
    logger.info("itjuzi_next_parser begin...")

    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["url"])

        parsed = parse_base(item)
        if parsed is None:
            continue

        source_company_id = parser_db_util.save_company(
            parsed, SOURCE, download_crawler)
        logger.info("source_company_id=%s", source_company_id)
        parser_db_util.save_company_score(source_company_id, parsed["score"])

        kept = []
        for candidate in parsed["artifacts"]:
            link_type, app_market, app_id = url_helper.get_market(
                candidate["link"])
            if link_type is None:
                continue
            if link_type in (4040, 4050) and app_id is None:
                continue
            # The artifact dict is updated in place with the fresh values.
            candidate["type"] = link_type
            candidate["domain"] = app_id
            kept.append(candidate)

        parser_db_util.save_artifacts(source_company_id, kept)
        parser_db_util.update_processed(item["_id"])

    logger.info("itjuzi_next_parser end.")
Ejemplo n.º 14
0
def find(artifact):
    """Fetch the stored app document that an artifact link points to.

    The link is classified with ``url_helper.get_market``; only types 4040
    and 4050 are handled.  For markets 16010 and 16020 (baidu and 360 per the
    original comment) the apk name is first resolved through the android
    market collection; for every other market the app id is used directly.

    :param artifact: dict with a ``link`` key.
    :return: the document from ``collection_itunes`` (type 4040, looked up by
        ``trackId``) or ``collection_android`` (looked up by ``apkname``), or
        None when nothing can be resolved.
    """
    link_type, market, key = url_helper.get_market(artifact["link"])
    if link_type not in [4040, 4050]:
        return None

    if market in (16010, 16020):
        market_record = collection_android_market.find_one({
            "appmarket": market,
            "key_int": key
        })
        package = market_record["apkname"] if market_record else None
    else:
        package = key

    if package is None:
        return None
    if link_type == 4040:
        return collection_itunes.find_one({"trackId": key})
    return collection_android.find_one({"apkname": package})
Ejemplo n.º 15
0
        # For every record in ``bcs``: split its "|"-separated websites, look
        # up the companies matching each website and tally the outcome in the
        # num0..num3 counters (defined outside this fragment).
        for bc in bcs:
            # Distinct company ids matched by any website of this record.
            companyIds = []
            # num0 counts every record visited.
            num0 += 1

            websitestr = bc["websites"]

            if websitestr is None:
                # num1 counts records without any website.
                num1 += 1
                continue
            else:
                # companyIds = []
                websites = websitestr.split("|")

                for website in websites:
                    tp, market, app_id = url_helper.get_market(website)
                    # logger.info("%s-%s", type(tp),tp)
                    artifact = {"link": website, "domain": app_id, "type": tp}

                    # Append only ids not seen yet, keeping first-seen order.
                    for id in find_companies_by_artifacts([artifact]):
                        if id not in companyIds:
                            companyIds.append(id)
                    # companyIds.extend(find_companies_by_artifacts([artifact]))

                if len(companyIds) == 0:
                    # num2 counts records that matched no company.
                    num2 += 1
                elif len(companyIds) > 1:
                    # num3 counts records that matched several companies.
                    num3 += 1
                else:
                    # Exactly one match.  NOTE(review): num4 is logged here
                    # but not updated in the visible code - confirm whether
                    # it should be incremented.
                    logger.info("%s matched company: %s   %s", bc["symbol"],
                                companyIds, num4)
Ejemplo n.º 16
0
def parse_company(item):
    """Parse a chuangyepu company page into a source-company dict.

    ``item`` supplies ``key`` (used as ``sourceId``), ``content`` (the page
    HTML) and ``url``.  The page yields the logo, name, brief, website, the
    product description and the funding history; the website is turned into
    at most one artifact of type 4010, 4040 or 4050.

    :param item: crawled item.
    :return: ``{"status": "No_Name"}`` when the page has no company name,
        ``{"status": "No_Data"}`` when the section headings do not line up
        with the content blocks, otherwise the company dict with
        ``"status": 1``.
    """

    def text_of(node):
        # Some pyquery versions return None from text() for an empty
        # selection; normalise to a stripped string.
        return (node.text() or "").strip()

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    logo = d('.logo-block > img').attr('src')
    if logo == 'http://assets3.chuangyepu.com/chuangyepu/images/big-screenshot.png':
        # This URL is treated as "no logo".
        logo = None

    basic_info = d('div.col-md-9> div> table> tr> td').eq(1)
    name = text_of(pq(basic_info)('div.name'))
    brief = text_of(pq(basic_info)('div.desc').eq(0))
    # The original tested ``name is None``, which can never be true after
    # ``.strip()``; an empty name is the real "missing name" case.
    if not name:
        return {
            "status": "No_Name",
        }

    try:
        website = pq(basic_info)('div.desc').eq(1)('a').text().strip()
    except Exception:
        # Best effort: a page without a website link is still usable.
        website = None

    main_blocks = d('div.col-md-9> div.col-sm-12')
    h4s = d('div.col-md-9> h4')
    logger.info("main: %d, h4: %d", len(main_blocks), len(h4s))

    # Sections: 产品介绍 / 团队成员 / 媒体报道 / 融资历史.  Heading i is
    # paired with content block i + 1, so the counts must differ by one.
    if len(h4s) != len(main_blocks) - 1:
        return {
            "status": "No_Data",
        }

    desc = None
    latest_round = None
    latest_round_desc = None
    source_fundings = []

    for i in range(len(h4s)):
        heading = text_of(h4s.eq(i))
        block = main_blocks.eq(i + 1)

        if heading == "产品介绍":
            desc = text_of(block('div.content> div> p.desc'))
        elif heading == "融资历史":
            for li in block('table> tr'):
                line = pq(li)
                if (line.text() or "").find("时间") >= 0:
                    # Rows containing this text are skipped (presumably the
                    # table header row).
                    continue

                # The page only gives year/month; pin the day to the 1st.
                date = text_of(line('td.investment_date> span')) + "/01"
                try:
                    fundingDate = datetime.datetime.strptime(date, '%Y/%m/%d')
                except (TypeError, ValueError):
                    fundingDate = None

                roundStr = text_of(line('td.investment-round'))
                fundingRound, roundStr = chuangyepu_helper.getFundingRound(
                    roundStr)

                moneyStr = text_of(line('td.money'))
                (currency, investment,
                 precise) = chuangyepu_helper.getMoney(moneyStr)

                investors = []
                for f in line('td').eq(3)('p> a'):
                    iv = pq(f)
                    investor_url = iv.attr("href")
                    investor_name = text_of(iv)
                    # Only links containing "institutions" are kept; the
                    # last path segment becomes the investor key.
                    if investor_url and investor_url.find("institutions") >= 0:
                        investors.append({
                            "name": investor_name,
                            "key": investor_url.strip().split("/")[-1]
                        })

                source_fundings.append({
                    "investment": investment,
                    "precise": precise,
                    "round": fundingRound,
                    "roundDesc": roundStr,
                    "currency": currency,
                    "fundingDate": fundingDate,
                    "investors": investors
                })
                # Track the highest round seen so far.
                if latest_round is None or latest_round < fundingRound:
                    latest_round = fundingRound
                    latest_round_desc = roundStr
        # 团队成员 (not accurate member infos) and 媒体报道 are ignored.

    if desc is None:
        desc = brief

    if brief is not None and len(brief.decode('utf-8')) > 200:
        # Briefs longer than 200 characters are discarded.
        brief = None

    artifacts = []

    def add_artifact(artifact_type, domain):
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    link_type, app_market, app_id = url_helper.get_market(website)
    if link_type == 4010:
        if item["url"] != website:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                # flag False: keep the link but store no domain.
                add_artifact(4010, None if flag is False else domain)
    elif link_type == 4040:
        if app_id is not None:
            add_artifact(4040, app_id)
    elif link_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            # For these two markets the apk name is resolved through the
            # stored android market entry.
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            add_artifact(4050, domain)

    source_company = {
        "name": name,
        "fullName": None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": latest_round,
        "roundDesc": latest_round_desc,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": None,
        "address": None,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": None,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,
        "fundings": source_fundings,
        "status": 1
    }

    return source_company
Ejemplo n.º 17
0
def parse_company(item):
    """Parse a zhipin company page into a source-company dict.

    ``item`` supplies ``key`` (used as ``sourceId``), ``content`` (utf-8
    encoded HTML) and ``url``.  The page yields the names, logo, description,
    field, head count, product links (turned into artifacts) and managers
    (turned into members).

    :param item: crawled item, or None.
    :return: None when ``item`` is None; ``{"status": "No_Name"}`` when the
        short or full name is missing or the full name fails
        ``name_helper.name_check``; otherwise the company dict with
        ``"status": 1``.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))

    # ``or ""`` guards against pyquery versions whose text() returns None
    # for an empty selection.
    name = (d('h1.name').text() or "").strip()

    fullName = d('div.company-business> h4').text() or ""
    if fullName.find("来源") >= 0:
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)

    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        # Never let the short name be longer than the full name.
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }

    logo = d('div.company-logo> img').attr('src')
    # Keep the logo only when it is an http(s) URL or contains "default";
    # a missing ``src`` (None) crashed the original on ``startswith``.
    if logo is None or not (logo.startswith("http") or logo.find("default") >= 0):
        logo = None

    brief = None
    desc_text = d('div.job-sec> div.text').text() or ""
    logger.info("desc: %s", desc_text)

    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace("展开", "").replace("&nbsp;", "").strip()

    field = ''
    stage = ''
    headCount = ''
    address = ''
    try:
        # The summary line is used only when it has exactly three
        # space-separated parts, read as: stage, head count, field.
        summary = d('div.info-primary> p').text().strip().split(" ")
        if len(summary) == 3:
            stage, headCount, field = summary
    except Exception:
        # Best effort: keep the empty defaults when the line is missing.
        pass

    headCount = headCount.replace("人", "")

    def to_int(value):
        try:
            return int(value.strip())
        except ValueError:
            return None

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        # The original returned raw strings for "a-b" ranges but ints for a
        # single number; both bounds are now ints (or None when unparsable).
        bounds = headCount.split('-')
        min_staff = to_int(bounds[0])
        max_staff = to_int(bounds[1]) if len(bounds) > 1 else None

    # NOTE(review): the numeric stage computed below is not part of the
    # returned dict, which hard-codes "stage": 0 and "round": None - confirm
    # whether it was meant to be emitted.  Only ``funding_type`` is used.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    artifacts = []

    def add_artifact(artifact_type, link, domain):
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": None,
            "link": link,
            "domain": domain
        })

    for linkp in d('div.company-products> ul> li> div.text> div.name> a'):
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)

        link_type, app_market, app_id = url_helper.get_market(website)
        if link_type == 4010:
            # Skip links back to this page or to zhipin itself.
            if item["url"] != website and website.find("zhipin") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    # flag False: keep the link but store no domain.
                    add_artifact(4010, website, None if flag is False else domain)
        elif link_type == 4040:
            if app_id is not None:
                add_artifact(4040, website, app_id)
        elif link_type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                # For these two markets the apk name is resolved through the
                # stored android market entry.
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                add_artifact(4050, website, domain)
        # Types 4020/4030 never produced an artifact in the original (their
        # domain was always None), so they are simply ignored here.

    # parse members
    members = []
    for li in d('div.manager-list> div> ul >li> div'):
        mem = pq(li)
        try:
            logo_url = mem('div.info-user> img').attr('src')
            if logo_url is None:
                # The original dropped members without a photo through a
                # swallowed AttributeError; keep that outcome explicitly.
                continue
            if not logo_url.startswith("http"):
                # Scheme-less URL: prefix the scheme.
                logo_url = "http:" + logo_url

            members.append({
                'name': mem('p> span.name').text(),
                'photo_url': logo_url,
                'weibo': None,
                'location': None,
                'role': mem('p> span.job-title').text(),
                'description': mem('div.item_manager_content').text(),
                'education': None,
                'work': None
            })
        except Exception:
            # Best effort per member, but no longer silent.
            logger.exception("failed to parse a member of company %s", company_key)

    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html", "")
    else:
        sourceId2 = None

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": 0,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceId2": sourceId2,
        "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }

    return source_company
Ejemplo n.º 18
0
def parse_artifact(source_company_id, item):
    """Build the artifact list from a company's overview website.

    Reads ``item['content']['company_base']``: the optional
    ``properties['short_description']`` becomes the artifact description and
    ``overview_fields2['website']['value']`` the link.  Links containing
    ``twitter``, ``linkedin`` or ``facebook`` are ignored.  The remaining
    link becomes at most one artifact of type 4010, 4040 or 4050.

    :param source_company_id: accepted for interface compatibility; not used
        by this function.
    :param item: crawled item with ``name`` and ``content``.
    :return: list of artifact dicts (possibly empty).
    """
    name = item['name']
    logger.info('parse_artifact:%s' % name)

    artifacts = []
    company_base = item['content']['company_base']

    desc = ''
    properties = company_base['properties']
    # ``in`` replaces dict.has_key, which no longer exists in Python 3.
    if 'short_description' in properties:
        desc = properties['short_description']

    overview = company_base['overview_fields2']
    if 'website' not in overview:
        return artifacts

    website = url_helper.url_normalize(overview['website']['value'])
    if website is None:
        return artifacts
    for social in ('twitter', 'linkedin', 'facebook'):
        if website.find(social) != -1:
            return artifacts

    def add_artifact(artifact_type, domain):
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    link_type, app_market, app_id = url_helper.get_market(website)
    if link_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is not None:
            # flag False: keep the link but store no domain.
            add_artifact(4010, None if flag is False else domain)
    elif link_type == 4040:
        if app_id is not None:
            add_artifact(4040, app_id)
    elif link_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            # For these two markets the apk name is resolved through the
            # stored android market entry.
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            add_artifact(4050, domain)
    # Types 4020/4030 never produced an artifact in the original (their
    # domain was always None), so they are simply ignored here.

    return artifacts
Ejemplo n.º 19
0
def find_fof_alias(investorId):
    """Register AMAC manager names found via shared domains as fof aliases.

    Steps:
      1. Load the active ``fof`` row and its active, non-rejected aliases.
      2. Collect domains: the row's own domain (derived from its website via
         ``url_helper.get_market`` and stored when it was missing) plus the
         domains ``get_websit_domains`` returns for the AMAC managers already
         linked to the aliases.
      3. Look up AMAC managers by those domains and call
         ``add_fof_alias_from_amac`` for each manager name that is missing
         from the full-name aliases or from the manager-linked aliases.

    :param investorId: id of the row in the ``fof`` table.
    :return: None. Returns early when no active fof row exists.
    """
    conn = db.connect_torndb()
    try:
        investor = conn.get("select * from fof where (active is null or active='Y') and id=%s", investorId)
        if investor is None:
            logger.info("investor :%s is not available", investorId)
            return

        aliases = conn.query("select * from fof_alias where (active is null or active='Y') and "
                             "(verify is null or verify !='N') and fofId=%s", investorId)

        aliases_amac = conn.query("select iaa.* from fof_alias_amac iaa join fof_alias ia on "
                                  "iaa.fofAliasId = ia.id  where (ia.active is null or ia.active='Y') and "
                                  "(ia.verify is null or ia.verify !='N') and ia.fofId=%s", investorId)

        # Links of type 'M' that carry an AMAC id.
        manager_links = [alias for alias in aliases_amac
                         if alias["amacId"] is not None and alias["amacType"] == 'M']
        managerIds = [alias["amacId"] for alias in manager_links]
        managerIds_mysql = [alias["fofAliasId"] for alias in manager_links]

        # Type 12010 is the alias type used for company full names elsewhere
        # in this file (see name_check -> 12010 in update_investor).
        full_name_aliases = [alias for alias in aliases
                             if alias["name"] is not None and alias["type"] == 12010]
        names = [alias["name"] for alias in full_name_aliases]
        manager_names = [alias["name"] for alias in full_name_aliases
                         if alias["id"] in managerIds_mysql]

        logger.info("managerId: %s", ";".join(managerIds))
        logger.info("names: %s", ";".join(names))
        logger.info("manager_names: %s", ";".join(manager_names))

        # find by domains
        InvestorDms = []
        if investor["website"] is not None and investor["website"].strip() != "":
            if investor["domain"] is not None and investor["domain"].strip() != "":
                InvestorDms.append(investor["domain"])
            else:
                type, market, website_domain = url_helper.get_market(investor["website"])
                if type == 4010 and website_domain is not None:
                    # The row was read from `fof`, so the derived domain must
                    # be written back to `fof`; the previous statement updated
                    # the unrelated `investor` table with a fof id.
                    conn.update("update fof set domain=%s where id=%s", website_domain, investorId)
                    InvestorDms.append(website_domain)
        logger.info("investor: %s has self domain: %s", investor["name"], ":".join(InvestorDms))
    finally:
        # Always release the connection; it was previously left open on the
        # normal path.
        conn.close()

    if len(managerIds) > 0:
        for amac_domain in get_websit_domains(managerIds):
            if amac_domain not in InvestorDms:
                InvestorDms.append(amac_domain)

    logger.info("investor: %s has total domain: %s", investor["name"], ":".join(InvestorDms))

    if len(InvestorDms) == 0:
        return

    newMangers = find_amac_manager_by_domains(InvestorDms)
    logger.info("investor: %s has found %s amac managers by domain", investor["name"], len(newMangers))
    for newManger in newMangers:
        manager_name = newManger["managerName"]
        if manager_name is None:
            continue
        if manager_name not in names or manager_name not in manager_names:
            logger.info("investor: %s has a new alias: %s", investor["name"], manager_name)

            add_fof_alias_from_amac(investorId, manager_name,
                                    str(newManger["_id"]), addFund=True)
            names.append(manager_name)
        else:
            logger.info("investor: %s already has alias: %s", investor["name"], manager_name)
Ejemplo n.º 20
0
def update_investor(investor, source_investor):
    """Merge a source_investor record into an existing investor.

    Unless the investor is flagged online ("Y"), its fields are refreshed via
    ``replace``. Afterwards the function adds missing rows to
    ``investor_alias``, ``investor_artifact`` and ``investor_member``, and
    rebuilds the ``investor_contact`` rows created by user 139 from the
    source contacts.

    :param investor: investor row (reads "id", "name", "online").
    :param source_investor: source row (reads "id", the four name fields,
        "website" and, via ``.get``, "weibo" and "wechatId").
    :return: None.
    """
    conn = db.connect_torndb()
    try:
        investor_id = investor["id"]
        logger.info("****checking %s/%s/%s", investor["name"], investor["id"], source_investor["id"])
        if investor["online"] is not None and investor["online"] == "Y":
            logger.info("online not update!!!")
            time.sleep(1)
        else:
            logger.info("Update investor : %d with source_investor: %d ", investor_id, source_investor["id"])
            replace(investor, source_investor)

        # insert investor_alias
        candidate_names = [source_investor["name"], source_investor["fullName"],
                           source_investor["enName"], source_investor["enFullName"]]
        for name in candidate_names:
            if name is None or name.strip() == "":
                continue
            investor_alias = conn.get("select * from investor_alias where name=%s and "
                                      "investorId=%s and (active is null or active='Y') limit 1",
                                      name, investor_id)
            if investor_alias is not None:
                continue
            chinese, is_company = name_helper.name_check(name)
            alias_type = 12010 if is_company else 12020
            sql = "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())"
            logger.info("Add new investor alias: %s for %s", name, investor_id)
            conn.insert(sql, investor_id, name, alias_type)

        def new_artifact(link, domain, artifact_type):
            # All artifacts of one investor share the same name/description.
            return {
                "investorId": investor_id,
                "name": investor["name"],
                "description": None,
                "link": link,
                "domain": domain,
                "type": artifact_type
            }

        # insert investor_artifact:
        artifacts = []
        website = source_investor["website"]
        if website is not None and website != "":
            artifact_type, market, app_id = url_helper.get_market(website)
            if artifact_type == 4010:
                # Skip links that point at 36kr or baidu. The original test
                # joined the two checks with `and`, which required both
                # strings in one URL and so practically never matched.
                if website.find('36kr') > 0 or website.find("baidu") > 0:
                    pass
                else:
                    artifacts.append(new_artifact(website, app_id, artifact_type))
            elif (artifact_type == 4040 or artifact_type == 4050) and app_id is not None:
                domain = get_android_domain(market, app_id)
                if domain is not None:
                    artifacts.append(new_artifact(website, domain, artifact_type))

        weibo = source_investor.get("weibo", "")
        if weibo is not None and weibo.strip() != "" and weibo.find("weibo") >= 0:
            artifacts.append(new_artifact(weibo, None, 4030))

        weixin = source_investor.get("wechatId", "")
        if weixin is not None and weixin.strip() != "":
            artifacts.append(new_artifact(weixin, weixin, 4020))

        for art in artifacts:
            # Match on domain when one is available (never for type 4030),
            # otherwise fall back to the link.
            if art["type"] not in [4030] and art["domain"] is not None and art["domain"].strip() != "":
                iart = conn.get("select * from investor_artifact where type=%s and investorId=%s and domain=%s limit 1",
                                art["type"], investor_id, art["domain"])
            else:
                iart = conn.get("select * from investor_artifact where type=%s and investorId=%s and link=%s limit 1",
                                art["type"], investor_id, art["link"])

            if iart is None:
                logger.info("add new artifact: %s/%s/%s", art["type"], art["name"], art["link"])
                sql = "insert investor_artifact(investorId,type, name, link, domain, createTime,modifyTime) " \
                      "values(%s,%s,%s,%s,%s,now(),now())"
                conn.insert(sql, investor_id, art["type"], art["name"], art["link"], art["domain"])

        # insert contact: rows created by user 139 are replaced wholesale,
        # but only when the source actually has contacts.
        contacts = conn.query("select * from source_investor_contact where sourceInvestorId=%s", source_investor["id"])
        if len(contacts) > 0:
            conn.execute("delete from investor_contact where investorId=%s and createUser=139", investor_id)
            for s in contacts:
                sql = "insert investor_contact(investorId, locationId, address, phone, email, createUser, " \
                      "createTime,modifyTime) " \
                      "values(%s,%s,%s,%s,%s,%s,now(),now())"
                conn.insert(sql, investor_id, s["locationId"], s["address"], s["phone"], s["email"], 139)

        # insert member: add members that are not present yet (matched by name)
        members = conn.query("select * from source_investor_member where sourceInvestorId=%s", source_investor["id"])
        for m in members:
            member = conn.get("select * from investor_member where investorId=%s and name=%s limit 1",
                              investor_id, m["name"])
            if member is not None:
                continue
            sql = "insert investor_member(investorId,name,logo, position, description,createUser,createTime,modifyTime) " \
                  "values(%s,%s,%s,%s,%s,%s,now(),now())"
            conn.insert(sql, investor_id, m["name"], m["logo"], m["position"],
                        m["description"], 139)
    finally:
        # Release the connection even when one of the statements above raises.
        conn.close()
Ejemplo n.º 21
0
def parse_base(item):
    """Extract the basic company profile from a crawled company page.

    The HTML in ``item["content"]`` is loaded with ``pq`` and queried with
    CSS selectors for names, establish date, location, status, funding
    need, brief, field, tags, description, logo, website links and funding
    round.

    :param item: mapping with "key" (source id) and "content" (HTML), or None.
    :return: None when ``item`` is None or when neither a short name nor a
        full name could be extracted; otherwise a dict describing the
        company, including an "artifacts" list built from the page's links.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # The title reads "<product>/<company short name>"; without a slash the
    # product name doubles as the short name.
    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if product_name is None or product_name.strip() == "":
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/",1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    if company_name is None or company_name.strip() == "":
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","")
    result = util.re_get_result(r'(\d*)\.(\d*)', text)
    if result is not None:
        (year, month) = result
        try:
            if int(month) > 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
        except ValueError:
            # The pattern also matches an empty year or month 0; treat such
            # values as "date unknown" instead of aborting the whole parse.
            establish_date = None
    logger.info("establish date: %s" % establish_date)

    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        # Prefer the city, fall back to the province.
        result = parser_db_util.get_location(city)
        if result is None:
            result = parser_db_util.get_location(province)
        if result is not None:
            locationId = result["locationId"]

    if locationId == 0:
        # Last resort: derive the location from the company name.
        loc1,loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            result = parser_db_util.get_location(loc1)
            if result is not None:
                locationId = result["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % text)
    if text == "融资需求 · 需要融资" or text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)
    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)

    # str.find returns -1 when the marker is absent, which is truthy; the
    # comparison must be explicit or every real brief gets wiped.
    if brief.find("暂未收录") >= 0:
        brief = ""
    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",")
    logger.info("tags: %s" % tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\
        replace("购买数据请联系","").replace('*****@*****.**',"").replace("itjuzi是一家数据服务公司","").strip()
    logger.info("********desc: %s" % desc)

    #logo
    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    artifacts = []
    # NOTE(review): passes 2 and 3 use the same selector, so links matched by
    # it are collected twice - confirm whether that is intended.
    for ty in [1,2,3]:
        if ty == 1:
            was = d('div.link-line> a')
        else:
            was = d('div.link-line> span.weblink,span.webTink> a')

        for wa in was:
            # Both the href attribute and the visible text are tried as URL.
            webs = []
            for read_value in (lambda node: node.attr("href"),
                               lambda node: node.text()):
                try:
                    website = read_value(pq(wa)).strip()
                    if website == "http://%e6%9a%82%e6%97%a0" or website == "http://tt":
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    webs.append(website)
                except Exception:
                    # Best effort: a missing attribute or a value that cannot
                    # be normalized just yields no candidate.
                    pass

            for website in webs:
                type, app_market, app_id = url_helper.get_market(website)
                if type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

                elif type == 4020:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })

                elif type == 4030:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })

                elif type == 4040:
                    domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })
                elif type == 4050:
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        # For these two markets the apk name stored for the
                        # app is used as the domain.
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

    # funding status
    roundStr = d('span.t-small.c-green').text().replace("(","").replace(")","").replace("获投状态:","").strip()
    fundingRound,roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type":41010,
        "artifacts":artifacts
    }
Ejemplo n.º 22
0
def parse_company(item):
    """Build a source-company dict from a crawled JSON company record.

    Reads the company id from ``item["postdata"]["id"]`` and the profile
    from ``item["data"]["basic"]``; website links come from the profile's
    "gw_link"/"source_gw_link" and from ``item["data"]["productinfos"]``.

    :param item: mapping with "postdata", "data" and "url".
    :return: ``{"status": "No_Name"}`` when the product name or the company
        full name is missing; otherwise the company dict including an
        "artifacts" list derived from the website links.
    """
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    #company basic info
    c = item["data"]["basic"]

    tags = c["tags"]
    # A missing tag string is passed through as None instead of crashing.
    tags_str = tags.replace("|", ",") if tags is not None else None

    logo = c["icon"]
    if logo is not None and logo.find("product_default.png") >= 0:
        # The placeholder image means "no logo".
        logo = None

    establish_date = None
    if "open_time" in c:
        try:
            establish_date = datetime.datetime.strptime(c["open_time"], "%Y-%m-%d")
        except (TypeError, ValueError):
            # Unparseable or missing date: leave it unknown.
            pass

    address1 = c["province"] if "province" in c else None
    address2 = c["city"] if "city" in c else None

    # Resolve the location from the city first, then from the province.
    location_id = 0
    if address2 is not None and address2.strip() != "":
        location = parser_db_util.get_location(address2)
        if location is not None:
            location_id = location["locationId"]

    if location_id == 0 and address1 is not None and address1.strip() != "":
        location = parser_db_util.get_location(address1)
        if location is not None:
            location_id = location["locationId"]

    fullName = c["company"]
    if fullName is None or fullName.strip() == "":
        fullName = None
    else:
        fullName = fullName.replace("_","")
        # Drop anything that follows the last occurrence of the company suffix.
        idx = fullName.rfind(u"公司")
        if idx != -1:
            fullName = fullName[:(idx+len(u"公司"))]
        fullName = name_helper.company_name_normalize(fullName)

    name = c["product"]
    desc = ""
    brief = ""

    if "desc" in c:
        desc = c["desc"].strip()

    if "yewu" in c:
        brief = c["yewu"].strip()

    if name is None or fullName is None:
        return {
            "status": "No_Name",
        }

    # Gather candidate links in their original priority order, dropping
    # blanks and duplicates.
    candidates = []
    for key in ("gw_link", "source_gw_link"):
        if key in c:
            candidates.append(c[key])
    if "productinfos" in item["data"]:
        for pi in item["data"]["productinfos"]:
            if "link" in pi:
                candidates.append(pi["link"])

    websites = []
    for link in candidates:
        if link.strip() != "" and link not in websites:
            websites.append(link)

    artifacts = []
    for website in websites:
        type, app_market, app_id = url_helper.get_market(website)
        domain = None
        if type == 4010:
            # Ignore links back to the crawled page or to qimingpian.com.
            if item["url"] == website or website.find("qimingpian.com") != -1:
                continue
            flag, domain = url_helper.get_domain(website)
            if flag is None:
                continue
            if flag is False:
                domain = None
        elif type == 4040:
            domain = app_id
            if domain is None:
                continue
        elif type == 4050:
            if app_market == 16010 or app_market == 16020:
                # For these two markets the stored apk name is the domain.
                android_app = parser_db_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is None:
                continue
        else:
            # Types 4020/4030 and unrecognized links produce no artifact
            # (the former 4020/4030 branch could never append anything).
            continue

        artifacts.append({
            "type": type,
            "name": name,
            "description": brief,
            "link": website,
            "domain": domain
        })

    return {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,

    }
Ejemplo n.º 23
0
def parse_company(item):
    """Build a source-company dict from a crawled HTML company page.

    The HTML in ``item["content"]`` is loaded with ``pq``; names, logo,
    brief, description, basic info (field, stage, head count, location,
    address) and the company website are read through CSS selectors.

    :param item: mapping with "key", "content" (UTF-8 encoded HTML) and
        "url", or None.
    :return: None when ``item`` is None; ``{"status": "No_Name"}`` when the
        page is a placeholder, the names are missing or contain "拉勾", or
        the full name does not pass ``name_helper.name_check``; otherwise
        the company dict with ``"status": 1``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    # logo_id processed in parser_db_util

    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }
    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = title_link.attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    # Fall back to the full name when the short name is longer or blank.
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    logo = d('.top_info_wrap > img').attr('src')
    # attr() yields None when the image is absent; only touch real values.
    if logo is not None:
        if not logo.startswith("http"):
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()

    desc = None
    if not (u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10):
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is not None:
            desc_html = desc_html.replace('<span class="text_over">展开</span>', '')
            # Reduce the markup to plain text.
            desc = BeautifulSoup(desc_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception:
        # Best effort: keep whatever was read before the failure.
        pass

    # Cut the head count at the unit character. This runs on its own so a
    # head count without the unit no longer prevents location and address
    # from being read.
    try:
        headCount = headCount[0:headCount.index(u'人')]
    except ValueError:
        pass

    headCount = headCount.replace("people", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # Range bounds are passed on as the raw strings, as before.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            max_staff = None
            try:
                min_staff = int(staffarr[0].strip())
            except (TypeError, ValueError):
                min_staff = None

    # Only the "no funding needed" stage influences the result; the round
    # itself is returned as None, so no round code is derived from stage.
    funding_type = 8010 if stage == '不需要融资' else 0

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    type, app_market, app_id = url_helper.get_market(website)
    if type == 4010:
        # Ignore links back to the crawled page or to lagou.com itself.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            # For these two markets the stored apk name is the domain.
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    # Types 4020/4030 produce no artifact (the former branch for them could
    # never append anything).

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }

    return source_company
Ejemplo n.º 24
0
logger = loghelper.get_logger("prepare_source_artifact_domain")


if __name__ == "__main__":
    # Page through source_artifact in id order, 1000 rows at a time, and
    # fill in the domain of rows that do not have one yet.
    offset = 0
    conn = db.connect_torndb()
    while True:
        rows = list(conn.query("select * from source_artifact order by id limit %s,1000", offset))
        if not rows:
            # An empty page means the table has been exhausted.
            break

        for row in rows:
            current = row["domain"]
            if current is not None and current.strip() != "":
                # Already has a domain; nothing to do.
                continue

            kind = row["type"]
            if kind == 4010:
                # Type 4010: the domain is taken from the normalized link.
                link = url_helper.url_normalize(row["link"])
                flag, domain = url_helper.get_domain(link)
                if flag is True:
                    logger.info("%s, %s %s %s", row["id"], kind, link, domain)
                    conn.update("update source_artifact set domain=%s where id=%s", domain, row["id"])
            elif kind in (4040, 4050):
                # Types 4040/4050: type and domain come from get_market.
                apptype, appmarket, trackid = url_helper.get_market(row["link"])
                if apptype in (4040, 4050) and trackid is not None:
                    logger.info("%s %s %s %s", row["id"], apptype, row["link"], trackid)
                    conn.update("update source_artifact set type=%s, domain=%s where id=%s", apptype, trackid, row["id"])

        offset += 1000
    conn.close()
Ejemplo n.º 25
0
def _lagou_absolute_url(url):
    """Return *url* unchanged if it starts with ``http``, else prefix ``http:``.

    Presumably the page uses protocol-relative image URLs (``//host/...``);
    verify against real pages.
    """
    if url.startswith("http"):
        return url
    return "http:" + url


def _lagou_text(d, selector):
    """Return the text of *selector* in document *d*, or ``''`` if the lookup raises."""
    try:
        return d(selector).text()
    except Exception:
        return ''


def _lagou_to_int(value):
    """Convert a (possibly padded) numeric string to int; return None on failure."""
    try:
        return int(value.strip())
    except (AttributeError, TypeError, ValueError):
        return None


def _lagou_head_count(text):
    """Turn a head-count label into a ``(min_staff, max_staff)`` pair.

    Everything from the first ``人`` on is dropped, as is the word "people".
    ``少于15`` maps to ``(1, 15)``; ``A-B`` maps to ``(A, B)``; a single number
    maps to ``(N, None)``.  Parts that are not numeric become ``None``.
    """
    text = text or ''
    cut = text.find(u'人')
    if cut >= 0:
        text = text[:cut]
    text = text.replace("people", "")

    if text == "少于15":
        return 1, 15

    parts = text.split('-')
    if len(parts) > 1:
        # Convert both bounds so every branch yields ints (or None), not a
        # mix of ints and raw strings.
        return _lagou_to_int(parts[0]), _lagou_to_int(parts[1])
    return _lagou_to_int(parts[0]), None


def _lagou_artifacts(item, name, desc, website):
    """Build the artifact list (zero or one entry) for the company's link.

    * 4010: kept unless the link equals ``item["url"]`` or points at
      lagou.com, or ``url_helper.get_domain`` returns a ``None`` flag; the
      domain is cleared when the flag is ``False``.
    * 4040: kept when ``get_market`` supplies an app id, used as the domain.
    * 4050: for markets 16010/16020 the domain is the ``apkname`` found via
      ``parser_mongo_util.find_android_market``; otherwise the app id.
    * any other type (including 4020/4030) yields no artifact.
    """
    kind, app_market, app_id = url_helper.get_market(website)
    domain = None

    if kind == 4010:
        if item["url"] == website or website.find("lagou.com") != -1:
            return []
        flag, domain = url_helper.get_domain(website)
        if flag is None:
            return []
        if flag is False:
            domain = None
    elif kind == 4040:
        domain = app_id
        if domain is None:
            return []
    elif kind == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            domain = android_app["apkname"] if android_app else None
        else:
            domain = app_id
        if domain is None:
            return []
    else:
        return []

    return [{
        "type": kind,
        "name": name,
        "description": desc,
        "link": website,
        "domain": domain
    }]


def _lagou_members(d):
    """Collect the entries of ``.manager_list`` as source-member dicts."""
    members = []
    for li in d('.manager_list > li'):
        mem = pq(li)
        try:
            photo = mem('img').attr('src')
            if photo is None:
                # Entries without a portrait are skipped, which matches what
                # the previous catch-all error handling did implicitly.
                continue
            photo = _lagou_absolute_url(photo)

            member_link = mem('p.item_manager_name > a').attr('href')
            weibo = None
            if member_link is not None and 'weibo.com' in member_link:
                weibo = member_link

            members.append({
                'name': mem('p.item_manager_name > span').text(),
                'photo_url': photo,
                'weibo': weibo,
                'location': None,
                'role': mem('p.item_manager_title').text(),
                'description': mem('div.item_manager_content').text(),
                'education': None,
                'work': None
            })
        except Exception:
            # Best effort: one malformed entry must not abort the company.
            logger.exception("failed to parse a manager entry")
    return members


def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    Args:
        item: dict with at least ``key`` (company id), ``content`` (page
            HTML; ``.decode("utf-8")`` is called on it) and ``url``, or None.

    Returns:
        None when *item* is None; ``{"status": "No_Name"}`` when the page is
        a placeholder or no acceptable company name can be extracted;
        otherwise a dict describing the company, including ``artifacts`` and
        ``members`` lists and ``status`` 1.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    # Placeholder page ("this company's homepage is still under construction").
    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = name_helper.company_name_normalize(title_link.attr('title'))
    if name is None or fullName is None or (name.find("拉勾") >= 0
                                            and company_key != "147"):
        return {
            "status": "No_Name",
        }
    # Fall back to the full name when the short name is longer or blank.
    if len(name) > len(fullName) or name.strip() == "":
        name = fullName

    _, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    # The logo <img> may be missing entirely; default logos are discarded.
    logo = d('.top_info_wrap > img').attr('src')
    if logo is None or logo.find("logo_default") >= 0:
        logo = None
    else:
        logo = _lagou_absolute_url(logo)

    brief = d('.company_word').text()

    desc = None
    desc_text = d('.company_intro_text').text() or ''
    if u"该公司尚未添加公司介绍" not in desc_text and len(desc_text) >= 10:
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is not None:
            # Strip the "expand" toggle, then reduce the markup to plain text.
            desc_html = desc_html.replace(
                '<span class="text_over">展开</span>', '')
            desc = BeautifulSoup(desc_html, "lxml").getText()

    # Each field is read independently so that one odd value (for example a
    # head count without the expected suffix) cannot blank out the others.
    basic = '#basic_container > .item_content >ul > li:eq(%d) > span'
    field = _lagou_text(d, basic % 0)
    stage = _lagou_text(d, basic % 1)
    headCount = _lagou_text(d, basic % 2)
    location = _lagou_text(d, basic % 3)
    address = _lagou_text(d, '.con_mlist_ul > li:eq(0) > p:eq(1)')

    min_staff, max_staff = _lagou_head_count(headCount)

    # "round" is not populated from the stage label; the only stage that
    # influences the result is "no funding needed", recorded as 8010.
    funding_type = 8010 if stage == '不需要融资' else 0

    location_id = 0
    location_new = parser_mongo_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s", website)

    artifacts = _lagou_artifacts(item, name, desc, website)
    members = _lagou_members(d)

    source_company = {
        "name":
        name,
        "fullName":
        fullName if fullName is not None and fullName.strip() != "" else None,
        "description":
        desc,
        "brief":
        brief,
        "round":
        None,
        "roundDesc":
        None,
        "companyStatus":
        2010,
        'fundingType':
        funding_type,
        "locationId":
        int(location_id),
        "address":
        address,
        "phone":
        None,
        "establishDate":
        None,
        "logo":
        logo,
        "source":
        SOURCE,
        "sourceId":
        company_key,
        "sourceUrl":
        "https://www.lagou.com/gongsi/%s.html" % company_key,
        "field":
        field,
        "headCountMin":
        min_staff,
        "headCountMax":
        max_staff,
        "artifacts":
        artifacts,
        "members":
        members,
        "status":
        1
    }

    return source_company
Ejemplo n.º 26
0
def parse_artifact(item):
    """Extract artifact dicts from a crawled company record.

    Reads ``item["content"]["company_base"]["data"]["company"]`` and emits,
    in this order, artifacts for its ``website``, ``weibo`` (type 4030),
    ``weixin`` (type 4020), ``iphoneAppstoreLink``, ``ipadAppstoreLink`` and
    ``androidLink`` fields.  Missing, ``None`` or blank fields are skipped.

    Returns:
        list of dicts with keys ``sourceCompanyId`` (always None), ``name``,
        ``description`` (always None), ``link``, ``domain`` and ``type``.
    """
    logger.info("parse_artifact")
    c = item["content"]["company_base"]["data"]["company"]
    artifacts = []

    def field_text(key):
        # dict.get's default only covers a *missing* key; a key stored with
        # None must be treated as empty too instead of crashing on .strip().
        return (c.get(key) or "").strip()

    def add(link, domain, kind):
        artifacts.append({
            "sourceCompanyId": None,
            "name": c["name"],
            "description": None,
            "link": link,
            "domain": domain,
            "type": kind
        })

    # Website: either a plain site (4010) or a link into an app market.
    website = url_helper.url_normalize(field_text("website"))
    if website is not None and website != "":
        kind, market, app_id = url_helper.get_market(website)
        if kind == 4010:
            # Skip 36kr.com links unless the company name itself contains "36".
            if not (website.find('36kr.com') > 0
                    and c["name"].find('36') == -1):
                add(website, app_id, kind)
        elif kind in (4040, 4050) and app_id is not None:
            domain = get_android_domain(market, app_id)
            if domain is not None:
                add(website, domain, kind)

    weibo = field_text("weibo")
    if weibo != "":
        add(weibo, None, 4030)

    weixin = field_text("weixin")
    if weixin != "":
        add(weixin, None, 4020)

    # App-store links share one handling path.
    for key in ("iphoneAppstoreLink", "ipadAppstoreLink", "androidLink"):
        link = field_text(key)
        if link == "":
            continue
        kind, market, app_id = url_helper.get_market(link)
        domain = get_android_domain(market, app_id)
        if kind in (4040, 4050) and domain is not None:
            add(link, domain, kind)

    return artifacts
Ejemplo n.º 27
0
def parse_artifact(source_company_id, item):
    """Extract de-duplicated artifact dicts from a crawled company record.

    Two records are scanned in order: ``content.company_base.data`` and
    ``content.product.data.companyProduct``.  For each one the ``website``,
    ``weibo`` (type 4030), ``weixin`` (type 4020), ``ios`` and ``android``
    fields are inspected; missing, ``None`` or blank fields are skipped, and
    a link that already produced an artifact is not emitted again.
    iPad store links are not handled.

    Args:
        source_company_id: value stored as ``sourceCompanyId`` on every
            artifact.
        item: crawled record with the ``content`` structure described above.

    Returns:
        list of dicts with keys ``sourceCompanyId``, ``name``,
        ``description`` (always None), ``link``, ``domain`` and ``type``.
    """
    logger.info("parse_artifact")
    cc = item["content"]["company_base"]["data"]
    cp = item["content"]["product"]["data"]["companyProduct"]
    artifacts = []
    links = []  # links that already produced an artifact

    def field_text(record, key):
        # dict.get's default only covers a *missing* key; a key stored with
        # None must be treated as empty too instead of crashing on .strip().
        return (record.get(key) or "").strip()

    def add(record, link, domain, kind):
        artifacts.append({
            "sourceCompanyId": source_company_id,
            "name": record["name"],
            "description": None,
            "link": link,
            "domain": domain,
            "type": kind
        })
        links.append(link)

    for c in [cc, cp]:
        # Website: either a plain site (4010) or a link into an app market.
        website = url_helper.url_normalize(field_text(c, "website"))
        if website is not None and website != "" and website not in links:
            kind, market, app_id = url_helper.get_market(website)
            if kind == 4010:
                # Skip 36kr.com links unless the name itself contains "36".
                if not (website.find('36kr.com') > 0
                        and c["name"].find('36') == -1):
                    add(c, website, app_id, kind)
            elif kind in (4040, 4050) and app_id is not None:
                domain = get_android_domain(market, app_id)
                if domain is not None:
                    add(c, website, domain, kind)

        # Only values that actually mention "weibo" are accepted.
        weibo = field_text(c, "weibo")
        if weibo != "" and weibo.find("weibo") >= 0 and weibo not in links:
            add(c, weibo, None, 4030)

        # The weixin value doubles as its own domain.
        weixin = field_text(c, "weixin")
        if weixin != "" and weixin not in links:
            add(c, weixin, weixin, 4020)

        # App-store links share one handling path.
        for key in ("ios", "android"):
            link = field_text(c, key)
            if link == "" or link in links:
                continue
            kind, market, app_id = url_helper.get_market(link)
            domain = get_android_domain(market, app_id)
            if kind in (4040, 4050) and domain is not None:
                add(c, link, domain, kind)

    return artifacts
Ejemplo n.º 28
0
def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler, test=False):
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")

    expand_clean(source, sourceId)
    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})
    # exit()
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is not None and company_fullname.strip() != "":
        company_fullname = name_helper.company_name_normalize(company_fullname)

        scnames = sourcecompany["source_company_name"]
        check_fullname = False
        for scname in scnames:
            if scname["name"] == company_fullname:
                check_fullname = True
                break
        if check_fullname is False:
            (chinese, company) = name_helper.name_check(company_fullname)
            if chinese is True:
                chinese_type = "Y"
            else:
                chinese_type = "N"
            scname_data ={
                "name": company_fullname,
                "chinese": chinese_type,
                "type": 12010,
            }
            save_mongo_source_company_name(source, sourceId, scname_data)

    round = 1

    while True:
        if round >= 6:
            collection_source_company.update_one({"_id": sourcecompany["_id"]},{'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)

        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))

        # Check if there are new stuff which need to do expansion
        if len(source_company_names) == 0 and len(artifacts) == 0 and len(main_beianhaos) == 0:
            collection_source_company.update_one({"_id": sourcecompany["_id"]}, {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, round)

        # Step A/1:按公司名,备案查询
        logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
        for source_company_name in source_company_names:
            # Only check chinese company name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            if source_company_name["chinese"] is None:
                (chinese, companyName) = name_helper.name_check(source_company_name["name"])
            else:
                chinese = source_company_name["chinese"]

            if chinese != "Y":
                continue

            check_name = list(collection_beian.find({"organizer": source_company_name["name"]}))
            # Case that one company_name has multiple beian# : 上海汇翼->(today.ai/teambition.com)#If only one found in Mongo.beian(organizer) it is fine
            if len(check_name) == 0:
                if test:
                    items_beianlinks = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_company_name(source_company_name["name"])
                    save_collection_beian(collection_beian, items_beianlinks)  # insert infos into Mongo.beian
            else:
                items_beianlinks = check_name
            save_beian_artifacts(items_beianlinks, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_beianlinks, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_beianlinks, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao

            # beian
            # 发现更多的artifact(website)和公司名,主备案号

        # Step A/2:按domian,备案查询
        logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
        for artifact in artifacts:
            # Only check is artifact is a website
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                link = url_helper.url_normalize(artifact["link"])
                (flag, domain) = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]

            if domain is None or domain.strip() == "":
                continue

            check_domain = list(collection_beian.find({"domain": domain}))

            if len(check_domain) == 0:
                if test:
                    items_merge =[]
                else:
                    items_beianlinks = beian_links_crawler.query_by_domain(domain)
                    items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)

                    save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
            else:
                items_merge = check_domain

            # filer by check domain to avoid sinaapp.cn case
            items_merge = filter_domain(items_merge, domain)

            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao

            # beian
            # 发现更多的artifact(website)和公司名,主备案号

        # Step A/3 #按主备案号查询
        logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
        for main_beianhao in main_beianhaos:
            mainBeianhao = main_beianhao["mainBeianhao"]
            check_mainBeianhao = collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao})

            if check_mainBeianhao is None:
                if test:
                    items_merge =[]
                else:
                    items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                    items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)

                    save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
                # if mainBeianhao could be found in two links
                if len(items_merge) > 0:
                    items_main_beianhao = [{"mainBeianhao": mainBeianhao}]
                    save_collection_mainBeianhao(collection_main_beianhao, items_main_beianhao)  # insert mainBeianhao into Mongo.main_beianhao
            else:
                items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))

            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
            # 发现更多的artifact(website)和公司名

        # itunes扩展
        # Step B/1 #查询itunes artifact
        logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)

        itunes_company_enames = {}
        app_by_name = {}

        for artifact in artifacts:
            if artifact["type"] != 4040:
                continue
            # Get trackid
            trackid = None
            if artifact["domain"] is None:
                (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
                if apptype != 4040:
                    continue

            else:
                try:
                    trackid = int(artifact["domain"])
                except:
                    pass

            if trackid is not None:
                app = collection_itunes.find_one({"trackId": trackid})

                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_itunes(app, artifact, source, sourceId)  # 存在: copy from mongo.itunes
                    if app.has_key("offline") and app["offline"] is True:
                        set_artifact_active(artifact, "Offline", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)

                    english, is_company = name_helper.english_name_check(app["sellerName"])
                    if english and is_company:
                        itunes_company_enames["sellerName"] = 1
                        app_by_name = app
            else:
                set_artifact_active(artifact, "N", source, sourceId)

        # save the only english name
        if len(itunes_company_enames) == 1:
            company_name = collection_source_company.find_one({"source": source, "sourceId": sourceId, "source_company_name": {"$elemMatch": {"type": 12010, "chinese":"N"}}})

            if company_name is None:
                save_company_name(app_by_name, "sellerName", source, sourceId)

        # Step B/2根据公司名查询更多的itunes artifact
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            '''
            check_itunes_producers = list(collection_itunes.find({"developer": source_company_name["name"]}))
            if len(check_itunes_producers) > 0:
                for app in check_itunes_producers:
                    # Check if itunesId is already existed in artifacts
                    if find_itunesId(app["trackId"], source_company_id):
                        pass
                    else:
                        source_artifact_id = save_itunes_artifact(app, source_company_id)
                        #save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    save_company_name(app, "developer", source_company_id)
            '''
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_itunes_sellers = list(collection_itunes.find({"sellerName": source_company_name["name"]}))
            if len(check_itunes_sellers) > 0:
                '''
                domains = {}
                for app in check_itunes_sellers:
                    sellerUrl = app.get("sellerUrl")
                    flag ,domain = url_helper.get_domain(sellerUrl)
                    if flag is not None and domain is not None:
                        domains[domain] = 1
                '''
                lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
                artifact_status = check_source_artifact(source, sourceId)

                for app in check_itunes_sellers:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)

                        if app.has_key("sellerUrl"):
                            # if find_link(app["sellerUrl"], source_company_id) or check_source_artifact(source_company_id):
                            if artifact_status:
                                pass
                            elif lens_domain == 1:
                                artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)

                                if artifact_id is not None:
                                    artifact_status = True

                            # comment due to incorrect expand
                            '''
                            if app.has_key("supportUrl"):
                                if find_link(app["supportUrl"], source_company_id):
                                    pass
                                else:
                                    save_itunesSupportUrl_artifact(app, source_company_id)
                            '''

                            # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                            # save_company_name(app, "sellerName", source_company_id)

        # Step B/3根据域名查询更多的itunes artifact
        logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue

            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]

            if domain is None or domain.strip() == "":
                continue

            if domain in itunesDomainEx:
                continue

            check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
            if len(check_itunes_sellerDomains) > 0:

                lens_company_names = count_company_names(check_itunes_sellerDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_itunes_sellerDomains:

                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)

                    if company_name_status:
                        pass
                    elif lens_company_names == 1:

                        # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

            check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
            if len(check_itunes_supportDomains) > 0 and len(check_itunes_supportDomains) < 100:

                lens_company_names = count_company_names(check_itunes_supportDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_itunes_supportDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                        # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

        # 发现更多的artifact(website)和公司名,check if existed in source_art..and company_name


        # android扩展
        # Step C/1#查询android artifact
        logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4050:
                continue
            # Get apkname
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                # Get apkname of baidu and 360 from android market
                if apptype != 4050:
                    continue

                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]

            if apkname is not None:
                app = collection_android.find_one({"apkname": apkname})

                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_android(app, artifact, source, sourceId)  # 存在: copy from mongo.android
                    set_artifact_active(artifact, "Y", source, sourceId)

                    # chinese, is_company = name_helper.name_check(app["author"])
                    # if is_company:
                    #     save_company_name(app, "author", source_company_id)
            else:
                set_artifact_active(artifact, "N", source, sourceId)

        # Step C/2根据公司名查询更多的android artifact
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_android_authors = list(collection_android.find({"author": source_company_name["name"]}))
            if len(check_android_authors) > 0 and len(check_android_authors) < 200:

                lens_domain = count_domains(check_android_authors, "website")
                artifact_status = check_source_artifact(source, sourceId)

                # check if author is consistent
                for app in check_android_authors:
                    # Check if AnId have one 4010
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)

                        if artifact_status:
                            pass
                        elif lens_domain == 1:
                            artifact_id = save_androidWebsite_artifact(app, source, sourceId)

                            if artifact_id is not None:
                                artifact_status = True

                                # save_artifact_android_rel(app["_id"], source_artifact_id)
                                # save_company_name(app, "author", source_company_id)

        # Step C/3根据域名查询更多的android artifact
        logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue

            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]

            if domain is None or domain.strip() == "":
                continue

            check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
            if len(check_android_websiteDomains) > 0:

                lens_company_names = count_company_names(check_android_websiteDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_android_websiteDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                        # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:

                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True

            check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
            # add threshold to avoid case: domain: com.wowotuan
            if len(check_android_apknameDomains) > 0 and len(check_android_apknameDomains) < 100:

                lens_company_names = count_company_names(check_android_apknameDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_android_apknameDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                        # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:

                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
        # 发现更多的artifact(website)和公司名

        # 曾用名 TODO

        # 清洗website artfiact
        # 查询meta信息, 标记不能访问的?website?, 处理转跳的website
        logger.info("source: %s, sourceId: %s website meta", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["link"] is None or artifact["link"].strip() == "":
                # set_active("source_artifact", "N", artifact["id"])
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            url = artifact["link"].strip()
            meta = collection_website.find_one({"url": url})
            if meta is None or meta["httpcode"]==404:
                meta = website.get_meta_info(url)
                if meta:
                    websiteId = save_collection_website(collection_website, meta)
                    if websiteId is not None and not test:
                        #screenshot_wesbite(collection_website, websiteId, screenshot_crawler)
                        pass
                else:
                    meta = {
                        "url": artifact["link"],
                        "httpcode": 404
                    }
                    websiteId = save_collection_website(collection_website, meta)
                    set_artifact_active(artifact, "N", source, sourceId)

            if meta:
                # 发生转跳
                # logger.info(meta)
                if meta["httpcode"] == 200:
                    redirect_url = meta.get("redirect_url")
                    if artifact["link"] != redirect_url:
                        url = url_helper.url_normalize(meta["redirect_url"])
                        (flag_new, domain_new) = url_helper.get_domain(url)

                        meta_new = {
                            "url": url,
                            "domain": domain_new if flag_new is True else None,
                            "redirect_url": url,
                            "title": meta["title"],
                            "tags": meta["tags"],
                            "description": meta["description"],
                            "httpcode": 200

                        }

                        websiteId_new = save_collection_website(collection_website, meta_new)
                        if websiteId_new is not None and not test:
                            #screenshot_wesbite(collection_website, websiteId_new, screenshot_crawler)
                            pass

                        flag, domain = url_helper.get_domain(artifact["link"])
                        if domain_new != domain:  # 跳出原域名
                            set_artifact_active(artifact, "Redirect", source, sourceId)
                        else:
                            if flag is True:  # 这是个'好'地址
                                set_artifact_active(artifact, "Y", source, sourceId)
                            else:
                                if flag_new is True:  # 转跳后是个 '好'地址
                                    set_artifact_active(artifact, "Redirect", source, sourceId)
                                    save_website_artifact(meta_new, source, sourceId)
                                else:
                                    set_artifact_active(artifact, "Y", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                elif meta["httpcode"] == 404:
                    set_artifact_active(artifact, "N", source, sourceId)

        # verify -> source_artifacts/source_company_name set verify
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)

        round += 1
Ejemplo n.º 29
0
        logger.info("investor aggregator start")
        #get source_investors
        conn = db.connect_torndb()
        #Check verify or processStatus
        source_investors = conn.query(
            "select * from source_investor where processStatus=0 order by id")
        conn.close()

        for source_investor in source_investors:
            logger.info(source_investor["id"])
            #get Domain
            source_investor["domain"] = None
            if source_investor["website"] is not None:
                source_investor["website"] = url_helper.url_normalize(
                    source_investor["website"])
                type, market, website_domain = url_helper.get_market(
                    source_investor["website"])
                if type == 4010 and website_domain is not None:
                    source_investor["domain"] = website_domain

            if source_investor["investorId"] is not None:

                investor = find_in_investor("id",
                                            source_investor["investorId"])
                update_investor(investor, source_investor)
                set_processStatus(source_investor["id"])
                continue

            else:
                #name check
                name = source_investor["name"]
                if name is not None and name.strip != "":
Ejemplo n.º 30
0
def parse_artifact(item):
    """Extract website/app artifacts from a crawled company page.

    Args:
        item: Mapping whose "content" entry holds the page HTML, or None.

    Returns:
        None when ``item`` is None. Otherwise a list of dicts, one per
        accepted product entry, each with the keys "type", "name", "desc",
        "link" and "domain".
    """
    if item is None:
        return None

    artifacts = []
    d = pq(item["content"])

    logger.info("*** artifact ***")
    for li in d('ul.list-prod> li> div.on-edit-hide'):
        l = pq(li)

        # Only entries tagged as a website or an app are collected.
        # `or ""` guards against text() handing back None for a missing tag.
        strtype = (l('h4> span.tag').text() or "").strip()
        if strtype != u"网站" and strtype != "app":
            continue

        # attr() returns None when the anchor or its href is absent; treat
        # that the same as an empty link instead of crashing on .strip().
        link = (l('h4> b> a').attr("href") or "").strip()
        if link == "":
            continue

        # Classify the link once; the result is used by both branches below.
        artifact_type, app_market, app_id = url_helper.get_market(link)
        domain = None

        if strtype == u"网站" and artifact_type == 4010:
            # Website entry pointing at a plain website link (type 4010).
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is None:
                continue
            if flag is False:
                # Keep the artifact but do not record a domain for it.
                domain = None
        else:
            if artifact_type == 4040:
                domain = app_id
            elif artifact_type == 4050:
                # For markets 16010/16020 the apk name has to be looked up
                # from the stored android market record; for the other
                # markets app_id is used directly.
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_db_util.find_android_market(app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id
            # Without a domain only types 4020 and 4030 are kept.
            if domain is None and artifact_type != 4030 and artifact_type != 4020:
                continue

        name = (l('h4> b').text() or "").strip()
        desc = (l('p').text() or "").strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s",
                    artifact_type, name, link, desc)
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": domain
        })

    logger.info("")
    return artifacts