def insert(shortname, name, brief, website):
    """Save a company, its two names and (when applicable) its website artifact.

    Args:
        shortname: Short name; saved with name code 12020 and used as the
            artifact name.
        name: Full name; the literal "(开业)" marker is removed before it is
            hashed into the source id and saved with name code 12010.
        brief: Brief text handed to ``save_company_yitai``.
        website: Raw website string. ``None`` or blank means no artifact is
            saved.

    Returns:
        None.
    """
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId, brief)
    logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, name, 12010)
    parser_db_util.save_source_company_name(sid, shortname, 12020)

    if website is None or website.strip() == "":
        return

    website = url_helper.url_normalize(website)
    if website is None or website == "":
        return

    # str.find returns -1 (truthy) when the needle is absent, so it must be
    # compared explicitly rather than used as a boolean.
    if website.find("http://") == -1 and website.find("https://") == -1:
        website = "http://" + website

    link_type, market, app_id = url_helper.get_market(website)
    if link_type != 4010:
        return

    # Links containing 'sse.com' (past the first character) are skipped.
    if website.find('sse.com') > 0:
        return

    artifact = {
        "sourceCompanyId": sid,
        "name": shortname,
        "description": None,
        "link": website,
        "domain": app_id,
        "type": link_type,
    }
    parser_db_util.save_artifacts_standard(sid, [artifact])
def find_company_by_artifact(website):
    """Look up the company that owns a website artifact.

    The exact link is tried first, then the domain reported by
    ``url_helper.get_market``. Only artifacts of type 4010 joined to a company
    whose ``active`` is null or not 'N' are considered.

    Args:
        website: Website URL to look up.

    Returns:
        The ``companyId`` of the first matching artifact, or ``None`` when the
        link is not of type 4010, has no domain, or nothing matches.
    """
    link_type, market, website_domain = url_helper.get_market(website)
    if link_type != 4010 or website_domain is None:
        return None

    # One connection for both lookups, always released even if a query fails.
    conn = db.connect_torndb()
    try:
        artifact = conn.get(
            "select a.* from artifact a join company c on c.id=a.companyId "
            "where (c.active is null or c.active !='N') and a.type=%s and a.link=%s limit 1",
            4010, website)
        if artifact is not None:
            logger.info("find_company_by_artifact 1, %s, %s", artifact["type"], artifact["link"])
            return artifact["companyId"]

        artifact = conn.get(
            "select a.* from artifact a join company c on c.id=a.companyId "
            "where (c.active is null or c.active !='N') and a.type=%s and a.domain=%s limit 1",
            4010, website_domain)
        if artifact is not None:
            logger.info("find_company_by_artifact 2, %s, %s", artifact["type"], artifact["domain"])
            return artifact["companyId"]
    finally:
        conn.close()

    return None
def find_itunesId(itunesId, companyId):
    """Tell whether a company already has an artifact for the given iTunes id.

    Args:
        itunesId: Track id to look for; compared with ``==`` against the id
            derived from each artifact.
        companyId: Company whose type-4040 artifacts are inspected.

    Returns:
        True if any artifact resolves to ``itunesId``, otherwise False.
    """
    conn = db.connect_torndb()
    try:
        artifacts = conn.query(
            "select * from artifact where companyId=%s and type=4040", companyId)
    finally:
        conn.close()

    # Check if itunesId already exists among the artifacts.
    for artifact in artifacts:
        trackid = None
        if artifact["domain"] is None:
            # No stored domain: derive the track id from the link itself.
            (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
            if apptype != 4040:
                continue
        else:
            try:
                trackid = int(artifact["domain"])
            except (TypeError, ValueError):
                # Non-numeric domain: leave trackid as None so it cannot match.
                pass

        if trackid == itunesId:
            return True

    return False
def find_androidAppname(androidApk, source, sourceId):
    """Report whether an apk name is already among a source company's artifacts.

    A missing or blank ``androidApk`` is reported as True without any lookup.
    Otherwise every stored artifact of type 4050 is resolved to an apk name
    (its ``domain`` when set, else derived from its link) and compared.
    """
    if androidApk is None or androidApk.strip() == "":
        return True

    stored = find_mongo_data(collection_source_company, "source_artifact", source, sourceId, nonexpand=False)

    for record in stored:
        if record["type"] != 4050:
            continue

        if record["domain"] is not None:
            candidate = record["domain"]
        else:
            apptype, appmarket, appid = url_helper.get_market(record["link"])
            if apptype != 4050:
                continue
            if appmarket in (16010, 16020):
                # Markets 16010/16020: look the apk name up in the android
                # market collection by the id found in the link.
                market_doc = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                candidate = market_doc["apkname"] if market_doc else None
            else:
                candidate = appid

        if candidate == androidApk:
            return True

    return False
def aggregate(item, tt=1):
    """Match one item against existing companies and report how it matched.

    Args:
        item: Dict with ``item["data"]["btb"]`` (reads ``bz_code``, ``gw``) and
            ``item["data"]["basic"]`` (reads ``company``, ``product``,
            ``gw_link``).
        tt: Matching mode. 1 tries full name, then the ``gw_link`` artifact,
            then short name / code. Any other value only tests the ``gw``
            artifact and never returns ids.

    Returns:
        ``(flag, company_ids)``. flag is 0 when ``bz_code`` is missing or
        blank; for ``tt == 1`` it is 2 (full name), 3 (artifact), 4 (reference)
        or 5 (nothing found); otherwise 4 (artifact found) or 5.
    """
    btb = item["data"]["btb"]
    basic = item["data"]["basic"]

    if btb["bz_code"] is None or btb["bz_code"].strip() == "":
        return 0, []

    fullName = basic["company"]
    shortName = basic["product"]

    website = basic["gw_link"]
    link_type, market, app_id = url_helper.get_market(website)
    artifact = {"link": website, "domain": app_id, "type": link_type}

    website1 = btb["gw"]
    link_type, market, app_id = url_helper.get_market(website1)
    # The link must be website1: domain and type above were derived from it.
    artifact1 = {"link": website1, "domain": app_id, "type": link_type}

    if tt == 1:
        # Each finder is called once and only if the previous one found nothing.
        company_ids = find_companies_by_full_name_corporate([fullName])
        if len(company_ids) > 0:
            logger.info("%s found fullName by %s", btb["bz_code"], fullName)
            return 2, company_ids

        company_ids = find_companies_by_artifacts([artifact])
        if len(company_ids) > 0:
            logger.info("%s found artifact by %s", btb["bz_code"], website)
            return 3, company_ids

        company_ids = find_reference([shortName, btb["bz_code"]])
        if len(company_ids) > 0:
            logger.info("%s found shortName by %s", btb["bz_code"], shortName)
            return 4, company_ids

        return 5, []

    # Other modes only report whether the artifact is known; no ids are returned.
    if len(find_companies_by_artifacts([artifact1])) > 0:
        logger.info("%s found artifact by %s", btb["bz_code"], website1)
        return 4, []
    return 5, []
def update_domain_website():
    """Fill in ``investor.domain`` for rows that have a website but no domain.

    Selects investors whose ``active`` is 'Y' or null, whose ``domain`` is null
    and whose ``website`` is not null. Each row whose website yields a domain
    through ``url_helper.get_market`` is updated.
    """
    conn = db.connect_torndb()
    try:
        arts = conn.query("select * from investor where (active ='Y' or active is null) and "
                          "domain is null and website is not null")
        for art in arts:
            (linktype, appmarket, domain) = url_helper.get_market(art["website"])
            if domain is not None:
                sql = "update investor set domain=%s where id=%s"
                conn.update(sql, domain, art["id"])
    finally:
        # Release the connection even when a query or update raises.
        conn.close()
def update_domain_artifact():
    """Backfill the domain of artifacts that do not have one yet.

    Rows come from ``artifact`` where ``active`` is 'Y' or null and ``domain``
    is null. Types 4010/4040/4050 take the domain reported by
    ``url_helper.get_market``; type 4020 uses its non-blank link as the domain.
    """
    link_derived_types = (4010, 4040, 4050)

    conn = db.connect_torndb_proxy()
    rows = conn.query(
        "select * from artifact where (active ='Y' or active is null) and domain is null"
    )

    for row in rows:
        kind = row["type"]
        if kind in link_derived_types:
            _linktype, _appmarket, domain = url_helper.get_market(row["link"])
            if domain is not None:
                update_domain(domain, row["id"])
        elif kind == 4020:
            link = row["link"]
            if link is not None and link.strip() != "":
                update_domain(link, row["id"])

    conn.close()
def parse_artifact(source_company_id, r):
    """Build the website artifact list for one source company record.

    Args:
        source_company_id: Id stored as ``sourceCompanyId`` on the artifact.
        r: Record dict; reads ``r['website']`` and ``r['name']``.

    Returns:
        A list with one artifact dict when the website is non-blank and
        ``url_helper.get_market`` classifies it as type 4010, else ``[]``.
    """
    website = r['website']
    # Validate before touching the value: the None test has to come first,
    # and there is nothing to classify for a missing/blank website.
    if website is None or website.strip() == '':
        return []

    link_type, market, app_id = url_helper.get_market(website)
    if link_type != 4010:
        return []

    return [{
        "sourceCompanyId": source_company_id,
        "name": r["name"],
        "description": None,
        "link": website,
        "domain": app_id,
        "type": link_type,
    }]
def find_androidAppname(androidApk, companyId):
    """Tell whether a company already has an artifact for the given apk name.

    Args:
        androidApk: Apk name to look for. ``None`` or blank returns True
            without opening any connection.
        companyId: Company whose type-4050 artifacts are inspected.

    Returns:
        True if ``androidApk`` is blank or matches an artifact, else False.
    """
    # Cheap input check first so no connection is opened needlessly.
    if androidApk is None or androidApk.strip() == "":
        return True

    conn = db.connect_torndb()
    try:
        artifacts = conn.query(
            "select * from artifact where companyId=%s and type=4050", companyId)
    finally:
        conn.close()

    mongo = db.connect_mongo()
    try:
        collection_android_market = mongo.market.android_market

        # Check if the apk name already exists among the artifacts.
        for artifact in artifacts:
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                if apptype != 4050:
                    continue
                # Markets 16010/16020: resolve the apk name through the
                # android market collection using the id from the link.
                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({
                        "appmarket": appmarket,
                        "key_int": appid
                    })
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]

            if apkname == androidApk:
                return True

        return False
    finally:
        # Single release point for every exit path, including exceptions.
        mongo.close()
def parse_base(item):
    """Turn a crawled item into a source-company dict.

    Returns None for a None item. Otherwise reads ``item["key"]`` and the
    ``name``, ``desc``, ``website`` and ``score`` entries of
    ``item["content"]``. The normalized website becomes an artifact when it is
    of type 4010, or of type 4040/4050 with a truthy app id.
    """
    if item is None:
        return None

    source_id = item["key"]
    payload = item["content"]

    link = url_helper.url_normalize(payload["website"])
    kind, app_market, app_id = url_helper.get_market(link)

    artifacts = []
    if kind == 4010 or (kind in (4040, 4050) and app_id):
        artifacts.append({
            "type": kind,
            "name": payload["name"],
            "desc": payload["desc"],
            "link": link,
            "domain": app_id
        })

    result = {
        "shortName": payload["name"],
        "fullName": None,
        "productName": payload["name"],
        "description": None,
        "brief": payload["desc"],
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": source_id,
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": payload["score"],
        "artifacts": artifacts
    }
    return result
def parse_artifact(source_company_id,item):
    """Build the artifact list for the website found in ``item["baseinfo"]``.

    Args:
        source_company_id: Id stored as ``sourceCompanyId`` on each artifact.
        item: Dict whose ``baseinfo`` entry provides ``website`` (optional,
            may be None) and ``name``.

    Returns:
        A list with at most one artifact: a type-4010 website (unless the link
        contains 'neeq' past its first character), or a type-4040/4050 app
        whose domain ``get_android_domain`` could resolve.
    """
    logger.info("parse_artifact")
    c = item["baseinfo"]
    artifacts = []

    # `or ""` also covers a key that is present with an explicit None value.
    website = (c.get("website") or "").strip()
    website = url_helper.url_normalize(website)
    if website is None or website == "":
        return artifacts

    # str.find returns -1 (truthy) when absent, so compare explicitly.
    if website.find("http://") == -1 and website.find("https://") == -1:
        website = "http://"+website

    link_type, market, app_id = url_helper.get_market(website)
    if link_type == 4010:
        if website.find('neeq') > 0:
            pass
        else:
            artifacts.append({
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": website,
                "domain": app_id,
                "type": link_type
            })
    elif (link_type == 4040 or link_type == 4050) and app_id is not None:
        domain = get_android_domain(market, app_id)
        if domain is not None:
            artifacts.append({
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": website,
                "domain": domain,
                "type": link_type
            })

    return artifacts
def find_itunesId(itunesId, source, sourceId):
    """Tell whether a source company already has an artifact for an iTunes id.

    Args:
        itunesId: Track id to look for; compared with ``==``.
        source: Source code passed to ``find_mongo_data``.
        sourceId: Source company id passed to ``find_mongo_data``.

    Returns:
        True if a type-4040 artifact resolves to ``itunesId``, else False.
    """
    artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId, nonexpand=False)

    # Check if itunesId already exists among the artifacts.
    for artifact in artifacts:
        if artifact["type"] != 4040:
            continue

        # Get the track id: from the link when no domain is stored,
        # otherwise from the stored domain.
        trackid = None
        if artifact["domain"] is None:
            (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
            if apptype != 4040:
                continue
        else:
            try:
                trackid = int(artifact["domain"])
            except (TypeError, ValueError):
                # Non-numeric domain: leave trackid as None so it cannot match.
                pass

        if trackid == itunesId:
            return True

    return False
def process():
    """Parse every pending item and persist the company, score and artifacts.

    For each item returned by ``parser_db_util.find_process`` the company is
    saved, its score recorded, its artifacts re-classified through
    ``url_helper.get_market`` and saved, and the item marked as processed.
    Items for which ``parse_base`` returns None are skipped entirely.
    """
    logger.info("itjuzi_next_parser begin...")

    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["url"])

        parsed = parse_base(item)
        if parsed is None:
            continue

        source_company_id = parser_db_util.save_company(
            parsed, SOURCE, download_crawler)
        logger.info("source_company_id=%s", source_company_id)
        parser_db_util.save_company_score(source_company_id, parsed["score"])

        kept = []
        for candidate in parsed["artifacts"]:
            kind, app_market, app_id = url_helper.get_market(candidate["link"])
            if kind is None:
                continue
            # App links (4040/4050) are only kept when an app id was found.
            if kind in (4040, 4050) and app_id is None:
                continue
            candidate["type"] = kind
            candidate["domain"] = app_id
            kept.append(candidate)

        parser_db_util.save_artifacts(source_company_id, kept)
        parser_db_util.update_processed(item["_id"])

    logger.info("itjuzi_next_parser end.")
def find(artifact):
    """Fetch the stored app document that an artifact link points at.

    The link is classified with ``url_helper.get_market``. Anything other than
    type 4040/4050 yields None. For markets 16010/16020 the apk name is
    resolved through the android market collection; for other markets the app
    id is used as the apk name. Type 4040 is then looked up in the iTunes
    collection by ``trackId``, type 4050 in the android collection by
    ``apkname``. None is returned when nothing can be resolved or found.
    """
    apptype, appmarket, appid = url_helper.get_market(artifact["link"])
    if apptype not in [4040, 4050]:
        return None

    if appmarket in (16010, 16020):
        market_doc = collection_android_market.find_one({
            "appmarket": appmarket,
            "key_int": appid
        })
        apkname = market_doc["apkname"] if market_doc else None
    else:
        apkname = appid

    if apkname is None:
        return None

    if apptype == 4040:
        return collection_itunes.find_one({"trackId": appid})
    return collection_android.find_one({"apkname": apkname})
# Match every record in `bcs` to companies through its '|'-separated websites,
# counting the outcomes in num0..num3 (all initialised outside this fragment).
for bc in bcs:
    companyIds = []
    num0 += 1  # records seen
    websitestr = bc["websites"]
    if websitestr is None:
        num1 += 1  # records without any website
        continue
    else:
        # companyIds = []
        websites = websitestr.split("|")
        for website in websites:
            tp, market, app_id = url_helper.get_market(website)
            # logger.info("%s-%s", type(tp),tp)
            artifact = {"link": website, "domain": app_id, "type": tp}
            # Collect matching company ids without duplicates, keeping order.
            for id in find_companies_by_artifacts([artifact]):
                if id not in companyIds:
                    companyIds.append(id)
            # companyIds.extend(find_companies_by_artifacts([artifact]))
    if len(companyIds) == 0:
        num2 += 1  # no company matched
    elif len(companyIds) > 1:
        num3 += 1  # ambiguous: more than one company matched
    else:
        # Exactly one match.
        # NOTE(review): num4 is logged but not updated in this fragment - confirm.
        logger.info("%s matched company: %s %s", bc["symbol"], companyIds, num4)
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    Args:
        item: Dict with ``key`` (source id), ``content`` (page HTML) and
            ``url`` (page URL, compared against the company website).

    Returns:
        ``{"status": "No_Name"}`` when no company name is found,
        ``{"status": "No_Data"}`` when the number of section headers does not
        equal the number of main blocks minus one, otherwise the full
        source-company dict (``"status": 1``) including ``artifacts`` and
        ``fundings``.
    """
    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    logo = d('.logo-block > img').attr('src')
    # This URL is treated as "no logo".
    if logo == 'http://assets3.chuangyepu.com/chuangyepu/images/big-screenshot.png':
        logo = None

    basic_info = d('div.col-md-9> div> table> tr> td').eq(1)
    name = pq(basic_info)('div.name').text().strip()
    brief = pq(basic_info)('div.desc').eq(0).text().strip()
    # After .strip() the name can never be None, so test for emptiness.
    if not name:
        return {
            "status": "No_Name",
        }

    try:
        website = pq(basic_info)('div.desc').eq(1)('a').text().strip()
    except Exception:
        website = None

    # Parsed but not stored: the result below sets "tags" to None.
    tags = pq(basic_info)('div.line-block').text().strip().replace(" ", ",")

    main_blocks = d('div.col-md-9> div.col-sm-12')
    h4s = d('div.col-md-9> h4')
    logger.info("main: %d, h4: %d", len(main_blocks), len(h4s))

    # Sections: product intro / team members / media coverage / funding history.
    # Header i is paired with main block i + 1, so the counts must line up.
    if len(h4s) != len(main_blocks) - 1:
        return {
            "status": "No_Data",
        }

    desc = None
    latest_round = None
    roundDesc = None
    source_fundings = []

    for i in range(len(h4s)):
        h4 = h4s.eq(i).text().strip()
        block = main_blocks.eq(i + 1)

        # Description
        if h4 == "产品介绍":
            desc = block('div.content> div> p.desc').text().strip()

        # Funding history
        if h4 == "融资历史":
            for li in block('table> tr'):
                line = pq(li)
                # Skip the table header row.
                if line.text().find("时间") >= 0:
                    continue

                # The page gives year/month only; pin the day to the 1st.
                date = line('td.investment_date> span').text().strip() + "/01"
                try:
                    fundingDate = datetime.datetime.strptime(date, '%Y/%m/%d')
                except ValueError:
                    fundingDate = None

                roundStr = line('td.investment-round').text().strip()
                fundingRound, roundStr = chuangyepu_helper.getFundingRound(
                    roundStr)

                moneyStr = line('td.money').text().strip()
                (currency, investment,
                 precise) = chuangyepu_helper.getMoney(moneyStr)

                investors = []
                for f in line('td').eq(3)('p> a'):
                    iv = pq(f)
                    investor_url = iv.attr("href")
                    investor_name = iv.text().strip()
                    # Only links containing "institutions" are kept.
                    if investor_name is not None and investor_url is not None and investor_url != "" and investor_url.find(
                            "institutions") >= 0:
                        investor_key = investor_url.strip().split("/")[-1]
                        investors.append({"name": investor_name, "key": investor_key})

                source_fundings.append({
                    "investment": investment,
                    "precise": precise,
                    "round": fundingRound,
                    "roundDesc": roundStr,
                    "currency": currency,
                    "fundingDate": fundingDate,
                    "investors": investors
                })

                # Track the highest round seen as the company's round.
                if latest_round is None or latest_round < fundingRound:
                    latest_round = fundingRound
                    roundDesc = roundStr

        # "团队成员" (team members: not accurate) and "媒体报道" (media
        # coverage) sections are intentionally ignored.

    artifacts = []
    if desc is None:
        desc = brief

    # Drop briefs longer than 200 characters; decode only real byte strings
    # so text that is already decoded is measured as-is.
    if brief is not None:
        brief_text = brief.decode('utf-8') if isinstance(brief, bytes) else brief
        if len(brief_text) > 200:
            brief = None

    link_type, app_market, app_id = url_helper.get_market(website)
    if link_type == 4010:
        if item["url"] != website:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif link_type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif link_type == 4050:
        domain = None
        # Markets 16010/16020: resolve the apk name through the stored
        # android market record; other markets use the app id directly.
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })

    source_company = {
        "name": name,
        "fullName": None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": latest_round,
        "roundDesc": roundDesc,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": None,
        "address": None,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": None,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,
        "fundings": source_fundings,
        "status": 1
    }

    return source_company
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    Args:
        item: None, or a dict with ``key`` (source id), ``content`` (UTF-8
            encoded page HTML) and ``url`` (page URL).

    Returns:
        None for a None item; ``{"status": "No_Name"}`` when the short or full
        name is missing or ``name_helper.name_check`` does not confirm the
        full name; otherwise the source-company dict (``"status": 1``) with
        ``artifacts`` and ``members``.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))

    name = d('h1.name').text().strip()
    fullName = d('div.company-business> h4').text()
    # When the header contains "来源", keep only the last space-separated token.
    if fullName.find("来源")>=0:
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)

    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }

    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }

    logo = d('div.company-logo> img').attr('src')
    # Keep the logo only when it looks absolute or mentions "default";
    # a missing src attribute is handled instead of raising.
    if logo is None or not (logo.startswith("http") or logo.find("default") >= 0):
        logo = None

    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)
    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:',"").replace("收起","").replace("展开","").replace(" ","").strip()

    field = ''
    stage = ''
    headCount = ''
    address = ''
    try:
        # Expected as "<stage> <headcount> <field>", space separated.
        lll = d('div.info-primary> p').text().strip()
        if len(lll.split(" ")) == 3:
            field = lll.split(" ")[2]
            stage = lll.split(" ")[0]
            headCount = lll.split(" ")[1]
    except Exception:
        pass

    headCount = headCount.replace("人", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        # Both bounds are stored as ints so every branch yields the same
        # type; anything non-numeric yields None/None.
        try:
            min_staff = int(staffarr[0].strip())
            max_staff = int(staffarr[1].strip()) if len(staffarr) > 1 else None
        except ValueError:
            min_staff = None
            max_staff = None

    # Kept as an if/elif chain of == comparisons on purpose.
    # NOTE(review): the mapped stage is not written to the result ("stage"
    # is stored as 0 below) - confirm whether that is intended.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    links = d('div.company-products> ul> li> div.text> div.name> a')
    artifacts = []
    for linkp in links:
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)

        link_type, app_market, app_id = url_helper.get_market(website)
        if link_type == 4010:
            # Skip links back to this page or to the zhipin site itself.
            if item["url"] != website and website.find("zhipin") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": None,
                        "link": website,
                        "domain": domain
                    })
        elif link_type == 4020 or link_type == 4030:
            # No domain can be derived for these types, so nothing is added.
            pass
        elif link_type == 4040:
            domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4040,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif link_type == 4050:
            domain = None
            # Markets 16010/16020: resolve the apk name through the stored
            # android market record; other markets use the app id directly.
            if app_market == 16010 or app_market == 16020:
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4050,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })

    # Members
    members = []
    lis = d('div.manager-list> div> ul >li> div')
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('div.info-user> img').attr('src')
                # Scheme-less photo URLs get "http:" prepended.
                if not logo_url.startswith("http"):
                    logo_url = "http:" + logo_url
                member_name = mem('p> span.name').text()
                member_position = mem('p> span.job-title').text()
                member_desc = mem('div.item_manager_content').text()
                source_member = {'name': member_name,
                                 'photo_url': logo_url,
                                 'weibo': None,
                                 'location': None,
                                 'role': member_position,
                                 'description': member_desc,
                                 'education': None,
                                 'work': None
                                 }
                members.append(source_member)
            except Exception:
                # Best effort: a member entry that cannot be parsed
                # (e.g. no photo) is skipped.
                pass

    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >=0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html","")
    else:
        sourceId2 = None

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(0),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceId2": sourceId2,
        "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }

    return source_company
def parse_artifact(source_company_id, item):
    """Build the artifact list for the website in a company's overview fields.

    Args:
        source_company_id: Accepted for interface compatibility; not used.
        item: Dict providing ``name`` and, under
            ``content.company_base``, ``properties`` (optional
            ``short_description``) and ``overview_fields2`` (optional
            ``website`` with a ``value``).

    Returns:
        A list with at most one artifact dict (type 4010, 4040 or 4050).
        Websites containing 'twitter', 'linkedin' or 'facebook' are ignored.
    """
    name = item['name']
    logger.info('parse_artifact:%s' % name)
    artifacts = []

    desc = ''
    descs = item['content']['company_base']['properties']
    # `in` replaces dict.has_key, which only exists on Python 2.
    if 'short_description' in descs:
        desc = descs['short_description']

    of = item['content']['company_base']['overview_fields2']
    if 'website' not in of:
        return artifacts

    website = url_helper.url_normalize(of['website']['value'])
    if website is None:
        return artifacts
    if website.find('twitter') != -1 or website.find('linkedin') != -1 or website.find('facebook') != -1:
        return artifacts

    link_type, app_market, app_id = url_helper.get_market(website)
    if link_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is not None:
            if flag is False:
                domain = None
            artifacts.append({
                "type": 4010,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif link_type == 4020 or link_type == 4030:
        # No domain can be derived for these types, so nothing is added.
        pass
    elif link_type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif link_type == 4050:
        domain = None
        # Markets 16010/16020: resolve the apk name through the stored
        # android market record; other markets use the app id directly.
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })

    return artifacts
def find_fof_alias(investorId):
    """Discover new aliases for a fof record through shared website domains.

    Collects the record's own domain plus the domains of its already linked
    AMAC managers, finds AMAC managers on those domains, and adds every
    manager name that is not yet a known alias via
    ``add_fof_alias_from_amac``.

    Args:
        investorId: Id of the ``fof`` row to process.

    Returns:
        None. Returns early (after logging) when the row is not active.
    """
    conn = db.connect_torndb()
    try:
        investor = conn.get("select * from fof where (active is null or active='Y') and id=%s", investorId)
        if investor is None:
            logger.info("investor :%s is not available", investorId)
            return

        aliases = conn.query("select * from fof_alias where (active is null or active='Y') and "
                             "(verify is null or verify !='N') and fofId=%s", investorId)
        aliases_amac = conn.query("select iaa.* from fof_alias_amac iaa join fof_alias ia on "
                                  "iaa.fofAliasId = ia.id where (ia.active is null or ia.active='Y') and "
                                  "(ia.verify is null or ia.verify !='N') and ia.fofId=%s", investorId)

        # AMAC links of type 'M' that carry an amacId.
        manager_links = [alias for alias in aliases_amac
                         if alias["amacId"] is not None and alias["amacType"] == 'M']
        managerIds = [alias["amacId"] for alias in manager_links]
        managerIds_mysql = [alias["fofAliasId"] for alias in manager_links]

        names = [alias["name"] for alias in aliases
                 if alias["name"] is not None and alias["type"] == 12010]
        manager_names = [alias["name"] for alias in aliases
                         if alias["name"] is not None and alias["type"] == 12010
                         and alias["id"] in managerIds_mysql]

        logger.info("managerId: %s", ";".join(managerIds))
        logger.info("names: %s", ";".join(names))
        logger.info("manager_names: %s", ";".join(manager_names))

        # Find by domains
        InvestorDms = []
        if investor["website"] is not None and investor["website"].strip() != "":
            if investor["domain"] is not None and investor["domain"].strip() != "":
                InvestorDms.append(investor["domain"])
            else:
                link_type, market, website_domain = url_helper.get_market(investor["website"])
                if link_type == 4010 and website_domain is not None:
                    # NOTE(review): the row was read from `fof` but the domain
                    # is written to `investor` - confirm the target table.
                    conn.update("update investor set domain=%s where id=%s", website_domain, investorId)
                    InvestorDms.append(website_domain)
        logger.info("investor: %s has self domain: %s", investor["name"], ":".join(InvestorDms))

        if len(managerIds) > 0:
            amac_domains = get_websit_domains(managerIds)
            for amac_domain in amac_domains:
                if amac_domain not in InvestorDms:
                    InvestorDms.append(amac_domain)
        logger.info("investor: %s has total domain: %s", investor["name"], ":".join(InvestorDms))

        if len(InvestorDms) > 0:
            newMangers = find_amac_manager_by_domains(InvestorDms)
            logger.info("investor: %s has found %s amac managers by domain", investor["name"], len(newMangers))
            for newManger in newMangers:
                if newManger["managerName"] is None:
                    continue
                # A name is (re)added when it is not a known alias or is not
                # yet linked as a manager alias.
                if newManger["managerName"] not in names or newManger["managerName"] not in manager_names:
                    logger.info("investor: %s has a new alias: %s", investor["name"], newManger["managerName"])
                    add_fof_alias_from_amac(investorId, newManger["managerName"], str(newManger["_id"]), addFund=True)
                    names.append(newManger["managerName"])
                else:
                    logger.info("investor: %s already has alias: %s", investor["name"], newManger["managerName"])
    finally:
        # Release the connection on every exit path, including the normal one.
        conn.close()
def update_investor(investor,source_investor):
    """Merge a source investor into an investor row and its child tables.

    Investors whose ``online`` is 'Y' are left untouched (after a one second
    pause). Otherwise ``replace`` is applied and missing aliases, artifacts
    (website / weibo / wechat), contacts and members are inserted.

    Args:
        investor: Target investor row; reads ``id``, ``name``, ``online``.
        source_investor: Source row; reads ``id``, the four name fields,
            ``website`` and, optionally, ``weibo`` and ``wechatId``.

    Returns:
        None.
    """
    conn = db.connect_torndb()
    try:
        investor_id = investor["id"]
        logger.info("****checking %s/%s/%s", investor["name"], investor["id"], source_investor["id"])

        if investor["online"] is not None and investor["online"] == "Y":
            logger.info("online not update!!!")
            time.sleep(1)
            return

        logger.info("Update investor : %d with source_investor: %d ", investor_id, source_investor["id"])
        replace(investor, source_investor)

        # Insert investor_alias for every non-blank name that is missing.
        for name in [source_investor["name"], source_investor["fullName"],source_investor["enName"],
                     source_investor["enFullName"]]:
            if name is None or name.strip() == "":
                continue
            investor_alias = conn.get("select * from investor_alias where name=%s and "
                                      "investorId=%s and (active is null or active='Y') limit 1",
                                      name, investor["id"])
            if investor_alias is None:
                chinese, is_company = name_helper.name_check(name)
                # 12010 for names recognised as company names, else 12020.
                alias_type = 12010 if is_company else 12020
                sql = "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())"
                logger.info("Add new investor alias: %s for %s", name, investor["id"])
                conn.insert(sql, investor["id"], name, alias_type)

        # Collect investor_artifact candidates.
        artifacts = []
        if source_investor["website"] is not None and source_investor["website"] != "":
            link_type, market, app_id = url_helper.get_market(source_investor["website"])
            if link_type == 4010:
                # NOTE(review): skipped only when the link contains BOTH
                # '36kr' and 'baidu' - confirm that `or` was not intended.
                if source_investor["website"].find('36kr') > 0 and source_investor["website"].find("baidu") > 0:
                    pass
                else:
                    artifacts.append({
                        "investorId": investor["id"],
                        "name": investor["name"] ,
                        "description": None,
                        "link": source_investor["website"],
                        "domain": app_id,
                        "type": link_type
                    })
            elif (link_type == 4040 or link_type == 4050) and app_id is not None:
                domain = get_android_domain(market, app_id)
                if domain is not None:
                    artifacts.append({
                        "investorId": investor["id"],
                        "name": investor["name"] ,
                        "description": None,
                        "link": source_investor["website"],
                        "domain": domain,
                        "type": link_type
                    })

        weibo = source_investor.get("weibo", "")
        if weibo is not None and weibo.strip() != "" and weibo.find("weibo") >= 0:
            artifacts.append({
                "investorId": investor["id"],
                "name": investor["name"] ,
                "description": None,
                "link": weibo,
                "domain": None,
                "type": 4030
            })

        weixin = source_investor.get("wechatId", "")
        if weixin is not None and weixin.strip() != "":
            artifacts.append({
                "investorId": investor["id"],
                "name": investor["name"] ,
                "description": None,
                "link": weixin,
                "domain": weixin,
                "type": 4020
            })

        for art in artifacts:
            # Deduplicate by domain when there is one (never for 4030),
            # otherwise by link.
            if art["type"] not in [4030] and art["domain"] is not None and art["domain"].strip()!="":
                iart = conn.get("select * from investor_artifact where type=%s and investorId=%s and domain=%s limit 1",
                                art["type"], investor["id"], art["domain"])
            else:
                iart = conn.get("select * from investor_artifact where type=%s and investorId=%s and link=%s limit 1",
                                art["type"], investor["id"], art["link"])
            if iart is None:
                logger.info("add new artifact: %s/%s/%s", art["type"], art["name"], art["link"])
                sql = "insert investor_artifact(investorId,type, name, link, domain, createTime,modifyTime) " \
                      "values(%s,%s,%s,%s,%s,now(),now())"
                conn.insert(sql, investor["id"], art["type"], art["name"], art["link"], art["domain"])

        # Insert contacts: rows created by user 139 are replaced wholesale
        # whenever the source has any contact.
        contacts = conn.query("select * from source_investor_contact where sourceInvestorId=%s",
                              source_investor["id"])
        if len(contacts) >0:
            conn.execute("delete from investor_contact where investorId=%s and createUser=139", investor["id"])
            for s in contacts:
                sql = "insert investor_contact(investorId, locationId, address, phone, email, createUser, " \
                      "createTime,modifyTime) " \
                      "values(%s,%s,%s,%s,%s,%s,now(),now())"
                conn.insert(sql, investor["id"], s["locationId"], s["address"], s["phone"], s["email"], 139)

        # Insert members that are not present yet (matched by name).
        members = conn.query("select * from source_investor_member where sourceInvestorId=%s",
                             source_investor["id"])
        for m in members:
            member = conn.get("select * from investor_member where investorId=%s and name=%s limit 1",
                              investor["id"], m["name"])
            if member is not None:
                continue
            sql = "insert investor_member(investorId,name,logo, position, description,createUser,createTime,modifyTime) " \
                  "values(%s,%s,%s,%s,%s,%s,now(),now())"
            conn.insert(sql, investor["id"], m["name"], m["logo"], m["position"], m["description"], 139)
    finally:
        # Release the connection even when a query or insert raises.
        conn.close()
def parse_base(item):
    """Parse the basic company profile out of a crawled company page.

    ``item["key"]`` becomes the ``sourceId`` of the result and
    ``item["content"]`` is the page HTML that is queried with pyquery.

    Returns ``None`` when ``item`` is ``None`` or when both the short name
    and the full company name come out empty.  Otherwise returns a dict with
    the names, description, brief, funding round, status, location id,
    establish date, logo, field/tags and the list of ``artifacts`` built
    from the links found on the page.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    # --- product name / short name ("product / short name" in the title) ---
    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if not product_name:
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    # --- full company name; the page uses placeholder texts for "unknown" ---
    placeholders = ("暂无", "暂未收录")
    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name in placeholders:
        company_name = ""
    if company_name.strip() == "":
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            # Best effort: keep the empty name when the fallback node is unusable.
            pass
    if company_name in placeholders:
        company_name = ""
    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return

    # --- establish date, rendered on the page as "<year>.<month>" ---
    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', text)
    if result is not None:
        (year, month) = result
        try:
            if int(month) > 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # The pattern also matches an empty year or month 0; treat those
            # as "no date" instead of aborting the whole parse.
            establish_date = None
    logger.info("establish date: %s" % establish_date)

    # --- location: try city, then province, then the company name itself ---
    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))
        found = parser_db_util.get_location(city)
        if found is None:
            found = parser_db_util.get_location(province)
        if found is not None:
            locationId = found["locationId"]
    if locationId == 0:
        loc1, loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            found = parser_db_util.get_location(loc1)
            if found is not None:
                locationId = found["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % text)
    # Both labels map to the same code in the original implementation.
    if text == "融资需求 · 需要融资" or text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)
    # str.find() returns -1 (truthy) when the marker is absent, so the result
    # must be compared explicitly; a bare truth test blanked every real brief.
    if brief.find("暂未收录") >= 0:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)
    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)
    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\
        replace("购买数据请联系", "").replace('*****@*****.**', "").replace("itjuzi是一家数据服务公司", "").strip()
    logger.info("********desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # --- artifacts from the link line ---
    # NOTE(review): the original runs three passes and the last two use the
    # same selector, so matches of that selector are appended twice.  Kept
    # as-is; confirm whether duplicates are filtered downstream.
    link_selectors = (
        'div.link-line> a',
        'div.link-line> span.weblink,span.webTink> a',
        'div.link-line> span.weblink,span.webTink> a',
    )
    placeholder_urls = ("http://%e6%9a%82%e6%97%a0", "http://tt")
    # Each link node is read twice: once via its href, once via its text.
    readers = (lambda node: node.attr("href"), lambda node: node.text())

    artifacts = []
    for selector in link_selectors:
        for wa in d(selector):
            webs = []
            for read in readers:
                try:
                    website = read(pq(wa)).strip()
                    if website in placeholder_urls:
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    webs.append(website)
                except Exception:
                    # Best effort: a node without href/text (None) or a URL
                    # that cannot be normalised is simply skipped.
                    pass

            for website in webs:
                artifact_type, app_market, app_id = url_helper.get_market(website)
                if artifact_type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })
                elif artifact_type == 4020:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })
                elif artifact_type == 4030:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })
                elif artifact_type == 4040:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": app_id
                        })
                elif artifact_type == 4050:
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        # For these two markets the domain is the apk name
                        # looked up from the stored market record.
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

    # --- funding round ---
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)
    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }
def parse_company(item):
    """Build a source-company record from a crawled company payload.

    Reads ``item["postdata"]["id"]`` (used as ``sourceId``),
    ``item["data"]["basic"]`` (company fields), the optional
    ``item["data"]["productinfos"]`` list and ``item["url"]``.

    Returns ``{"status": "No_Name"}`` when the product name or the full
    company name is missing; otherwise a dict with the company fields and the
    ``artifacts`` derived from the official/product links.
    """
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    # company basic info
    c = item["data"]["basic"]

    tags = c["tags"]
    tags_str = tags.replace("|", ",") if tags is not None else None

    logo = c["icon"]
    if logo is not None and logo.find("product_default.png") >= 0:
        logo = None

    establish_date = None
    if "open_time" in c:
        try:
            establish_date = datetime.datetime.strptime(c["open_time"], "%Y-%m-%d")
        except (TypeError, ValueError):
            # Missing or malformed date: leave it unset.
            pass

    # Resolve the location from the city first, then fall back to the province.
    location_id = 0
    for place in (c.get("city"), c.get("province")):
        if location_id != 0:
            break
        if place is None or place.strip() == "":
            continue
        location = parser_db_util.get_location(place)
        if location is not None:
            location_id = location["locationId"]

    fullName = c["company"]
    if fullName is None or fullName.strip() == "":
        fullName = None
    else:
        fullName = fullName.replace("_", "")
        # Drop anything that trails the last "公司" in the name.
        idx = fullName.rfind(u"公司")
        if idx != -1:
            fullName = fullName[:(idx + len(u"公司"))]
        fullName = name_helper.company_name_normalize(fullName)

    name = c["product"]
    # ``or ""`` keeps a null field from crashing on .strip().
    desc = (c.get("desc") or "").strip()
    brief = (c.get("yewu") or "").strip()

    if name is None or fullName is None:
        return {
            "status": "No_Name",
        }

    # Collect distinct, non-blank links in their original order.
    candidates = [c.get("gw_link"), c.get("source_gw_link")]
    for pi in (item["data"].get("productinfos") or []):
        candidates.append(pi.get("link"))
    websites = []
    for candidate in candidates:
        if candidate is not None and candidate.strip() != "" and candidate not in websites:
            websites.append(candidate)

    artifacts = []
    for website in websites:
        artifact_type, app_market, app_id = url_helper.get_market(website)
        domain = None
        if artifact_type == 4010:
            # Skip the crawled page itself and links back to qimingpian.com.
            if item["url"] == website or website.find("qimingpian.com") != -1:
                continue
            flag, domain = url_helper.get_domain(website)
            if flag is None:
                continue
            if flag is False:
                domain = None
        elif artifact_type == 4020 or artifact_type == 4030:
            # The original hard-codes the domain to None for these types and
            # then requires a non-None domain, so nothing is ever appended.
            continue
        elif artifact_type == 4040:
            domain = app_id
            if domain is None:
                continue
        elif artifact_type == 4050:
            if app_market == 16010 or app_market == 16020:
                android_app = parser_db_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is None:
                continue
        else:
            continue

        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": brief,
            "link": website,
            "domain": domain
        })

    return {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,
    }
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    ``item["key"]`` becomes ``sourceId``, ``item["content"]`` is the page HTML
    (it is ``.decode("utf-8")``-ed, so it is expected to be a byte string) and
    ``item["url"]`` is compared against the company website.

    Returns ``None`` for a ``None`` item, ``{"status": "No_Name"}`` when the
    page is a placeholder, the names are missing or fail
    ``name_helper.name_check``, and otherwise the full record with
    ``"status": 1``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = name_helper.company_name_normalize(title_link.attr('title'))

    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName) or name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None:
        # Protocol-relative or bare URLs get an explicit scheme.
        if not logo.startswith("http"):
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()

    desc = None
    desc_text = d('.company_intro_text').text()
    if not (u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10):
        content_html = d('.company_intro_text > .company_content').html()
        if content_html is not None:
            # Drop the "expand" toggle markup, then strip the remaining tags.
            content_html = content_html.replace('<span class="text_over">展开</span>', '')
            desc = BeautifulSoup(content_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d('#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d('#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d('#basic_container > .item_content >ul > li:eq(2) > span').text()
        location = d('#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception:
        # Best effort: keep whatever fields were read before the failure.
        pass

    # Cut the head-count text at the unit character when present.  Using
    # find() instead of index() keeps a missing unit from aborting the block
    # above and silently dropping location and address.
    cut = headCount.find(u'人')
    if cut >= 0:
        headCount = headCount[:cut]
    headCount = headCount.replace("people", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        try:
            min_staff = int(staffarr[0].strip())
            # A range yields both bounds as ints, like the other branches.
            max_staff = int(staffarr[1].strip()) if len(staffarr) > 1 else None
        except ValueError:
            min_staff = None
            max_staff = None

    # Only the "no financing needed" stage affects the output; the record
    # always carries "round": None.
    funding_type = 0
    if stage == '不需要融资':
        funding_type = 8010

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        # Ignore the crawled page itself and links back to lagou.com.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif artifact_type == 4040:
        if app_id is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": app_id
            })
    elif artifact_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    # Types 4020/4030 never produced an artifact in the original (the domain
    # was forced to None and then required to be non-None), so they fall
    # through here.

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }

    return source_company
logger = loghelper.get_logger("prepare_source_artifact_domain")


def main():
    """Back-fill the ``domain`` column of ``source_artifact`` rows.

    Walks the table in pages of 1000 rows ordered by id.  Rows that already
    have a non-blank domain are skipped.  Type 4010 rows get the domain
    returned by ``url_helper.get_domain`` when its flag is ``True``; type
    4040/4050 rows get the type and id returned by ``url_helper.get_market``.
    """
    start = 0
    conn = db.connect_torndb()
    try:
        while True:
            items = list(conn.query("select * from source_artifact order by id limit %s,1000", start))
            if len(items) == 0:
                break

            for item in items:
                if item["domain"] is not None and item["domain"].strip() != "":
                    continue

                if item["type"] == 4010:
                    link = url_helper.url_normalize(item["link"])
                    (flag, domain) = url_helper.get_domain(link)
                    if flag is True:
                        logger.info("%s, %s %s %s", item["id"], item["type"], link, domain)
                        conn.update("update source_artifact set domain=%s where id=%s", domain, item["id"])
                elif item["type"] == 4040 or item["type"] == 4050:
                    (apptype, appmarket, trackid) = url_helper.get_market(item["link"])
                    if (apptype == 4040 or apptype == 4050) and trackid is not None:
                        logger.info("%s %s %s %s", item["id"], apptype, item["link"], trackid)
                        # The detected type may differ from the stored one,
                        # so it is rewritten together with the domain.
                        conn.update("update source_artifact set type=%s, domain=%s where id=%s",
                                    apptype, trackid, item["id"])

            start += 1000
    finally:
        # Release the connection even when a query or helper raises.
        conn.close()


if __name__ == "__main__":
    main()
def parse_company(item):
    """Parse a crawled company page, including its managers, into a dict.

    ``item["key"]`` becomes ``sourceId`` (and is embedded in ``sourceUrl``),
    ``item["content"]`` is the page HTML (it is ``.decode("utf-8")``-ed, so it
    is expected to be a byte string) and ``item["url"]`` is compared against
    the company website.

    Returns ``None`` for a ``None`` item, ``{"status": "No_Name"}`` when the
    page is a placeholder, the names are missing or fail
    ``name_helper.name_check``, and otherwise the full record with
    ``"status": 1`` plus ``artifacts`` and ``members`` lists.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = name_helper.company_name_normalize(title_link.attr('title'))

    # Names containing "拉勾" are rejected, except for company key "147".
    if name is None or fullName is None or (name.find("拉勾") >= 0 and company_key != "147"):
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName) or name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None:
        # Protocol-relative or bare URLs get an explicit scheme.
        if not logo.startswith("http"):
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()

    desc = None
    desc_text = d('.company_intro_text').text()
    if not (u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10):
        content_html = d('.company_intro_text > .company_content').html()
        if content_html is not None:
            # Drop the "expand" toggle markup, then strip the remaining tags.
            content_html = content_html.replace('<span class="text_over">展开</span>', '')
            desc = BeautifulSoup(content_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d('#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d('#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d('#basic_container > .item_content >ul > li:eq(2) > span').text()
        location = d('#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception:
        # Best effort: keep whatever fields were read before the failure.
        pass

    # Cut the head-count text at the unit character when present.  Using
    # find() instead of index() keeps a missing unit from aborting the block
    # above and silently dropping location and address.
    cut = headCount.find(u'人')
    if cut >= 0:
        headCount = headCount[:cut]
    headCount = headCount.replace("people", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        try:
            min_staff = int(staffarr[0].strip())
            # A range yields both bounds as ints, like the other branches.
            max_staff = int(staffarr[1].strip()) if len(staffarr) > 1 else None
        except ValueError:
            min_staff = None
            max_staff = None

    # Only the "no financing needed" stage affects the output; the record
    # always carries "round": None.
    funding_type = 0
    if stage == '不需要融资':
        funding_type = 8010

    location_id = 0
    location_new = parser_mongo_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        # Ignore the crawled page itself and links back to lagou.com.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif artifact_type == 4040:
        if app_id is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": app_id
            })
    elif artifact_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    # Types 4020/4030 never produced an artifact in the original (the domain
    # was forced to None and then required to be non-None), so they fall
    # through here.

    # parse members
    members = []
    for li in d('.manager_list > li'):
        mem = pq(li)
        try:
            logo_url = mem('img').attr('src')
            if logo_url is None:
                # The original dropped such entries through a swallowed
                # AttributeError; keep that outcome, but explicitly.
                continue
            if not logo_url.startswith("http"):
                logo_url = "http:" + logo_url

            member_link = mem('p.item_manager_name > a').attr('href')
            weibo = None
            if member_link is not None and 'weibo.com' in member_link:
                weibo = member_link

            members.append({
                'name': mem('p.item_manager_name > span').text(),
                'photo_url': logo_url,
                'weibo': weibo,
                'location': None,
                'role': mem('p.item_manager_title').text(),
                'description': mem('div.item_manager_content').text(),
                'education': None,
                'work': None
            })
        except Exception:
            # Still best effort per member, but no longer silent.
            logger.exception("failed to parse a member of company %s", company_key)

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(location_id),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.lagou.com/gongsi/%s.html" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1
    }

    return source_company
def parse_artifact(item):
    """Collect artifact dicts from a crawled company record.

    Reads ``item["content"]["company_base"]["data"]["company"]`` and inspects
    its ``website``, ``weibo``, ``weixin``, ``iphoneAppstoreLink``,
    ``ipadAppstoreLink`` and ``androidLink`` fields.

    Returns a list of dicts with the keys ``sourceCompanyId`` (always
    ``None`` here), ``name``, ``description``, ``link``, ``domain`` and
    ``type``.
    """
    logger.info("parse_artifact")
    c = item["content"]["company_base"]["data"]["company"]

    artifacts = []

    def field_text(key):
        # ``or ""`` keeps a null field from crashing on .strip().
        return (c.get(key) or "").strip()

    def add(link, domain, artifact_type):
        artifacts.append({
            "sourceCompanyId": None,
            "name": c["name"],
            "description": None,
            "link": link,
            "domain": domain,
            "type": artifact_type
        })

    # website
    website = url_helper.url_normalize(field_text("website"))
    if website is not None and website != "":
        artifact_type, market, app_id = url_helper.get_market(website)
        if artifact_type == 4010:
            # A 36kr.com link is only kept when the company name contains "36".
            if not (website.find('36kr.com') > 0 and c["name"].find('36') == -1):
                add(website, app_id, artifact_type)
        elif artifact_type in (4040, 4050) and app_id is not None:
            domain = get_android_domain(market, app_id)
            if domain is not None:
                add(website, domain, artifact_type)

    weibo = field_text("weibo")
    if weibo != "":
        add(weibo, None, 4030)

    weixin = field_text("weixin")
    if weixin != "":
        add(weixin, None, 4020)

    # The three app-link fields are handled identically.
    for key in ("iphoneAppstoreLink", "ipadAppstoreLink", "androidLink"):
        app_link = field_text(key)
        if app_link == "":
            continue
        artifact_type, market, app_id = url_helper.get_market(app_link)
        domain = get_android_domain(market, app_id)
        if artifact_type in (4040, 4050) and domain is not None:
            add(app_link, domain, artifact_type)

    return artifacts
def parse_artifact(source_company_id, item):
    """Collect de-duplicated artifact dicts for one source company.

    Inspects both ``item["content"]["company_base"]["data"]`` and
    ``item["content"]["product"]["data"]["companyProduct"]``, reading the
    ``website``, ``weibo``, ``weixin``, ``ios`` and ``android`` fields of
    each.  A link that already produced an artifact is not used again.

    Returns a list of dicts with the keys ``sourceCompanyId``, ``name``,
    ``description``, ``link``, ``domain`` and ``type``.
    """
    logger.info("parse_artifact")
    cc = item["content"]["company_base"]["data"]
    cp = item["content"]["product"]["data"]["companyProduct"]

    artifacts = []
    seen_links = set()

    def field_text(record, key):
        # ``or ""`` keeps a null field from crashing on .strip().
        return (record.get(key) or "").strip()

    def add(record, link, domain, artifact_type):
        artifacts.append({
            "sourceCompanyId": source_company_id,
            "name": record["name"],
            "description": None,
            "link": link,
            "domain": domain,
            "type": artifact_type
        })
        seen_links.add(link)

    for c in [cc, cp]:
        # website
        website = url_helper.url_normalize(field_text(c, "website"))
        if website is not None and website != "" and website not in seen_links:
            artifact_type, market, app_id = url_helper.get_market(website)
            if artifact_type == 4010:
                # A 36kr.com link is only kept when the name contains "36".
                if not (website.find('36kr.com') > 0 and c["name"].find('36') == -1):
                    add(c, website, app_id, artifact_type)
            elif artifact_type in (4040, 4050) and app_id is not None:
                domain = get_android_domain(market, app_id)
                if domain is not None:
                    add(c, website, domain, artifact_type)

        weibo = field_text(c, "weibo")
        if weibo != "" and weibo.find("weibo") >= 0 and weibo not in seen_links:
            add(c, weibo, None, 4030)

        weixin = field_text(c, "weixin")
        if weixin != "" and weixin not in seen_links:
            # For this type the value is stored as both link and domain.
            add(c, weixin, weixin, 4020)

        # The two app-link fields are handled identically.
        for key in ("ios", "android"):
            app_link = field_text(c, key)
            if app_link == "" or app_link in seen_links:
                continue
            artifact_type, market, app_id = url_helper.get_market(app_link)
            domain = get_android_domain(market, app_id)
            if artifact_type in (4040, 4050) and domain is not None:
                add(c, app_link, domain, artifact_type)

    return artifacts
# Maximum number of expansion passes run for one source company.
_MAX_EXPAND_ROUNDS = 5


def _mark_source_company_expanded(sourcecompany):
    # Flag the source company document as expanded and stamp the modify time.
    collection_source_company.update_one(
        {"_id": sourcecompany["_id"]},
        {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})


def _register_fullname(sourcecompany, source, sourceId):
    # Make sure the normalized full name is listed among the source company names.
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is None or company_fullname.strip() == "":
        return
    company_fullname = name_helper.company_name_normalize(company_fullname)
    for scname in sourcecompany["source_company_name"]:
        if scname["name"] == company_fullname:
            return
    (chinese, company) = name_helper.name_check(company_fullname)
    save_mongo_source_company_name(source, sourceId, {
        "name": company_fullname,
        "chinese": "Y" if chinese is True else "N",
        "type": 12010,
    })


def _website_artifact_domain(artifact, normalize_link=False):
    # Return the usable domain of a website artifact, or None when there is none.
    if artifact["domain"] is None:
        link = artifact["link"]
        if normalize_link:
            link = url_helper.url_normalize(link)
        (flag, domain) = url_helper.get_domain(link)
        if flag is None or flag is False:
            return None
    else:
        domain = artifact["domain"]
    if domain is None or domain.strip() == "":
        return None
    return domain


def _save_beian_items(items, source, sourceId):
    # Persist what the beian records reveal: websites, organizer names, main beian numbers.
    save_beian_artifacts(items, source, sourceId)
    save_beian_company_names(items, source, sourceId)
    save_beian_mainbeianhaos(items, source, sourceId)


def _expand_by_beian(source_company_names, artifacts, main_beianhaos, source, sourceId,
                     beian_links_crawler, icp_chinaz_crawler, test):
    # Step A: look up beian (ICP filing) records by company name, domain and main beian number.

    # Step A/1: query beian records by company name.
    logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
    for source_company_name in source_company_names:
        name = source_company_name["name"]
        if name is None or name.strip() == "":
            continue

        # Only Chinese company names are looked up.
        chinese = source_company_name["chinese"]
        if chinese is None:
            # name_check() reports a boolean while stored flags are "Y"/"N";
            # convert it so that an unset flag does not always skip the name.
            (is_chinese, companyName) = name_helper.name_check(name)
            chinese = "Y" if is_chinese is True else "N"
        if chinese != "Y":
            continue

        # One company name may own several beian numbers
        # (e.g. 上海汇翼 -> today.ai / teambition.com); records already stored
        # in the beian collection are reused without crawling again.
        check_name = list(collection_beian.find({"organizer": name}))
        if len(check_name) == 0:
            if test:
                items_beianlinks = []
            else:
                items_beianlinks = beian_links_crawler.query_by_company_name(name)
                save_collection_beian(collection_beian, items_beianlinks)
        else:
            items_beianlinks = check_name
        _save_beian_items(items_beianlinks, source, sourceId)

    # Step A/2: query beian records by domain (website artifacts only).
    logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
    for artifact in artifacts:
        if artifact["type"] != 4010:
            continue
        domain = _website_artifact_domain(artifact, normalize_link=True)
        if domain is None:
            continue

        check_domain = list(collection_beian.find({"domain": domain}))
        if len(check_domain) == 0:
            if test:
                items_merge = []
            else:
                items_beianlinks = beian_links_crawler.query_by_domain(domain)
                items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)
        else:
            items_merge = check_domain

        # Filter by the queried domain to avoid the sinaapp.cn case.
        items_merge = filter_domain(items_merge, domain)
        _save_beian_items(items_merge, source, sourceId)

    # Step A/3: query beian records by main beian number.
    logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
    for main_beianhao in main_beianhaos:
        mainBeianhao = main_beianhao["mainBeianhao"]
        if collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao}) is None:
            if test:
                items_merge = []
            else:
                items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)
                # Remember the main beian number only when it yielded records.
                if len(items_merge) > 0:
                    save_collection_mainBeianhao(collection_main_beianhao,
                                                 [{"mainBeianhao": mainBeianhao}])
        else:
            items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))
        _save_beian_items(items_merge, source, sourceId)


def _expand_itunes_apps(apps, source, sourceId):
    # Save unseen iTunes apps; adopt the seller name when count_company_names() reports exactly one.
    lens_company_names = count_company_names(apps, "sellerName")
    company_name_status = check_source_company_name(source, sourceId)
    for app in apps:
        if find_itunesId(app["trackId"], source, sourceId):
            continue
        save_itunes_artifact(app, source, sourceId)
        if company_name_status or lens_company_names != 1:
            continue
        chinese, is_company = name_helper.name_check(app["sellerName"])
        if chinese and is_company:
            save_company_name(app, "sellerName", source, sourceId)
            company_name_status = True
        english, is_company = name_helper.english_name_check(app["sellerName"])
        if english and is_company:
            save_company_name(app, "sellerName", source, sourceId)
            company_name_status = True


def _expand_by_itunes(source_company_names, artifacts, source, sourceId):
    # Step B: refresh known iTunes artifacts and discover more by company name and domain.

    # Step B/1: refresh the iTunes artifacts already attached to the company.
    logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)
    itunes_company_enames = {}
    app_by_name = {}
    for artifact in artifacts:
        if artifact["type"] != 4040:
            continue

        trackid = None
        if artifact["domain"] is None:
            (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
            if apptype != 4040:
                continue
        else:
            try:
                trackid = int(artifact["domain"])
            except (TypeError, ValueError):
                # A non-numeric domain leaves trackid unset; the artifact is
                # marked inactive below.
                pass

        if trackid is None:
            set_artifact_active(artifact, "N", source, sourceId)
            continue

        app = collection_itunes.find_one({"trackId": trackid})
        if app is None:
            set_artifact_active(artifact, "N", source, sourceId)
            continue

        copy_from_itunes(app, artifact, source, sourceId)
        if "offline" in app and app["offline"] is True:
            set_artifact_active(artifact, "Offline", source, sourceId)
        else:
            set_artifact_active(artifact, "Y", source, sourceId)

        english, is_company = name_helper.english_name_check(app["sellerName"])
        if english and is_company:
            # Key by the actual seller name so that distinct names are counted.
            itunes_company_enames[app["sellerName"]] = 1
            app_by_name = app

    # Save the English seller name only when it is the single one found and
    # the company has no non-Chinese 12010 name yet.
    if len(itunes_company_enames) == 1:
        company_name = collection_source_company.find_one({
            "source": source,
            "sourceId": sourceId,
            "source_company_name": {"$elemMatch": {"type": 12010, "chinese": "N"}}})
        if company_name is None:
            save_company_name(app_by_name, "sellerName", source, sourceId)

    # Step B/2: find more iTunes artifacts by company (seller) name.
    logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
    for source_company_name in source_company_names:
        name = source_company_name["name"]
        if name is None or name.strip() == "":
            continue

        check_itunes_sellers = list(collection_itunes.find({"sellerName": name}))
        if len(check_itunes_sellers) == 0:
            continue

        lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
        artifact_status = check_source_artifact(source, sourceId)
        for app in check_itunes_sellers:
            if find_itunesId(app["trackId"], source, sourceId):
                continue
            save_itunes_artifact(app, source, sourceId)
            # The seller URL is saved as an artifact only while
            # check_source_artifact() / a previous save has not set the flag
            # and count_domains() reports exactly one.
            # (supportUrl expansion was disabled upstream: "incorrect expand".)
            if "sellerUrl" not in app or artifact_status:
                continue
            if lens_domain == 1:
                artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)
                if artifact_id is not None:
                    artifact_status = True

    # Step B/3: find more iTunes artifacts by website domain.
    logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
    for artifact in artifacts:
        if artifact["type"] != 4010:
            continue
        domain = _website_artifact_domain(artifact)
        if domain is None or domain in itunesDomainEx:
            continue

        check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
        if len(check_itunes_sellerDomains) > 0:
            _expand_itunes_apps(check_itunes_sellerDomains, source, sourceId)

        # Support domains shared by 100 or more apps are ignored.
        check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
        if 0 < len(check_itunes_supportDomains) < 100:
            _expand_itunes_apps(check_itunes_supportDomains, source, sourceId)


def _expand_android_apps(apps, source, sourceId):
    # Save unseen Android apps; adopt the author name when count_company_names() reports exactly one.
    lens_company_names = count_company_names(apps, "author")
    company_name_status = check_source_company_name(source, sourceId)
    for app in apps:
        if find_androidAppname(app["apkname"], source, sourceId):
            continue
        save_android_artifact(app, source, sourceId)
        if company_name_status or lens_company_names != 1:
            continue
        chinese, is_company = name_helper.name_check(app["author"])
        if is_company:
            save_company_name(app, "author", source, sourceId)
            company_name_status = True


def _expand_by_android(source_company_names, artifacts, source, sourceId):
    # Step C: refresh known Android artifacts and discover more by company name and domain.

    # Step C/1: refresh the Android artifacts already attached to the company.
    logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
    for artifact in artifacts:
        if artifact["type"] != 4050:
            continue

        apkname = None
        if artifact["domain"] is None:
            (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
            if apptype != 4050:
                continue
            # Markets 16010 / 16020 (baidu and 360 per the original comment)
            # need a lookup to translate the market key into an apkname.
            if appmarket == 16010 or appmarket == 16020:
                android_app = collection_android_market.find_one(
                    {"appmarket": appmarket, "key_int": appid})
                if android_app:
                    apkname = android_app["apkname"]
            else:
                apkname = appid
        else:
            apkname = artifact["domain"]

        if apkname is None:
            set_artifact_active(artifact, "N", source, sourceId)
            continue

        app = collection_android.find_one({"apkname": apkname})
        if app is None:
            set_artifact_active(artifact, "N", source, sourceId)
        else:
            copy_from_android(app, artifact, source, sourceId)
            set_artifact_active(artifact, "Y", source, sourceId)

    # Step C/2: find more Android artifacts by company (author) name.
    logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
    for source_company_name in source_company_names:
        name = source_company_name["name"]
        if name is None or name.strip() == "":
            continue

        check_android_authors = list(collection_android.find({"author": name}))
        # Authors with 200 or more apps are ignored.
        if not 0 < len(check_android_authors) < 200:
            continue

        lens_domain = count_domains(check_android_authors, "website")
        artifact_status = check_source_artifact(source, sourceId)
        for app in check_android_authors:
            if find_androidAppname(app["apkname"], source, sourceId):
                continue
            save_android_artifact(app, source, sourceId)
            if artifact_status:
                continue
            if lens_domain == 1:
                artifact_id = save_androidWebsite_artifact(app, source, sourceId)
                if artifact_id is not None:
                    artifact_status = True

    # Step C/3: find more Android artifacts by website domain.
    logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
    for artifact in artifacts:
        if artifact["type"] != 4010:
            continue
        domain = _website_artifact_domain(artifact)
        if domain is None:
            continue

        check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
        if len(check_android_websiteDomains) > 0:
            _expand_android_apps(check_android_websiteDomains, source, sourceId)

        # Threshold avoids over-broad matches such as domain "com.wowotuan".
        check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
        if 0 < len(check_android_apknameDomains) < 100:
            _expand_android_apps(check_android_apknameDomains, source, sourceId)


def _check_website_artifacts(artifacts, source, sourceId):
    # Fetch website meta information and mark each website artifact as active, dead or redirected.
    logger.info("source: %s, sourceId: %s website meta", source, sourceId)
    for artifact in artifacts:
        if artifact["type"] != 4010:
            continue
        if artifact["link"] is None or artifact["link"].strip() == "":
            set_artifact_active(artifact, "N", source, sourceId)
            continue

        url = artifact["link"].strip()
        meta = collection_website.find_one({"url": url})
        if meta is None or meta["httpcode"] == 404:
            # Not cached yet, or cached as dead: fetch it (again).
            meta = website.get_meta_info(url)
            if meta:
                # Screenshot capture was disabled (commented out) upstream.
                save_collection_website(collection_website, meta)
            else:
                meta = {
                    "url": artifact["link"],
                    "httpcode": 404
                }
                save_collection_website(collection_website, meta)
                set_artifact_active(artifact, "N", source, sourceId)

        if not meta:
            continue

        if meta["httpcode"] == 404:
            set_artifact_active(artifact, "N", source, sourceId)
            continue
        if meta["httpcode"] != 200:
            continue

        redirect_url = meta.get("redirect_url")
        if artifact["link"] == redirect_url:
            set_artifact_active(artifact, "Y", source, sourceId)
            continue

        # The site redirected: record the target as a website of its own.
        target_url = url_helper.url_normalize(meta["redirect_url"])
        (flag_new, domain_new) = url_helper.get_domain(target_url)
        meta_new = {
            "url": target_url,
            "domain": domain_new if flag_new is True else None,
            "redirect_url": target_url,
            "title": meta["title"],
            "tags": meta["tags"],
            "description": meta["description"],
            "httpcode": 200
        }
        save_collection_website(collection_website, meta_new)

        flag, domain = url_helper.get_domain(artifact["link"])
        if domain_new != domain:
            # Redirected out of the original domain.
            set_artifact_active(artifact, "Redirect", source, sourceId)
        elif flag is True:
            # The original link is a 'good' address.
            set_artifact_active(artifact, "Y", source, sourceId)
        elif flag_new is True:
            # Only the redirect target is a 'good' address: keep it instead.
            set_artifact_active(artifact, "Redirect", source, sourceId)
            save_website_artifact(meta_new, source, sourceId)
        else:
            set_artifact_active(artifact, "Y", source, sourceId)


def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler, test=False):
    """Expand a source company with related names, websites and apps.

    Old expanded data is cleaned first. Then up to ``_MAX_EXPAND_ROUNDS``
    passes are run; each pass loads the company names, main beian numbers
    and artifacts returned by ``find_mongo_data`` and uses them to
      A. query beian (ICP filing) records,
      B. refresh / discover iTunes apps,
      C. refresh / discover Android apps,
      D. check website meta information,
    and finally marks the processed entries through the ``set_*_expand``
    helpers. The loop stops early when a pass has nothing to process. In
    both cases the source company is flagged ``scexpanded``.

    NOTE(review): the statement nesting of this function was reconstructed
    from whitespace-collapsed source; confirm it against the upstream file.

    :param source: source code of the source company.
    :param sourceId: id of the company within ``source``.
    :param beian_links_crawler: crawler queried by company name, domain and
        main beian number.
    :param icp_chinaz_crawler: second crawler queried by domain and main
        beian number; its results are merged with the first one's.
    :param screenshot_crawler: unused here (screenshot capture is disabled).
    :param test: when true, no crawler is called and lookups that miss the
        local collections yield empty results.
    """
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")
    expand_clean(source, sourceId)

    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})
    _register_fullname(sourcecompany, source, sourceId)

    for round_no in range(1, _MAX_EXPAND_ROUNDS + 1):
        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)

        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))

        # Stop as soon as there is nothing new to expand.
        if not (source_company_names or artifacts or main_beianhaos):
            break

        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, round_no)

        _expand_by_beian(source_company_names, artifacts, main_beianhaos, source, sourceId,
                         beian_links_crawler, icp_chinaz_crawler, test)
        _expand_by_itunes(source_company_names, artifacts, source, sourceId)
        _expand_by_android(source_company_names, artifacts, source, sourceId)
        # TODO: former company names are not handled yet.
        _check_website_artifacts(artifacts, source, sourceId)

        # Mark everything handled in this pass.
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)

    _mark_source_company_expanded(sourcecompany)
logger.info("investor aggregator start")

# Load the source investors that still await processing (processStatus=0).
conn = db.connect_torndb()
try:
    source_investors = conn.query(
        "select * from source_investor where processStatus=0 order by id")
finally:
    # Release the connection even when the query fails.
    conn.close()

for source_investor in source_investors:
    logger.info(source_investor["id"])

    # Derive the website domain; it stays None unless the website is a
    # plain site (type 4010) with a resolvable domain.
    source_investor["domain"] = None
    if source_investor["website"] is not None:
        source_investor["website"] = url_helper.url_normalize(
            source_investor["website"])
        type, market, website_domain = url_helper.get_market(
            source_investor["website"])
        if type == 4010 and website_domain is not None:
            source_investor["domain"] = website_domain

    if source_investor["investorId"] is not None:
        # Already linked to an investor: update it and mark as processed.
        investor = find_in_investor("id", source_investor["investorId"])
        update_investor(investor, source_investor)
        set_processStatus(source_investor["id"])
        continue
    else:
        # Name check. The original compared the bound method ``name.strip``
        # with "" (always true); the call is what was meant.
        name = source_investor["name"]
        if name is not None and name.strip() != "":
            # NOTE(review): the body of this branch is not present in the
            # visible source; placeholder keeps the fragment well-formed.
            pass
def parse_artifact(item):
    """Extract website and app artifacts from a crawled company HTML page.

    Every product entry (``ul.list-prod> li> div.on-edit-hide``) tagged
    ``网站`` or ``app`` is inspected. The link is classified with
    ``url_helper.get_market``:

    * 4010 (website): only accepted for ``网站`` entries; the link is
      normalized and its domain comes from ``url_helper.get_domain``.
    * 4040: the domain is the app id.
    * 4050: for markets 16010 / 16020 the apkname is looked up with
      ``parser_db_util.find_android_market``; otherwise the app id is used.

    Entries without a domain are dropped unless their type is 4020 or 4030.

    :param item: crawled document with the HTML under ``item["content"]``,
        or None.
    :return: None when ``item`` is None, otherwise a list of dicts with the
        keys ``type``, ``name``, ``desc``, ``link`` and ``domain``.
    """
    if item is None:
        return None

    artifacts = []
    d = pq(item["content"])

    logger.info("*** artifact ***")
    for li in d('ul.list-prod> li> div.on-edit-hide'):
        l = pq(li)
        strtype = l('h4> span.tag').text().strip()
        if strtype != u"网站" and strtype != "app":
            continue

        # attr() yields None when the anchor or its href is missing.
        href = l('h4> b> a').attr("href")
        link = href.strip() if href else ""
        if link == "":
            continue

        artifact_type, app_market, app_id = url_helper.get_market(link)
        domain = None
        if artifact_type == 4010:
            if strtype != u"网站":
                # A plain website link listed as an "app" has no usable domain.
                continue
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is None:
                continue
            if flag is False:
                domain = None
        elif artifact_type == 4040:
            domain = app_id
        elif artifact_type == 4050:
            if app_market == 16010 or app_market == 16020:
                android_app = parser_db_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id

        if domain is None and artifact_type != 4030 and artifact_type != 4020:
            continue

        name = l('h4> b').text().strip()
        desc = l('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s", artifact_type, name, link, desc)
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": domain
        })

    logger.info("")
    return artifacts