def find_company_by_full_name(companyId, full_name):
    """Find an active company (other than companyId) by normalized full name.

    Checks the company table first; when the module-level ``caflag`` is set,
    falls back to company_alias rows of type 12010. Returns the matched
    company id, or None when the name is empty or nothing matches.
    """
    global caflag
    logger.info("find_company_by_full_name")
    if full_name is None or full_name.strip() == "":
        return None
    full_name = name_helper.company_name_normalize(full_name)

    conn = db.connect_torndb()
    try:
        company = conn.get(
            "select * from company where fullName=%s and (active is null or active !='N') and id!=%s order by id desc limit 1",
            full_name, companyId)
    finally:
        # Always release the connection, even if the query raises
        # (the original leaked it on error).
        conn.close()
    if company is not None:
        logger.info("find_company_by_full_name 1")
        return company["id"]

    # add company_alias into checking list
    if caflag is True:
        conn = db.connect_torndb()
        try:
            company_alias = conn.get(
                "select a.* from company_alias a join company c on c.id=a.companyId where (c.active is null or c.active !='N') \
and a.type=12010 and a.name=%s and c.id!=%s order by c.id desc limit 1",
                full_name, companyId)
        finally:
            conn.close()
        if company_alias is not None:
            logger.info("find_company_by_full_name 2")
            return company_alias["companyId"]
    return None
def find_company_by_full_name(full_name):
    """Resolve a full name to a corporate_alias or company_alias row id.

    The name is normalized first; corporate_alias is consulted before
    company_alias. Returns the matching row's "id", or None when the
    name is empty or neither table matches.
    """
    if full_name is None or full_name == "":
        return None
    full_name = name_helper.company_name_normalize(full_name)

    lookups = [
        ("select * from corporate_alias where name=%s and (active is null or active !='N') limit 1",
         "find_corporate_by_full_name 1"),
        ("select * from company_alias where name=%s and (active is null or active !='N') limit 1",
         "find_company_by_full_name 1"),
    ]
    for sql, hit_msg in lookups:
        conn = db.connect_torndb()
        row = conn.get(sql, full_name)
        conn.close()
        if row is not None:
            logger.info(hit_msg)
            return row["id"]
    return None
def parse_company(item):
    """Classify a crawled company page.

    Returns None for a missing item; otherwise a dict whose "status" is
    "No_Name" (no usable name), "Sub_company" (branch office), or "good".
    """
    if item is None:
        return None
    d = pq(item["content"])

    # attr() yields None when the <img> is absent — the original called
    # .startswith on it and crashed. Also, any string starting with
    # "https" already starts with "http", so a single test suffices.
    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None and not logo.startswith("http"):
        logo = "http:" + logo

    name = d('.company_main > h1 > a').text()
    fullName = name_helper.company_name_normalize(
        d('.company_main > h1 > a').attr('title'))
    if name is None or fullName is None:
        return {
            "status": "No_Name",
        }
    # Keep the shorter of the two as the display name.
    if len(name) > len(fullName):
        name = fullName
    # Branch offices ("分公司") are reported separately, not parsed further.
    if fullName.find("分公司") >= 0:
        return {"status": "Sub_company", "name": fullName}
    return {"status": "good"}
def find_company_by_name(names):
    """Collect distinct active-company ids matching any of *names*.

    Each name is normalized, then matched against company.fullName and
    company.name. Ids are returned de-duplicated in discovery order.
    """
    companyIds = []
    # One shared connection for all lookups; the original opened a new
    # connection per name and never closed any of them (leak).
    conn = db.connect_torndb()
    try:
        for name in names:
            name = name_helper.company_name_normalize(name)
            companies = conn.query(
                "select * from company where fullName=%s and (active is null or active !='N') order by id desc",
                name)
            companyIds.extend([
                company["id"] for company in companies
                if company["id"] not in companyIds
            ])
            # logger.info("a: %s",companyIds)
            companies2 = conn.query(
                "select * from company where name=%s and (active is null or active !='N') order by id desc",
                name)
            companyIds.extend([
                company["id"] for company in companies2
                if company["id"] not in companyIds
            ])
            # logger.info("b: %s", companyIds)
    finally:
        conn.close()
    return companyIds
def process(company):
    """Crawl tianyancha data for *company*, retrying with fresh proxies.

    Generator-based tornado coroutine: loops until one proxy carries the
    step1..step7 pipeline through (or the company is known absent), then
    marks the company as checked and stops. A False/-1 from any step
    abandons the current proxy and retries with a new one.
    """
    company_name = name_helper.company_name_normalize(company["name"])
    # Strip single quotes so the name is safe to embed downstream.
    company_name = company_name.replace("'", "")
    while True:
        proxy = yield get_proxy()
        # Random pacing to avoid hammering the target site.
        wait_time = random.randrange(3, 10)
        yield gen.sleep(wait_time)
        TYCID = proxy.get("TYCID")
        if TYCID is None:
            # step1: search the company page to obtain a TYCID.
            flag = yield step1(proxy, company_name)
            if flag is False:
                continue
        # step2: with a TYCID available, fetch /tongji/companyname.json.
        flag = yield step2(proxy, company_name)
        if flag is False:
            continue
        # step3: search for the tianyancha company id.
        tyc_company_id = yield step3(proxy, company_name)
        if tyc_company_id == -1:
            continue
        elif tyc_company_id == 0:
            # Company not found on the site at all.
            update_check_time(company, exist=False)
            # NOTE(review): this exit skips release_proxy/proxy_success —
            # confirm the proxy is reclaimed elsewhere.
            break
        wait_time = random.randrange(1, 3)
        yield gen.sleep(wait_time)
        # step4
        flag = yield step4(proxy, company_name, tyc_company_id)
        if flag is False:
            continue
        # step5
        flag = yield step5(proxy, company_name, tyc_company_id)
        if flag is False:
            continue
        # step6
        flag = yield step6(proxy, company_name, tyc_company_id)
        if flag is False:
            continue
        update_check_time(company, exist=True)
        # step7: follow-up data after the company is marked checked.
        yield step7(proxy, company_name, tyc_company_id)
        release_proxy(proxy)
        proxy_success(proxy)
        yield gen.sleep(1)
        break
def process(company, proxy):
    """Run the step1..step7 scrape pipeline for *company* on *proxy*.

    Returns False when any step fails (caller retries with another
    proxy); returns True once the company is fully processed or is
    known not to exist on the site.
    """
    company_name = name_helper.company_name_normalize(company["name"])
    company_name = company_name.replace("'", "")

    time.sleep(random.randrange(3, 10))

    if proxy.get("TYCID") is None:
        # step1: search the company page to obtain a TYCID.
        if step1(proxy, company_name) is False:
            return False
    # step2: with a TYCID available, fetch /tongji/companyname.json.
    if step2(proxy, company_name) is False:
        return False
    # step3: resolve the tianyancha company id.
    tyc_company_id = step3(proxy, company_name)
    if tyc_company_id == -1:
        return False
    if tyc_company_id == 0:
        # Company not present on the site.
        update_check_time(company, exist=False)
        return True

    time.sleep(random.randrange(1, 3))

    # Steps 4-6 must all succeed before the company counts as checked.
    for step in (step4, step5, step6):
        if step(proxy, company_name, tyc_company_id) is False:
            return False
    update_check_time(company, exist=True)
    # step7: follow-up data after the company is marked checked.
    step7(proxy, company_name, tyc_company_id)
    time.sleep(1)
    return True
def find_from_gongshang(name):
    """Queue corporate investors/investees of *name* from gongshang records."""
    name = name_helper.company_name_normalize(name)
    if name is None:
        return
    chinese, company = name_helper.name_check(name)
    # Only Chinese company-style names are looked up.
    if not (chinese is True and company is True):
        return
    gs = mongo.info.gongshang.find_one({"name": name})
    if gs is None:
        return
    # Corporate investors only (type == "企业投资").
    for investor in gs["investors"]:
        if investor["type"] == u"企业投资":
            logger.info("gongshang name: %s", investor["name"])
            add_2_company_list(investor["name"])
    # Outbound investments, when the record has any.
    for invest in gs.get("invests", []):
        add_2_company_list(invest["name"])
def add_2_company_list(name):
    """Insert *name* into mongo company_idx unless it is already recorded."""
    name = name_helper.company_name_normalize(name)
    if name is None:
        return
    chinese, company = name_helper.name_check(name)
    # Only Chinese company-style names are indexed.
    if chinese is not True or company is not True:
        return
    logger.info("fullname: %s", name)
    name_md5 = util.md5str(name)
    if mongo.info.company_idx.find_one({"name_md5": name_md5}) is not None:
        return
    mongo.info.company_idx.insert_one({
        "name": name,
        "name_md5": name_md5,
        "createTime": datetime.datetime.utcnow()
    })
def parser(item):
    """Parse an investor detail page.

    Returns a tuple (key, name, logo, website, stageStr, fieldsStr, desc),
    or None when the item is missing or carries no investor name.
    """
    if item is None:
        return None
    investor_key = item["key"]
    d = pq(item["content"])

    investor_name = name_helper.company_name_normalize(
        d('div.picinfo> p> span.title').text())
    # Check for a missing name BEFORE logging it: the original concatenated
    # None into the log string and raised TypeError instead of returning.
    if investor_name is None:
        logger.info("No investor name!!!")
        return None
    logger.info("investor_name: " + investor_name)

    logo = d('div.pic> img').attr("src")
    if logo is not None:
        logo = logo.strip()
    logger.info("Investor Logo: %s" % logo)

    # "暂无" means "not available" — treat it as no website.
    website = d('span.links >a[target="_black"]').attr("href")
    if website is None or website.strip() == "暂无":
        website = None
    website = url_helper.url_normalize(website)
    flag, domain = url_helper.get_domain(website)
    if flag is None:
        website = None
    logger.info("Investor website: %s" % website)

    # Stage / field tags are space-separated in the page; store as CSV.
    stageStr = d('div.pad.block> div.list-tags.yellow').text().replace(
        " ", ",").strip()
    logger.info("Investor rounds: %s" % stageStr)
    fieldsStr = d('div.pad.block> div.list-tags.darkblue').text().replace(
        " ", ",").strip()
    logger.info("Investor fields: %s" % fieldsStr)

    desc = d('div.des').text().strip()
    logger.info("Investor desc: %s" % desc)
    return investor_key, investor_name, logo, website, stageStr, fieldsStr, desc
def save_beian_company_names(items, source, sourceId):
    """Record ICP-beian organizer names as source company names.

    Skips expired whois entries and non-enterprise ("企业") organizers,
    and only inserts names not already attached to (source, sourceId).
    """
    for entry in items:
        if entry.get("whoisExpire") == 'Y':
            continue
        if entry["organizerType"] != "企业":
            continue
        company_name = name_helper.company_name_normalize(entry["organizer"])
        existing = collection_source_company.find_one({
            "source": source,
            "sourceId": sourceId,
            "source_company_name.name": company_name,
        })
        if existing is not None:
            continue
        save_mongo_source_company_name(source, sourceId, {
            "name": company_name,
            "chinese": 'Y',
            "type": 12010,
            "extended": 'Y',
        })
def find_company_by_fullname(full_name):
    """Return ids of active companies whose fullName or name equals *full_name*.

    The name is normalized first; ids are de-duplicated in discovery order
    (fullName matches before name matches).
    """
    full_name = name_helper.company_name_normalize(full_name)
    found_ids = []
    conn = db.connect_torndb()
    for sql in (
        "select * from company where fullName=%s and (active is null or active !='N') order by id desc",
        "select * from company where name=%s and (active is null or active !='N') order by id desc",
    ):
        for row in conn.query(sql, full_name):
            if row["id"] not in found_ids:
                found_ids.append(row["id"])
    conn.close()
    return found_ids
def save_company_name(app, item_of_name, source, sourceId):
    """Store app[item_of_name] as a source company name if not yet recorded."""
    raw_name = app[item_of_name]
    if raw_name is None or raw_name.strip() == "":
        return
    company_name = name_helper.company_name_normalize(raw_name)
    existing = collection_source_company.find_one({
        "source": source,
        "sourceId": sourceId,
        "source_company_name.name": company_name,
    })
    if existing is not None:
        return
    # NOTE: the language check runs on the raw (un-normalized) name,
    # matching the original behavior.
    chinese, company = name_helper.name_check(raw_name)
    save_mongo_source_company_name(source, sourceId, {
        "name": company_name,
        "chinese": "Y" if chinese is True else "N",
        "type": 12010,
        "extended": 'Y',
    })
def find_companies_by_full_name_corporate(full_names):
    """Map corporate-alias full names to active company ids.

    For each name, finds matching active corporate_alias rows, then the
    active company attached to each corporate. Returns de-duplicated
    company ids in discovery order. Empty/None names are skipped.
    """
    companyIds = []
    for full_name in full_names:
        if full_name is None or full_name == "":
            continue
        full_name = name_helper.company_name_normalize(full_name)
        # Connection stays open across the inner per-alias lookups below.
        conn = db.connect_torndb()
        corporate_aliases = conn.query("select a.* from corporate_alias a join corporate c on c.id=a.corporateId where "
                                       "(c.active is null or c.active !='N') and (a.active is null or a.active !='N') "
                                       "and a.name=%s", full_name)
        # conn.close()
        for ca in corporate_aliases:
            # logger.info("*******found %s",ca)
            company = conn.get("select * from company where corporateId=%s and (active is null or active!='N') limit 1",
                               ca["corporateId"])
            if company is not None:
                logger.info("find_company_by_full_name %s: %s", full_name, company["id"])
                if company["id"] not in companyIds:
                    companyIds.append(company["id"])
        conn.close()
    return companyIds
def parse_company(item):
    """Parse a 36kr company item (pre-fetched JSON payload) into a dict
    of source_company fields ready for persistence."""
    logger.info("parse_company")
    company_key = item["key"]
    # company basic info
    c = item["content"]["company_base"]["data"]
    # check if page is under development or is completed(CREATED)
    # if c["status"] == "INIT":
    #     return {
    #         "status":c["status"],
    #     }
    tags = item["content"]["company_base"]["data"]["industryTag"]
    tags2 = []
    for tag in tags:
        tags2.append(tag["name"])
    tags_str = ",".join(tags2)
    logo = c["logo"]
    if logo:
        # Stored logo URLs are forced to plain http.
        logo = logo.replace("https://", "http://")
    establish_date = None
    if c.has_key("startDate"):
        # startDate is epoch milliseconds; years <= 1980 are treated as bogus.
        d = time.localtime(c["startDate"] / 1000)
        if d.tm_year > 1980:
            establish_date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday)
    address1 = None
    address2 = None
    if c.has_key("address1"):
        address1 = c["address1"]
    if c.has_key("address2"):
        address2 = c["address2"]
    location_id = 0
    # Resolve a location id, preferring address2, falling back to address1.
    if address2 != None:
        city = kr36_cities.get(str(address2), None)
        if city != None:
            location = parser_db_util.get_location(formCityName(city))
            if location != None:
                location_id = location["locationId"]
    if location_id == 0 and address1 != None:
        city = kr36_cities.get(str(address1), None)
        if city != None:
            location = parser_db_util.get_location(formCityName(city))
            if location != None:
                location_id = location["locationId"]
    #logger.info("locationid =%s",location_id)
    fullName = c["fullName"]
    fullName = fullName.replace("_", "")
    # Truncate everything after the last "公司" (company) suffix, then normalize.
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = name_helper.company_name_normalize(fullName)
    desc = ""
    productDesc = None
    modelDesc = None
    operationDesc = None
    teamDesc = None
    marketDesc = None
    compititorDesc = None
    advantageDesc = None
    planDesc = None
    otherDesc = None
    if c.has_key("companyIntroduce"):
        if c["companyIntroduce"]["productService"] is not None and c[
                "companyIntroduce"]["productService"].strip(
                ) != "":
            # productService
            productDesc = c["companyIntroduce"]["productService"]
        if c["companyIntroduce"]["userMarket"] is not None and c[
                "companyIntroduce"]["userMarket"].strip() != "":
            marketDesc = c["companyIntroduce"]["userMarket"]
    # if c.has_key("dataLights"):  # "our users"
    #     operationDesc = c["dataLights"].strip()
    # if c.has_key("projectPlan"):  # "our future"
    #     modelDesc = c["projectPlan"].strip()
    # if c.has_key("competitor"):  # "products similar to ours"
    #     compititorDesc = c["competitor"].strip()
    if c.has_key("intro"):
        # "other" / general introduction becomes the main description.
        # otherDesc = c["intro"].strip()
        desc = c["intro"].strip()
    # if c.has_key("story"):  # team introduction
    #     teamDesc = c["story"].strip()
    headCount = c["scale"].replace("人", "")
    min_staff = None
    max_staff = None
    if headCount.strip() != "":
        # "少于15" means "fewer than 15"; otherwise "min-max" or a bare number.
        if headCount == "少于15":
            min_staff = 1
            max_staff = 15
        else:
            staffarr = headCount.split('-')
            if len(staffarr) > 1:
                try:
                    min_staff = int(staffarr[0])
                    max_staff = int(staffarr[1])
                except:
                    pass
            else:
                try:
                    min_staff = int(staffarr[0].strip())
                    max_staff = None
                except:
                    pass
    return {
        "name": c["name"],
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": productDesc,
        "modelDesc": modelDesc,
        "operationDesc": operationDesc,
        "teamDesc": teamDesc,
        "marketDesc": marketDesc,
        "compititorDesc": compititorDesc,
        "advantageDesc": advantageDesc,
        "planDesc": planDesc,
        "otherDesc": otherDesc,
        "brief": c["brief"],
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": min_staff,
        "headCountMax": max_staff
    }
def parse_company(item):
    """Parse a zhipin (BOSS直聘) company page into a source_company dict,
    including product artifacts and team members."""
    if item is None:
        logger.info("here")
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))
    name = d('h1.name').text().strip()
    fullName = d('div.company-business> h4').text()
    # Drop the "来源" (source) prefix when present.
    if fullName.find("来源") >= 0:
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)
    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    # Keep the shorter of the two as the display name.
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }
    logo = d('div.company-logo> img').attr('src')
    if logo.startswith("http") or logo.startswith("https") or logo.find("default") >= 0:
        pass
    else:
        logo = None
    # if logo.find("default") >= 0:
    #     logo = None
    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)
    # "该公司尚未添加公司介绍" means the company has no introduction yet.
    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace("展开", "").replace(" ", "").strip()
    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        # Stage, head count and field appear space-separated in one <p>.
        lll = d('div.info-primary> p').text().strip()
        if len(lll.split(" ")) == 3:
            field = lll.split(" ")[2]
            stage = lll.split(" ")[0]
            headCount = lll.split(" ")[1]
    except:
        pass
    headCount = headCount.replace("人", "")
    # "少于15" means "fewer than 15"; otherwise "min-max" or a bare number.
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): these stay strings here, unlike the int()
            # branch below — confirm whether that is intentional.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None
    funding_type = 0
    # Map the Chinese funding-stage label to internal round codes.
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0
    links = d('div.company-products> ul> li> div.text> div.name> a')
    artifacts = []
    for linkp in links:
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)
        type, app_market, app_id = url_helper.get_market(website)
        if type == 4010:
            # Plain website; skip self-links back to zhipin.
            if item["url"] != website and website.find("zhipin") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": None,
                        "link": website,
                        "domain": domain
                    })
        elif type == 4020 or type == 4030:
            # NOTE(review): domain is forced to None, so this branch never
            # appends an artifact — confirm whether that is intentional.
            domain = None
            if domain is not None:
                artifacts.append({
                    "type": type,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif type == 4040:
            domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4040,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4050,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
    # parser member
    members = []
    lis = d('div.manager-list> div> ul >li> div')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('div.info-user> img').attr('src')
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p> span.name').text()
                # member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p> span.job-title').text()
                member_desc = mem('div.item_manager_content').text()
                # weibo = None
                # if member_link is not None:
                #     if 'weibo.com' in member_link:
                #         weibo = member_link
                source_member = {'name': member_name,
                                 'photo_url': logo_url,
                                 'weibo': None,
                                 'location': None,
                                 'role': member_position,
                                 'description': member_desc,
                                 'education': None,
                                 'work': None
                                 }
                members.append(source_member)
            except:
                pass
    # The first company-tab link carries the alternate "gongsi" source id.
    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html", "")
    else:
        sourceId2 = None
    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(0),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceId2": sourceId2,
        "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        # NOTE(review): stage is computed above but hardcoded to 0 here —
        # confirm whether the mapped value should be stored instead.
        "stage": 0,
    }
    return source_company
def process(url, key, content):
    """Parse a 360 app-market detail page and persist the app record.

    Extracts the inline `var detail = ...` JSON for name/type/package,
    scrapes the remaining fields from the HTML, then saves/merges the
    record and advances the module-level LATEST cursor.
    """
    global LATEST
    # Only real 360 pages ("360安全中心" marker); skip anything else.
    if content.find('360安全中心') == -1:
        return
    #logger.info(content)
    # Pull the inline `var detail = (function () { return {...}; })` blob.
    r = "var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(r, content)
    (b, ) = result
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    type = base["type"]
    package = base["pname"].strip()
    #logger.info("%s, %s, %s" % (type, name, package))
    d = pq(html.fromstring(content.decode("utf-8")))
    desc = ""
    try:
        # desc = d('div.breif').contents()[0].strip()
        desc = d('div.breif').text().strip()
        # Cut at the "【基本信息】" (basic info) section marker.
        ts = desc.split("【基本信息】")
        desc = ts[0].strip()
    except:
        pass
    if desc == "":
        try:
            desc = d('div#html-brief').text().strip()
        except:
            pass
    #logger.info(desc)
    author = d('div.base-info> table> tbody> tr> td').eq(
        0).contents()[1].strip()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    # NOTE(review): author is unconditionally discarded right after being
    # normalized — confirm whether this should live in an else branch.
    author = None
    #logger.info(author)
    modify_date_str = d('div.base-info> table> tbody> tr> td').eq(
        1).contents()[1].strip()
    #logger.info(modify_date_str)
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")
    #logger.info(modify_date)
    versionname = None
    try:
        versionname = d('div.base-info> table> tbody> tr> td').eq(
            2).contents()[1].strip()
        if versionname.startswith("V"):
            versionname = versionname.replace("V", "")
    except:
        pass
    #logger.info(versionname)
    compatibility = d('div.base-info> table> tbody> tr> td').eq(
        3).contents()[1].strip()
    language = d('div.base-info> table> tbody> tr> td').eq(
        4).contents()[1].strip()
    if language == "其他":
        # "其他" = other; relabel as Chinese when the description is Chinese.
        if hz.is_chinese_string(desc):
            language = "中文"
    #logger.info(language)
    icon = d('div#app-info-panel> div> dl> dt >img').attr("src").strip()
    #logger.info(icon)
    screenshots = []
    try:
        screenshots = d('div#scrollbar').attr("data-snaps").split(",")
    except:
        pass
    commentbyeditor = None
    # "【小编点评】" = editor's comment section.
    r = "<p><strong>【小编点评】</strong>(.*?)</p>"
    result = util.re_get_result(r, content)
    if result:
        (commentbyeditor, ) = result
    updates = None
    # "【更新内容】" = update notes section.
    r = "<br/><b>【更新内容】</b><br/>(.*?)</div>"
    result = util.re_get_result(r, content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()
    tags = d("div.app-tags> a").text().replace(" ", ",")
    size = None
    r = "'size':'(.*?)'"
    result = util.re_get_result(r, content)
    if result:
        (size, ) = result
        size = int(size)
    downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace(
        "次", "").replace("+", "").strip()
    download = None
    try:
        # Expand Chinese magnitude suffixes: 千=1e3, 万=1e4, 亿=1e8.
        if downloadstr.endswith("千"):
            download = float(downloadstr.replace("千", "")) * 1000
        elif downloadstr.endswith("万"):
            download = float(downloadstr.replace("万", "")) * 10000
        elif downloadstr.endswith("亿"):
            download = float(downloadstr.replace("亿", "")) * 10000 * 10000
        else:
            download = int(downloadstr)
        # NOTE(review): score is computed but never stored in `item` below.
        score = float(d("span.s-1").text().replace("分", "").strip()) * 0.5
    except:
        traceback.print_exc()
    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
    android.save(collection, APPMARKET, item)
    android.merge(item)
    # Advance the high-water mark for incremental crawling.
    if LATEST < key:
        LATEST = key
def process(crawler, app, content):
    """Parse a Baidu app-market page for *app* and persist the record.

    Scrapes category, name and download count from the HTML, refines the
    download figure via Baidu's mosug suggest API, then saves/merges.
    Games and error pages are skipped.
    """
    # Error page marker ("please check the URL you entered").
    if content.find('请检查您所输入的URL地址是否有误') != -1:
        return
    key = app["key_int"]
    url = app["link"]
    d = pq(content)
    cate = d('div.nav> span >a').eq(1).text().strip()
    # Games ("游戏") are out of scope.
    if cate == "游戏":
        return
    sub_cate = d('div.nav> span >a').eq(2).text().strip()
    name = d('h1.app-name> span').text().strip()
    downloadstr = d("span.download-num").eq(0).text().replace("下载次数:", "").replace("+", "").strip()
    # Expand Chinese magnitude suffixes: 千=1e3, 万=1e4, 亿=1e8.
    if downloadstr.endswith("千"):
        download = float(downloadstr.replace("千", "")) * 1000
    elif downloadstr.endswith("万"):
        download = float(downloadstr.replace("万", "")) * 10000
    elif downloadstr.endswith("亿"):
        download = float(downloadstr.replace("亿", "")) * 10000 * 10000
    else:
        download = int(downloadstr)
    logger.info("%s-%s, %s, %s", cate, sub_cate, name, download)
    # Query Baidu's suggest API for a more precise download count.
    mosug_url = "http://m.baidu.com/mosug?wd=%s&type=soft" % urllib.quote(name.encode("utf-8"))
    while True:
        # NOTE(review): retries forever until the crawl succeeds — no cap.
        result = crawler.crawl(mosug_url)
        if result['get'] == 'success':
            mosug_content = result["content"]
            break
    #logger.info(mosug_content)
    data = json.loads(mosug_content)
    if data["result"].get("s") is None:
        return
    # NOTE(review): `found` is set but never read — likely leftover.
    found = False
    for dt in data["result"].get("s"):
        if dt.get("package") is None:
            continue
        if long(dt["docid"]) == key:
            download = int(dt["download_num"])
            # NOTE(review): score is computed but not stored in `item` below.
            score = int(dt["score"]) * 0.05
            break
    # screenshot
    screenshots = []
    imgs = d('img.imagefix')
    #logger.info(imgs)
    for img in imgs:
        surl = pq(img).attr("src")
        #logger.info(url)
        screenshots.append(surl)
    # content
    desc = d('p.content').text()
    #logger.info(desc)
    icon = d('div.app-pic> img').attr("src")
    #logger.info(icon)
    author = d('div.origin-wrap> span> span').eq(1).text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    #logger.info("author: %s", author)
    commentbyeditor = d('span.head-content').text()
    item = {
        "link": url,
        "apkname": app["apkname"],
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": None,
        "language": None,
        "tags": sub_cate,
        "version": app["version"],
        "updates": None,
        "size": app["size"],
        "compatibility": None,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app["type"],
        "key": str(key),
        "key_int": key,
        "download": download
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
    android.save(collection, APPMARKET, item)
    android.merge(item)
def parse_company(item):
    """Parse a lagou company page into a source_company dict,
    including website/app artifacts."""
    if item is None:
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)
    # logo_id processed in parser_db_util
    '''
    logo_id = None
    if logo_url is not None:
        logo_id = parser_util.get_logo_id(source, company_key, 'company', logo_url)
    '''
    # "这个公司的主页还在建设" means the page is still under construction.
    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }
    name = d('.company_main > h1 > a').text()
    link = d('.company_main > h1 > a').attr('href')
    fullName = d('.company_main > h1 > a').attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    # "拉勾" in the name means we scraped lagou's own page — skip it.
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    # Keep the shorter of the two as the display name.
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }
    logo = d('.top_info_wrap > img').attr('src')
    if logo.startswith("http") or logo.startswith("https"):
        pass
    else:
        logo = "http:" + logo
    # Placeholder logos are dropped.
    if logo.find("logo_default") >= 0:
        logo = None
    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()
    # "该公司尚未添加公司介绍" means no introduction has been added yet.
    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10:
        desc = None
    else:
        desc = d('.company_intro_text > .company_content').html()
        desc = desc.replace('<span class="text_over">展开</span>', '')
        soup = BeautifulSoup(desc, "lxml")
        raw = soup.getText()
        # logger.info(desc)
        #logger.info(raw)
        desc = raw
    # if desc is None or desc.strip() == "":
    #     return {
    #         "status": "No_Name",
    #     }
    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        headCount = headCount[0:headCount.index(u'人')]
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except:
        pass
    headCount = headCount.replace("people", "")
    # "少于15" means "fewer than 15"; otherwise "min-max" or a bare number.
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): these stay strings here, unlike the int()
            # branch below — confirm whether that is intentional.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None
    funding_type = 0
    # Map the Chinese funding-stage label to internal round codes.
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0
    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new != None:
        location_id = location_new["locationId"]
    #website = util.norm_url(link)
    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)
    artifacts = []
    type, app_market, app_id = url_helper.get_market(website)
    if type == 4010:
        # Plain website; skip links back to lagou itself.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif type == 4020 or type == 4030:
        # NOTE(review): domain is forced to None, so this branch never
        # appends an artifact — confirm whether that is intentional.
        domain = None
        if domain is not None:
            artifacts.append({
                "type": type,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }
    return source_company
# lines = [] cnt = 0 tot = 0 pb = 0 for line in lines: # logger.info(line) names = [name.strip() for name in line.strip().split("+++")] if len(names) != 4: logger.info(line) exit() tot += 1 shortname = names[0] fullName = names[1] brief = names[2] website = names[3] fullName = name_helper.company_name_normalize(fullName) # if len(brief) < 100: # logger.info(brief) # if len(brief) == 0: # brief = None # logger.info("none") # if website is not None and website.strip() != "": # logger.info(website) # logger.info("name:%s, fullName:%s", shortname, fullName) # company_ids = find_company.find_companies_by_full_name_corporate([fullName]) # # if len(company_ids) != 0: # logger.info("found : %s, %s", fullName, company_ids) # cnt += 1 # insert(shortname,fullName,brief,website)
def parse_company(item):
    """Parse a liepin.com company page into a source_company dict.

    item: dict with "key" (liepin company id) and "content" (raw HTML bytes).
    Returns None when item is None, {"status": "No_Name"} when no usable
    company name can be extracted, otherwise a source_company dict with
    status 1 including any executive members found on the page.
    """
    if item is None:
        logger.info("here")
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq(html.fromstring(html1.decode("utf-8")))

    # Company name: first whitespace-separated token of the <h1> text.
    # Fix: guard the split — an empty <h1> used to raise IndexError here
    # before the No_Name check below could run.
    name_parts = d('h1').text().split()
    name = name_parts[0].strip() if name_parts else ""
    fullName = name
    fullName = name_helper.company_name_normalize(fullName)
    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    # Result currently unused (the company-type rejection was disabled);
    # the call is kept for parity with the original flow.
    chinese, companycheck = name_helper.name_check(fullName)

    # Logo: keep only absolute http(s) URLs or the site's default placeholder.
    # Fix: .attr('src') returns None when the node is missing, which used to
    # crash on logo.startswith(...).
    logo = d('.bigELogo').attr('src')
    if logo is None or not (logo.startswith("http") or
                            logo.startswith("https") or
                            logo.find("default") >= 0):
        logo = None

    brief = None
    desc_text = d('.profile').text()
    logger.info("desc: %s", desc_text)
    # Treat the "no introduction yet" placeholder / very short blurbs as empty.
    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace(
            "展开", "").replace(" ", "").strip()

    field = d('.comp-industry').text().strip()

    # Head count, e.g. "少于15" or "100-499人".
    # Fix: guard the [-1] — an empty node used to raise IndexError.
    head_parts = d('.new-compintro li:nth-child(2)').text().split()
    headCount = head_parts[-1] if head_parts else ""
    # data-city attribute; currently unused downstream but kept for parity.
    location = d('.new-compintro li:nth-child(3)').attr('data-city')
    address = d('.new-compintro li:nth-child(3)').text().replace('公司地址:', '').strip()
    headCount = headCount.replace("人", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): range bounds stay strings here while the single
            # value below is int() — preserved for downstream compatibility.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None

    artifacts = []

    # Executives listed on the page, best-effort per <dl> entry.
    members = []
    lis = d('div.executive dl')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('img').attr('src')
                # Protocol-relative avatar URLs get an explicit scheme.
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_name = mem('p:nth-child(2)').text()
                member_position = mem('p:nth-child(3)').text()
                member_desc = mem('dd').text()
                source_member = {
                    'name': member_name,
                    'photo_url': logo_url,
                    'weibo': None,
                    'location': None,
                    'role': member_position,
                    'description': member_desc,
                    'education': None,
                    'work': None
                }
                members.append(source_member)
            except:
                # Skip members whose markup doesn't match the expected shape.
                pass

    source_company = {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': None,
        "locationId": int(0),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.liepin.com/company/%s/" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }
    return source_company
def parse_company(item):
    """Parse a company profile page (item["content"] is raw HTML bytes,
    item["key"] the source id) into a partial company dict.

    Returns 0 when the name looks non-company and no website was found
    (NOTE(review): inconsistent with the dict return — confirm callers
    handle the integer sentinel), otherwise a dict of company fields,
    possibly carrying 'fakeName' when the scraped name is untrusted.
    """
    # logger.info("parse_company")
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]
    # company basic info: dedupe whitespace-separated tag words.
    tags = []
    for tag in d('.word_list').text().split():
        if tag.strip() not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)
    logo = d('.peoimg img').attr('src')
    if logo:
        # Store logos consistently over http.
        logo = logo.replace("https://", "http://")
    # Establish date only when the last timeline entry mentions 成立 (founded).
    establish_date = None
    time_content = d('.time_content li:last-child')
    if d(time_content)('.upword').text().find('成立') > 0:
        establish_date = d(time_content)('.time_up').text()
        establish_date = datetime.datetime.strptime(establish_date, '%Y-%m-%d')
    companyName = d('.company_div h5').text()
    # Derive a locationId from the city embedded in the company name.
    city = name_helper.get_location_from_company_name(companyName)[0]
    location_id = 0
    if city != None:
        location = parser_db_util.get_location(city)
        if location != None:
            location_id = location["locationId"]
    # logger.info("locationid =%s",location_id)
    fullName = companyName.replace("_", "")
    fullName = name_helper.company_name_normalize(fullName)
    desc = d('#intro_srocll p').text()
    # Product paragraphs double as a description fallback; the paragraph
    # that mentions 官网 (official site) with a link yields the website.
    productDesc = ''
    website = ''
    for p in d('.procont_lis p'):
        if d(p).text().find('官网') > 0 and d(p)('a').attr('href') is not None:
            website = d(p)('a').attr('href')
            continue
        productDesc += d(p).text() + '\n'
    if desc == '' or desc is None:
        desc = productDesc
    # Short name: strip everything after the first separator (both colon
    # variants, em-dash, comma, pipe).
    shortName = d('.peo_center h4').text().split(':')[0].split(':')[0].split(
        '——')[0].split(',')[0].split('|')[0]
    companyResult = {}
    # isCompany
    # print companyName,company_key, ',', name_helper.name_check(companyName)[1], ',', len(website)>0
    if name_helper.name_check(companyName)[1] == True:
        # English name
        if name_helper.name_check(shortName)[0] == False:
            pass
        else:
            # Heuristic: if fewer than 3 characters of the short name occur
            # in the full name, distrust it and fall back to the full name.
            cnt = 0
            for s in shortName:
                if s in companyName:
                    cnt += 1
            if not cnt > 2:
                shortName = companyName
    else:
        # Name doesn't look like a registered company: require a website,
        # otherwise bail out; keep the scraped name only as 'fakeName'.
        if not len(website) > 0:
            return 0
        else:
            companyResult['fakeName'] = fullName
            fullName = None
    companyResult.update({
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    })
    return companyResult
def parse_company(item):
    """Parse a 36kr company payload (pre-decoded JSON dicts) into a company dict.

    item: {"key": <source id>, "content": {"company_base": {"data":
          {"company": {...}, "tags": [...]}}}}.
    Returns {"status": "INIT"} for pages still under construction, otherwise
    the assembled company dict (the upstream status is carried through).
    """
    logger.info("parse_company")
    company_key = item["key"]

    # company basic info
    c = item["content"]["company_base"]["data"]["company"]

    # Check if page is under development or is completed (CREATED).
    if c["status"] == "INIT":
        return {
            "status": c["status"],
        }

    tags = item["content"]["company_base"]["data"]["tags"]
    tags2 = []
    for tag in tags:
        tags2.append(tag["name"])
    tags_str = ",".join(tags2)

    logo = c["logo"]
    if logo:
        # Store logos consistently over http.
        logo = logo.replace("https://", "http://")

    # startDate is epoch milliseconds; ignore clearly-bogus pre-1980 values.
    # (idiom: `in` replaces Python-2-only dict.has_key throughout)
    establish_date = None
    if "startDate" in c:
        d = time.localtime(c["startDate"] / 1000)
        if d.tm_year > 1980:
            establish_date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday)

    # Resolve a locationId: try the more specific address2 first, then address1.
    address1 = None
    address2 = None
    if "address1" in c:
        address1 = c["address1"]
    if "address2" in c:
        address2 = c["address2"]
    location_id = 0
    if address2 != None:
        city = kr36_cities.get(str(address2), None)
        if city != None:
            location = parser_mongo_util.get_location(formCityName(city))
            if location != None:
                location_id = location["locationId"]
    if location_id == 0 and address1 != None:
        city = kr36_cities.get(str(address1), None)
        if city != None:
            location = parser_mongo_util.get_location(formCityName(city))
            if location != None:
                location_id = location["locationId"]
    # logger.info("locationid =%s",location_id)

    # Truncate the full name after the last "公司" and normalize it.
    fullName = c["fullName"]
    fullName = fullName.replace("_", "")
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = name_helper.company_name_normalize(fullName)

    # Optional long-form description fields; absent keys stay None.
    desc = ""
    productDesc = None
    modelDesc = None
    operationDesc = None
    teamDesc = None
    marketDesc = None
    compititorDesc = None
    advantageDesc = None
    planDesc = None
    if "projectAdvantage" in c:
        productDesc = c["projectAdvantage"].strip()
    if "dataLights" in c:
        operationDesc = c["dataLights"].strip()
    if "projectPlan" in c:
        modelDesc = c["projectPlan"].strip()
    if "competitor" in c:
        compititorDesc = c["competitor"].strip()
    if "intro" in c:
        desc = c["intro"].strip()
    if "story" in c:
        teamDesc = c["story"].strip()

    return {
        "status": c["status"],
        "name": c["name"],
        "fullName": fullName,
        "description": desc,
        "productDesc": productDesc,
        "modelDesc": modelDesc,
        "operationDesc": operationDesc,
        "teamDesc": teamDesc,
        "marketDesc": marketDesc,
        "compititorDesc": compititorDesc,
        "advantageDesc": advantageDesc,
        "planDesc": planDesc,
        # Fix: brief may be absent, unlike the guarded fields above —
        # c["brief"] used to raise KeyError on such payloads.
        "brief": c.get("brief"),
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": c.get("industry"),
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None
    }
def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler, test=False):
    """Expand one source company's identity graph.

    Iteratively (at most 5 rounds) cross-references three data families until
    no new facts appear:
      A. ICP beian records  - by company name, by domain, by main beian number
      B. iTunes apps        - by track id, by seller name, by seller/support domain
      C. Android apps       - by apkname, by author, by website/apkname domain
    then fetches website meta info for every website artifact and finally
    marks artifacts / company names / beianhaos as expanded.

    Results are written through the save_* helpers; test=True skips live
    crawling and screenshots.
    """
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")
    expand_clean(source, sourceId)
    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})
    # exit()

    # Seed: make sure the normalized fullName itself is registered as a
    # company name (type 12010) before expansion starts.
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is not None and company_fullname.strip() != "":
        company_fullname = name_helper.company_name_normalize(company_fullname)
        scnames = sourcecompany["source_company_name"]
        check_fullname = False
        for scname in scnames:
            if scname["name"] == company_fullname:
                check_fullname = True
                break
        if check_fullname is False:
            (chinese, company) = name_helper.name_check(company_fullname)
            if chinese is True:
                chinese_type = "Y"
            else:
                chinese_type = "N"
            scname_data = {
                "name": company_fullname,
                "chinese": chinese_type,
                "type": 12010,
            }
            save_mongo_source_company_name(source, sourceId, scname_data)

    # NOTE(review): `round` shadows the builtin.
    round = 1
    while True:
        # Hard stop after 5 rounds to bound the fixpoint iteration.
        if round >= 6:
            collection_source_company.update_one({"_id": sourcecompany["_id"]}, {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break
        # Fetch the not-yet-expanded facts discovered so far.
        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)
        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
        # Check if there are new stuff which need to do expansion.
        if len(source_company_names) == 0 and len(artifacts) == 0 and len(main_beianhaos) == 0:
            collection_source_company.update_one({"_id": sourcecompany["_id"]}, {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, round)

        # Step A/1: beian lookup by company name.
        logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
        for source_company_name in source_company_names:
            # Only check chinese company name.
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            if source_company_name["chinese"] is None:
                # NOTE(review): name_check returns a boolean here, which can
                # never equal "Y" below, so names with chinese=None are always
                # skipped — looks like a bug; confirm intended behavior.
                (chinese, companyName) = name_helper.name_check(source_company_name["name"])
            else:
                chinese = source_company_name["chinese"]
            if chinese != "Y":
                continue
            check_name = list(collection_beian.find({"organizer": source_company_name["name"]}))
            # Case that one company_name has multiple beian# : 上海汇翼->(today.ai/teambition.com)
            # If only one found in Mongo.beian(organizer) it is fine.
            if len(check_name) == 0:
                if test:
                    items_beianlinks = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_company_name(source_company_name["name"])
                save_collection_beian(collection_beian, items_beianlinks)  # insert infos into Mongo.beian
            else:
                items_beianlinks = check_name
            save_beian_artifacts(items_beianlinks, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_beianlinks, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_beianlinks, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # beian: discovers more artifacts (websites), company names and main beianhaos.

        # Step A/2: beian lookup by domain.
        logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
        for artifact in artifacts:
            # Only check if artifact is a website.
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                link = url_helper.url_normalize(artifact["link"])
                (flag, domain) = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            check_domain = list(collection_beian.find({"domain": domain}))
            if len(check_domain) == 0:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_domain(domain)
                    items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
            else:
                items_merge = check_domain
            # filter by check domain to avoid sinaapp.cn case
            items_merge = filter_domain(items_merge, domain)
            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # beian: discovers more artifacts (websites), company names and main beianhaos.

        # Step A/3: beian lookup by main beian number.
        logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
        for main_beianhao in main_beianhaos:
            mainBeianhao = main_beianhao["mainBeianhao"]
            check_mainBeianhao = collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao})
            if check_mainBeianhao is None:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                    items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
                # if mainBeianhao could be found in two links
                if len(items_merge) > 0:
                    items_main_beianhao = [{"mainBeianhao": mainBeianhao}]
                    save_collection_mainBeianhao(collection_main_beianhao, items_main_beianhao)  # insert mainBeianhao into Mongo.main_beianhao
            else:
                items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))
            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # Discovers more artifacts (websites) and company names.

        # === iTunes expansion ===
        # Step B/1: resolve existing itunes artifacts by track id.
        logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)
        itunes_company_enames = {}
        app_by_name = {}
        for artifact in artifacts:
            if artifact["type"] != 4040:
                continue
            # Get trackid.
            trackid = None
            if artifact["domain"] is None:
                (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
                if apptype != 4040:
                    continue
            else:
                try:
                    trackid = int(artifact["domain"])
                except:
                    pass
            if trackid is not None:
                app = collection_itunes.find_one({"trackId": trackid})
                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_itunes(app, artifact, source, sourceId)  # exists: copy from mongo.itunes
                    if app.has_key("offline") and app["offline"] is True:
                        set_artifact_active(artifact, "Offline", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                    english, is_company = name_helper.english_name_check(app["sellerName"])
                    if english and is_company:
                        # NOTE(review): literal key "sellerName" means this dict
                        # holds at most one entry regardless of how many apps
                        # match — only the last app is kept; confirm intent.
                        itunes_company_enames["sellerName"] = 1
                        app_by_name = app
            else:
                set_artifact_active(artifact, "N", source, sourceId)
        # Save the only english name (only when no english 12010 name exists yet).
        if len(itunes_company_enames) == 1:
            company_name = collection_source_company.find_one({"source": source, "sourceId": sourceId, "source_company_name": {"$elemMatch": {"type": 12010, "chinese": "N"}}})
            if company_name is None:
                save_company_name(app_by_name, "sellerName", source, sourceId)

        # Step B/2: find more itunes artifacts by company name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            '''
            check_itunes_producers = list(collection_itunes.find({"developer": source_company_name["name"]}))
            if len(check_itunes_producers) > 0:
                for app in check_itunes_producers:
                    # Check if itunesId is already existed in artifacts
                    if find_itunesId(app["trackId"], source_company_id):
                        pass
                    else:
                        source_artifact_id = save_itunes_artifact(app, source_company_id)
                        #save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        save_company_name(app, "developer", source_company_id)
            '''
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            check_itunes_sellers = list(collection_itunes.find({"sellerName": source_company_name["name"]}))
            if len(check_itunes_sellers) > 0:
                '''
                domains = {}
                for app in check_itunes_sellers:
                    sellerUrl = app.get("sellerUrl")
                    flag ,domain = url_helper.get_domain(sellerUrl)
                    if flag is not None and domain is not None:
                        domains[domain] = 1
                '''
                lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
                artifact_status = check_source_artifact(source, sourceId)
                for app in check_itunes_sellers:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId.
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    if app.has_key("sellerUrl"):
                        # if find_link(app["sellerUrl"], source_company_id) or check_source_artifact(source_company_id):
                        # Only adopt the seller URL when it is unambiguous
                        # (exactly one distinct domain) and nothing was saved yet.
                        if artifact_status:
                            pass
                        elif lens_domain == 1:
                            artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)
                            if artifact_id is not None:
                                artifact_status = True
                    # comment due to incorrect expand
                    '''
                    if app.has_key("supportUrl"):
                        if find_link(app["supportUrl"], source_company_id):
                            pass
                        else:
                            save_itunesSupportUrl_artifact(app, source_company_id)
                    '''
                    # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                # save_company_name(app, "sellerName", source_company_id)

        # Step B/3: find more itunes artifacts by domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            # Skip domains known to be shared hosting / too generic.
            if domain in itunesDomainEx:
                continue
            check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
            if len(check_itunes_sellerDomains) > 0:
                lens_company_names = count_company_names(check_itunes_sellerDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_itunes_sellerDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId.
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
            check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
            # Threshold guards against overly common support domains.
            if len(check_itunes_supportDomains) > 0 and len(check_itunes_supportDomains) < 100:
                lens_company_names = count_company_names(check_itunes_supportDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_itunes_supportDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId.
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
        # Discovers more artifacts (websites) and company names; checks against
        # existing source_artifact / source_company_name entries.

        # === Android expansion ===
        # Step C/1: resolve existing android artifacts by apkname.
        logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4050:
                continue
            # Get apkname.
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                # Get apkname of baidu and 360 from android market.
                if apptype != 4050:
                    continue
                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]
            if apkname is not None:
                app = collection_android.find_one({"apkname": apkname})
                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_android(app, artifact, source, sourceId)  # exists: copy from mongo.android
                    set_artifact_active(artifact, "Y", source, sourceId)
                    # chinese, is_company = name_helper.name_check(app["author"])
                    # if is_company:
                    #     save_company_name(app, "author", source_company_id)
            else:
                set_artifact_active(artifact, "N", source, sourceId)

        # Step C/2: find more android artifacts by company name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            check_android_authors = list(collection_android.find({"author": source_company_name["name"]}))
            # Threshold guards against overly prolific authors.
            if len(check_android_authors) > 0 and len(check_android_authors) < 200:
                lens_domain = count_domains(check_android_authors, "website")
                artifact_status = check_source_artifact(source, sourceId)
                # check if author is consistent
                for app in check_android_authors:
                    # Check if AnId have one 4010.
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    if artifact_status:
                        pass
                    elif lens_domain == 1:
                        artifact_id = save_androidWebsite_artifact(app, source, sourceId)
                        if artifact_id is not None:
                            artifact_status = True
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    # save_company_name(app, "author", source_company_id)

        # Step C/3: find more android artifacts by domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
            if len(check_android_websiteDomains) > 0:
                lens_company_names = count_company_names(check_android_websiteDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_android_websiteDomains:
                    # Check if AndroidId is already existed in artifacts.
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
            check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
            # add threshold to avoid case: domain: com.wowotuan
            if len(check_android_apknameDomains) > 0 and len(check_android_apknameDomains) < 100:
                lens_company_names = count_company_names(check_android_apknameDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_android_apknameDomains:
                    # Check if AndroidId is already existed in artifacts.
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
        # Discovers more artifacts (websites) and company names.
        # Former names: TODO

        # === Website artifact cleanup ===
        # Fetch meta info, mark unreachable websites, follow redirects.
        logger.info("source: %s, sourceId: %s website meta", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["link"] is None or artifact["link"].strip() == "":
                # set_active("source_artifact", "N", artifact["id"])
                set_artifact_active(artifact, "N", source, sourceId)
                continue
            url = artifact["link"].strip()
            meta = collection_website.find_one({"url": url})
            # Re-fetch when unknown or previously 404.
            if meta is None or meta["httpcode"] == 404:
                meta = website.get_meta_info(url)
                if meta:
                    websiteId = save_collection_website(collection_website, meta)
                    if websiteId is not None and not test:
                        # screenshot_wesbite(collection_website, websiteId, screenshot_crawler)
                        pass
                else:
                    # Fetch failed entirely: record as 404 and deactivate.
                    meta = {
                        "url": artifact["link"],
                        "httpcode": 404
                    }
                    websiteId = save_collection_website(collection_website, meta)
                    set_artifact_active(artifact, "N", source, sourceId)
            if meta:
                # Redirect happened?
                # logger.info(meta)
                if meta["httpcode"] == 200:
                    redirect_url = meta.get("redirect_url")
                    if artifact["link"] != redirect_url:
                        # Record the redirect target as its own website entry.
                        url = url_helper.url_normalize(meta["redirect_url"])
                        (flag_new, domain_new) = url_helper.get_domain(url)
                        meta_new = {
                            "url": url,
                            "domain": domain_new if flag_new is True else None,
                            "redirect_url": url,
                            "title": meta["title"],
                            "tags": meta["tags"],
                            "description": meta["description"],
                            "httpcode": 200
                        }
                        websiteId_new = save_collection_website(collection_website, meta_new)
                        if websiteId_new is not None and not test:
                            # screenshot_wesbite(collection_website, websiteId_new, screenshot_crawler)
                            pass
                        flag, domain = url_helper.get_domain(artifact["link"])
                        if domain_new != domain:
                            # Redirect left the original domain.
                            set_artifact_active(artifact, "Redirect", source, sourceId)
                        else:
                            if flag is True:
                                # Original link is a 'good' address.
                                set_artifact_active(artifact, "Y", source, sourceId)
                            else:
                                if flag_new is True:
                                    # Redirect target is the 'good' address.
                                    set_artifact_active(artifact, "Redirect", source, sourceId)
                                    save_website_artifact(meta_new, source, sourceId)
                                else:
                                    set_artifact_active(artifact, "Y", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                elif meta["httpcode"] == 404:
                    set_artifact_active(artifact, "N", source, sourceId)

        # verify -> source_artifacts/source_company_name set verify:
        # mark everything processed this round as expanded.
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)
        round += 1
# tot += 1 shortname = names[1] fullName = names[2] if fullName is None or fullName.strip() == "": fullName = names[3] if fullName is None or fullName.strip() == "": logger.info(line) # logger.info("er1") continue # exit() # fullNames = [name_helper.company_name_normalize(unicode(name[2])), name_helper.company_name_normalize(unicode(name[3]))] fullNames = [] for fn in [names[2], names[3]]: if fn is not None and fn.strip() != "": fullNames.append( name_helper.company_name_normalize(unicode(fn))) fullName = name_helper.company_name_normalize(unicode(fullName)) roundstr = names[4] inv = names[5] fdate = names[6] investor = names[7] if investor is not None: investor = investor.split("/")[0] if investor is None or investor.strip() == "": if names[8] is not None: investor = names[8].split("/")[0] if investor is None: logger.info(line) logger.info("er2")
def process(crawler, url, apkname, content):
    """Parse one wandoujia app page and persist it.

    url/apkname identify the app; content is the raw page (bytes).
    When the page has real content, extracts app metadata (name, icon,
    screenshots, size, version, author, download count, ...), saves it via
    android.save/android.merge and flags the app as processed+found in
    collection_android; otherwise only flags it processed+not-found.
    `crawler` is currently unused here.
    """
    # logger.info(content)
    if has_content(content, apkname):
        logger.info("hereherehere")
        #content = content.decode('utf-8')
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        #content = unicode(content, encoding="utf-8", errors='replace')
        #d = pq(content)
        name = d('span.title').text()
        # logger.info("name: %s",name)
        icon = d('div.app-icon> img').attr("src")
        brief = d('p.tagline').text()
        # logger.info(brief)
        commentbyeditor = d('div.editorComment> div').text()
        #logger.info(editor_comment)
        screenshots = []
        imgs = d('div.overview> img')
        # logger.info(imgs)
        for img in imgs:
            imgurl = pq(img).attr("src")
            screenshots.append(imgurl)
        desc = d('div.desc-info> div').text()
        # logger.info(desc)
        updates = d('div.change-info> div').text()
        # logger.info(update_desc)
        # Size: plain byte count, else "xx KB"/"xx MB" text, else None.
        try:
            size = int(d('meta[itemprop="fileSize"]').attr("content"))
        except:
            size = d('meta[itemprop="fileSize"]').attr("content")
            if size.find("KB") >= 0:
                size = int(float(size.replace("KB", "").strip()) * 1024)
            elif size.find("MB") >= 0:
                size = int(float(size.replace("MB", "").strip()) * 1024 * 1024)
            else:
                size = None
        tags = d('dd.tag-box >a').text().replace(" ", ",")
        # Update date in the Chinese "YYYY年MM月DD日" format.
        datestr = d('time#baidu_time').text()
        updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")
        #versionname = d(':contains("版本")').next()
        #logger.info(versionname)
        # Normalize the author name only when it looks like a Chinese company.
        author = d('span.dev-sites').text()
        chinese, is_company = name_helper.name_check(author)
        if chinese and is_company:
            author = name_helper.company_name_normalize(author)
        try:
            website = d('a.dev-sites').attr("href")
            website = url_helper.url_normalize(website)
        except:
            website = None
        # Compatibility string lives in raw HTML; the pattern depends on
        # whether the "查看权限要求" (permissions) block is present.
        compatibility = None
        if content.find("查看权限要求") == -1:
            r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
        else:
            r1 = "content=\"Android\">(.*?)<div>.*"
        result1 = util.re_get_result(r1, content)
        if result1:
            (compatibility,) = result1
            # NOTE(review): .replace("\s","") removes the literal two chars
            # backslash-s, not whitespace — probably intended \s via regex.
            compatibility = compatibility.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "")
        #logger.info(compatibility)
        # Version string, e.g. "V1.2.3": strip the leading V marker.
        versionname = None
        r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
        result2 = util.re_get_result(r2, content)
        if result2:
            (versionname,) = result2
            versionname = versionname.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "").strip()
        #logger.info(versionname)
        try:
            versionname = versionname.split()[0]
            if versionname.startswith("V"):
                versionname = versionname.replace("V", "")
        except:
            pass
        # Download count: plain int, or "万" (10^4) / "亿" (10^8) suffixed.
        # download = int(d("i[itemprop='interactionCount']").attr("content").split(":")[1])
        dnum = d("i[itemprop='interactionCount']").attr("content").split(":")[1]
        download = None
        try:
            download = int(dnum)
        except:
            if dnum.find("万") >= 0:
                download = int(float(dnum.replace("万", "").strip()) * 10000)
            elif dnum.find("亿") >= 0:
                download = int(float(dnum.replace("亿", "").strip()) * 10000 * 10000)
            else:
                logger.info("********download :%s cannot get", dnum)
        item = {
            "link": url,
            "apkname": apkname,
            "appmarket": APPMARKET,
            "name": name,
            "brief": brief,
            "website": website,
            "description": desc,
            "commentbyeditor": commentbyeditor,
            "updateDate": updatedate,
            "language": None,
            "tags": tags,
            "version": versionname,
            "updates": updates,
            "size": size,
            "compatibility": compatibility,
            "icon": icon,
            "author": author,
            "screenshots": screenshots,
            "type": None,
            "key": apkname,
            "key_int": None,
            "download": download,
        }
        logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
        android.save(collection, APPMARKET, item)
        android.merge(item)
        collection_android.update_one({"apkname": apkname}, {"$set": {"wandoujiaprocessed": True, "wandoujiafound": True}})
    else:
        logger.info("App: %s has no content", apkname)
        #logger.info(content)
        collection_android.update_one({"apkname": apkname}, {"$set": {"wandoujiaprocessed": True, "wandoujiafound": False}})