def start_run(concurrent_num):
    """Seed the shared raw_urls queue from two sources (beian mongo
    collection and website-type artifacts in mysql), then fan out
    `concurrent_num` gevent workers running run()."""
    global raw_urls
    logger.info("website start...")

    # Domains recorded in the beian collection: prefix the usual www host.
    for doc in beian_collection.find({}):
        domain = doc["domain"]
        if domain is None or domain == "":
            continue
        seed = "http://www." + domain
        logger.info(seed)
        raw_urls.append(seed)

    # Website artifacts (type 4010) from mysql: normalize before queueing.
    conn = db.connect_torndb()
    for row in conn.query("select * from artifact where type=4010"):
        link = row["link"]
        if link is None or link == "":
            continue
        link = url_helper.url_normalize(link)
        logger.info(link)
        raw_urls.append(link)
    conn.close()

    workers = [gevent.spawn(run) for _ in xrange(concurrent_num)]
    gevent.joinall(workers)
    logger.info("website end.")
def insert(shortname, name, brief, website):
    """Save one yitai company (source 13100), its two name aliases, and —
    when the website is a plain site link (type 4010) — a website artifact.

    Args:
        shortname: company short name.
        name: company full name ("(开业)" suffix is stripped).
        brief: one-line description.
        website: raw website URL, may be None/blank.
    """
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId, brief)
    logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, name, 12010)
    parser_db_util.save_source_company_name(sid, shortname, 12020)
    if website is not None and website.strip() != "":
        website = url_helper.url_normalize(website)
        if website is not None and website != "":
            # BUG FIX: second test used to be `website.find("https://")`
            # (truthy for -1), so the https check was effectively inverted;
            # both schemes must be ABSENT before prefixing "http://".
            if website.find("http://") == -1 and website.find("https://") == -1:
                website = "http://" + website
            type, market, app_id = url_helper.get_market(website)
            if type == 4010:
                if website.find('sse.com') > 0:
                    pass  # skip stock-exchange pages; not a company site
                else:
                    artifact = {
                        "sourceCompanyId": sid,
                        "name": shortname,
                        "description": None,
                        "link": website,
                        "domain": app_id,
                        "type": type
                    }
                    parser_db_util.save_artifacts_standard(sid, [artifact])
def run():
    # Worker loop: drain the shared raw_urls queue, fetch page meta info
    # for each URL, and persist the result into the mongo `collection`.
    global raw_urls
    while True:
        if len(raw_urls) == 0:
            return  # queue drained — worker exits
        url = raw_urls.pop(0)
        item = collection.find_one({"url": url})
        if item is not None:
            continue  # already crawled by this or another worker
        flag, domain = url_helper.get_domain(url)
        result = website.get_meta_info(url)
        logger.info(url)
        logger.info(
            json.dumps(result, ensure_ascii=False, cls=util.CJsonEncoder))
        if result is None:
            # Fetch failed: store a 404 stub so the URL is not retried.
            result = {"url": url, "httpcode": 404}
        else:
            if result["url"] != result["redirect_url"]:
                new_url = url_helper.url_normalize(result["redirect_url"])
                flag1, domain1 = url_helper.get_domain(new_url)
                if domain != domain1:
                    # Redirect landed on a different domain: crawl it too.
                    raw_urls.append(new_url)
        result["createTime"] = datetime.datetime.now()
        result["modifyTime"] = result["createTime"]
        try:
            collection.insert(result)
        except:
            pass  # duplicate-key races between concurrent workers expected
def get_meta_info(url):
    # Fetch `url` (gzip-aware, Safari UA) and extract title / keywords /
    # description meta tags. Returns a dict with httpcode 200 on success,
    # or None for a malformed URL or after 3 failed attempts.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
    headers = {
        'User-Agent': user_agent,
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip'
    }
    try:
        request = urllib2.Request(url, None, headers)
    except:
        return None  # malformed URL
    opener = urllib2.build_opener()
    retries = 0
    while True:
        try:
            r = opener.open(request, timeout=17)
            # We advertise gzip above, so decompress when the server used it.
            if r.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(r.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
            else:
                data = r.read()
            content = util.html_encode(data)
            redirect_url = url_helper.url_normalize(r.geturl())
            d = pq(html.fromstring(content))
            title = d("title").text()
            # Meta names appear with either casing in the wild.
            keywords = d("meta[name='keywords']").attr("content")
            if keywords is None:
                keywords = d("meta[name='Keywords']").attr("content")
            description = d("meta[name='description']").attr("content")
            if description is None:
                description = d("meta[name='Description']").attr("content")
            flag, domain = url_helper.get_domain(url)
            if flag is not True:
                domain = None
            return {
                "url": url,
                "redirect_url": redirect_url,
                "domain": domain,
                "title": title,
                "tags": keywords,
                "description": description,
                "httpcode": 200
            }
            break  # NOTE(review): unreachable — follows the return above
        except:
            # Any failure (network, parse) counts toward the retry budget.
            retries += 1
            if retries >= 3:
                return None
    return None  # NOTE(review): unreachable — loop only exits via return
def save(collection_market, appmarket, item):
    # Upsert one crawled app record into `collection_market`, keyed by
    # (appmarket, apkname). A strictly newer version replaces the doc and
    # archives the old one under `histories`; an equal/blank version just
    # refreshes fields in place.
    item["website"] = url_helper.url_normalize(item["website"])
    flag, domain = url_helper.get_domain(item["website"])
    if flag:
        item["website_domain"] = domain
    else:
        item["website_domain"] = None
    # Reverse the apk package name (com.foo.bar -> bar.foo.com) to guess
    # the vendor's web domain.
    temp = "http://" + ".".join(item["apkname"].split(".")[::-1])
    flag, domain = url_helper.get_domain(temp)
    item["apkname_domain"] = domain
    record = collection_market.find_one(
        {
            "appmarket": appmarket,
            "apkname": item["apkname"]
        },
        projection={'histories': False})
    if record:
        _id = record.pop("_id")
        # Drop bookkeeping fields so they are not archived into histories.
        record.pop("key")
        record.pop("key_int")
        if item["version"] is not None and item["version"].strip() != "":
            if record["version"] is not None and record["version"].strip(
            ) != "" and LooseVersion(item["version"]) > LooseVersion(
                    record["version"]):
                # Newer version: replace, keep original createTime,
                # push the previous doc into histories.
                item["createTime"] = record["createTime"]
                item["modifyTime"] = datetime.datetime.now()
                if item["updateDate"] is None:
                    item["updateDate"] = datetime.datetime.now()
                collection_market.update_one({"_id": _id}, {
                    '$set': item,
                    '$addToSet': {
                        "histories": record
                    }
                })
            elif record["version"] is None or record["version"].strip(
            ) == "" or LooseVersion(item["version"]) == LooseVersion(
                    record["version"]):
                # Same (or previously unknown) version: refresh in place.
                item["modifyTime"] = datetime.datetime.now()
                collection_market.update_one({"_id": _id}, {'$set': item})
    else:
        # First sighting of this app in this market.
        item["createTime"] = datetime.datetime.now()
        item["modifyTime"] = item["createTime"]
        if item["updateDate"] is None:
            item["updateDate"] = datetime.datetime.now()
        try:
            collection_market.insert(item)
        except Exception, e:
            logger.info(e)
def parse_base(item):
    """Map one crawled record onto the standard source-company dict.

    Returns None when `item` is None; otherwise a dict with fixed
    defaults plus a single website artifact built from the record.
    """
    if item is None:
        return None
    content = item["content"]
    product = content["name"]
    site_artifact = {
        "name": product,
        "desc": content["desc"],
        "link": url_helper.url_normalize(content["website"])
    }
    company = {
        "shortName": product,
        "fullName": None,
        "productName": product,
        "description": None,
        "brief": content["desc"],
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": item["key"],
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": content["score"],
        "artifacts": [site_artifact]
    }
    return company
def process(item):
    # Link one deal_artifact_new row to a source_artifact under a
    # source_company (source 13001, keyed by dealId), creating both rows
    # when missing. Marks the row proceed='F' when the deal is unusable.
    logger.info("process: %s, %s", item["id"], item["name"])
    deal_id = item["dealId"]
    if deal_id is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return
    conn = db.connect_torndb()
    deal = conn.get("select * from deal where id=%s", deal_id)
    conn.close()
    if deal is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return
    company_id = deal["companyId"]
    conn = db.connect_torndb()
    sc = conn.get(
        "select * from source_company where companyId=%s and source=13001 and sourceId=%s",
        company_id, str(deal_id))
    if sc is None:
        # processStatus=2 marks the new source_company as pending.
        source_company_id = conn.insert(
            "insert source_company(companyId,source,sourceId,createTime,processStatus) "
            "values(%s,%s,%s,now(),%s)", company_id, 13001, str(deal_id), 2)
    else:
        source_company_id = sc["id"]
    if item["sourceArtifactId"] is None:
        link = item["link"]
        domain = None
        # Only website artifacts (4010) get a normalized link + domain.
        if item["type"] == 4010:
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is False:
                domain = None
        sourceArtifactId = conn.insert(
            "insert source_artifact(sourceCompanyId,name,description,link,domain,type,createTime) "
            "values(%s,%s,%s,%s,%s,%s,now())", source_company_id,
            item["name"], item["description"], link, domain, item["type"])
        conn.update(
            "update deal_artifact_new set sourceArtifactId=%s, proceed='Y' where id=%s",
            sourceArtifactId, item["id"])
    # Flag the source_company for (re)processing downstream.
    conn.update("update source_company set processStatus=0 where id=%s",
                source_company_id)
    conn.close()
def parser(item):
    """Parse an investor detail page into its core fields.

    Returns (key, name, logo, website, stageStr, fieldsStr, desc),
    or None when `item` is None or the page has no usable investor name.
    """
    if item is None:
        return None
    investor_key = item["key"]
    html = item["content"]
    d = pq(html)
    investor_name = d('div.picinfo> p> span.title').text()
    investor_name = name_helper.company_name_normalize(investor_name)
    # BUG FIX: the None guard must run before logging — the old code did
    # `"investor_name: " + investor_name` first, which raised TypeError
    # whenever normalization returned None instead of hitting the guard.
    if investor_name is None:
        logger.info("No investor name!!!")
        return None
    logger.info("investor_name: " + investor_name)
    logo = d('div.pic> img').attr("src")
    if logo is not None:
        logo = logo.strip()
    logger.info("Investor Logo: %s" % logo)
    website = d('span.links >a[target="_black"]').attr("href")
    if website is None or website.strip() == "暂无":
        website = None
    website = url_helper.url_normalize(website)
    flag, domain = url_helper.get_domain(website)
    # NOTE(review): other blocks in this file treat get_domain's flag as
    # True/False, so `flag is None` may never fire — confirm intent.
    if flag is None:
        website = None
    logger.info("Investor website: %s" % website)
    stageStr = d('div.pad.block> div.list-tags.yellow').text().replace(
        " ", ",").strip()
    logger.info("Investor rounds: %s" % stageStr)
    fieldsStr = d('div.pad.block> div.list-tags.darkblue').text().replace(
        " ", ",").strip()
    logger.info("Investor fields: %s" % fieldsStr)
    desc = d('div.des').text().strip()
    logger.info("Investor desc: %s" % desc)
    return investor_key, investor_name, logo, website, stageStr, fieldsStr, desc
def parse_base(item):
    """Build the standard source-company dict, attaching one artifact
    when the normalized website is a plain site (4010) or an app-store
    link (4040/4050) with a resolvable app id."""
    if item is None:
        return None
    content = item["content"]
    link = url_helper.url_normalize(content["website"])
    type, app_market, app_id = url_helper.get_market(link)

    artifacts = []
    is_site = type == 4010
    is_app = (type == 4040 or type == 4050) and app_id
    if is_site or is_app:
        artifacts.append({
            "type": type,
            "name": content["name"],
            "desc": content["desc"],
            "link": link,
            "domain": app_id
        })

    return {
        "shortName": content["name"],
        "fullName": None,
        "productName": content["name"],
        "description": None,
        "brief": content["desc"],
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": item["key"],
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": content["score"],
        "artifacts": artifacts
    }
def parse_artifact(source_company_id, item):
    """Build artifact dicts from a company's baseinfo website field.

    Websites (4010) are kept unless they point at neeq; app-store links
    (4040/4050) are kept when an android package domain resolves.
    Returns a (possibly empty) list of artifact dicts.
    """
    logger.info("parse_artifact")
    c = item["baseinfo"]
    artifacts = []
    website = c.get("website", "").strip()
    website = url_helper.url_normalize(website)
    if website is not None and website != "":
        # BUG FIX: second test used to be `website.find("https://")`
        # (truthy for -1, i.e. inverted); both schemes must be ABSENT
        # before prefixing "http://".
        if website.find("http://") == -1 and website.find("https://") == -1:
            website = "http://" + website
        type, market, app_id = url_helper.get_market(website)
        if type == 4010:
            if website.find('neeq') > 0:
                pass  # skip NEEQ exchange pages; not a company site
            else:
                artifact = {
                    "sourceCompanyId": source_company_id,
                    "name": c["name"],
                    "description": None,
                    "link": website,
                    "domain": app_id,
                    "type": type
                }
                artifacts.append(artifact)
        elif (type == 4040 or type == 4050) and app_id is not None:
            domain = get_android_domain(market, app_id)
            if (type == 4040 or type == 4050) and domain is not None:
                artifact = {
                    "sourceCompanyId": source_company_id,
                    "name": c["name"],
                    "description": None,
                    "link": website,
                    "domain": domain,
                    "type": type
                }
                artifacts.append(artifact)
    return artifacts
def parse_company(item):
    # Parse a zhipin (BOSS直聘) company page into the standard
    # source-company dict (name, stage, head count, artifacts, members).
    # Returns None for a None item, {"status": "No_Name"} for unusable
    # pages, otherwise the populated dict.
    if item is None:
        logger.info("here")
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))
    name = d('h1.name').text().strip()
    fullName = d('div.company-business> h4').text()
    # Strip a "来源 ..." prefix when the full name carries a source tag.
    if fullName.find("来源") >= 0:
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)
    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    # Prefer the shorter of the two as the display name.
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }
    logo = d('div.company-logo> img').attr('src')
    # Keep absolute URLs and placeholder ("default") logos; drop the rest.
    # NOTE(review): startswith("http") already covers "https".
    if logo.startswith("http") or logo.startswith("https") or logo.find("default") >= 0:
        pass
    else:
        logo = None
    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)
    # Reject placeholder / too-short descriptions.
    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace("展开", "").replace(" ", "").strip()
    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    # The info line is "stage headcount field" separated by spaces.
    try:
        lll = d('div.info-primary> p').text().strip()
        if len(lll.split(" ")) == 3:
            field = lll.split(" ")[2]
            stage = lll.split(" ")[0]
            headCount = lll.split(" ")[1]
    except:
        pass
    headCount = headCount.replace("人", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None
    # Map the Chinese funding-stage label onto internal round codes.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0
    # Product links become artifacts, classified by url_helper.get_market.
    links = d('div.company-products> ul> li> div.text> div.name> a')
    artifacts = []
    for linkp in links:
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)
        type, app_market, app_id = url_helper.get_market(website)
        if type == 4010:
            # Skip self-links back to zhipin itself.
            if item["url"] != website and website.find("zhipin") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": None,
                        "link": website,
                        "domain": domain
                    })
        elif type == 4020 or type == 4030:
            # NOTE(review): domain is forced to None, so this branch never
            # appends — looks intentional (weixin/weibo disabled); confirm.
            domain = None
            if domain is not None:
                artifacts.append({
                    "type": type,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif type == 4040:
            domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4040,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif type == 4050:
            # Android: resolve the apk package name via known markets.
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4050,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
    # Team members listed on the page.
    members = []
    lis = d('div.manager-list> div> ul >li> div')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('div.info-user> img').attr('src')
                # Protocol-relative avatar URLs get an explicit scheme.
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p> span.name').text()
                member_position = mem('p> span.job-title').text()
                member_desc = mem('div.item_manager_content').text()
                source_member = {'name': member_name,
                                 'photo_url': logo_url,
                                 'weibo': None,
                                 'location': None,
                                 'role': member_position,
                                 'description': member_desc,
                                 'education': None,
                                 'work': None
                                 }
                members.append(source_member)
            except:
                pass  # skip malformed member entries
    # Secondary source id comes from the first company-tab link.
    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html", "")
    else:
        sourceId2 = None
    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(0),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceId2": sourceId2,
        "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }
    return source_company
def parse_artifact(source_company_id, item):
    # Build artifact dicts from a crunchbase-style record's overview
    # website field, classified via url_helper.get_market. Social links
    # (twitter/linkedin/facebook) are ignored. Returns a list of dicts.
    name = item['name']
    logger.info('parse_artifact:%s' % name)
    artifacts = []
    desc = ''
    descs = item['content']['company_base']['properties']
    if descs.has_key('short_description'):
        desc = descs['short_description']
    of = item['content']['company_base']['overview_fields2']
    if of.has_key('website'):
        website = of['website']['value']
        website = url_helper.url_normalize(website)
        if website is not None and website.find(
                'twitter') == -1 and website.find(
                'linkedin') == -1 and website.find('facebook') == -1:
            type, app_market, app_id = url_helper.get_market(website)
            if type == 4010:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": desc,
                        "link": website,
                        "domain": domain
                    })
            elif type == 4020 or type == 4030:
                # NOTE(review): domain forced to None, so this branch never
                # appends — looks intentionally disabled; confirm.
                domain = None
                if domain is not None:
                    artifacts.append({
                        "type": type,
                        "name": name,
                        "description": desc,
                        "link": website,
                        "domain": domain
                    })
            elif type == 4040:
                domain = app_id
                if domain is not None:
                    artifacts.append({
                        "type": 4040,
                        "name": name,
                        "description": desc,
                        "link": website,
                        "domain": domain
                    })
            elif type == 4050:
                # Android: resolve apk package name via known markets.
                domain = None
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_mongo_util.find_android_market(
                        app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id
                if domain is not None:
                    artifacts.append({
                        "type": 4050,
                        "name": name,
                        "description": desc,
                        "link": website,
                        "domain": domain
                    })
    return artifacts
logger = loghelper.get_logger("prepare_source_artifact_domain")

if __name__ == "__main__":
    # Backfill script: page through source_artifact 1000 rows at a time
    # and populate missing `domain` (and corrected `type`) columns.
    start = 0
    conn = db.connect_torndb()
    while True:
        items = list(conn.query("select * from source_artifact order by id limit %s,1000", start))
        for item in items:
            if item["domain"] is not None and item["domain"].strip() != "":
                continue  # domain already present
            if item["type"] == 4010:
                # Website artifact: derive domain from the normalized link.
                link = url_helper.url_normalize(item["link"])
                (flag, domain) = url_helper.get_domain(link)
                if flag is True:
                    logger.info("%s, %s %s %s", item["id"], item["type"], link, domain)
                    conn.update("update source_artifact set domain=%s where id=%s", domain, item["id"])
            elif item["type"] == 4040 or item["type"] == 4050:
                # App-store artifact: re-classify and use track id as domain.
                (apptype, appmarket, trackid) = url_helper.get_market(item["link"])
                if (apptype == 4040 or apptype == 4050) and trackid is not None:
                    logger.info("%s %s %s %s", item["id"], apptype, item["link"], trackid)
                    conn.update("update source_artifact set type=%s, domain=%s where id=%s", apptype, trackid, item["id"])
        start += 1000
        if len(items) == 0:
            break  # past the last page
    conn.close()
def run():
    # Worker: drain the global APPS queue. For each trackId, look the app
    # up via the iTunes lookup API, scrape extra fields (developer,
    # support URL, related apps, user comments) from the store page, then
    # upsert into mongo market.itunes (older versions archived under
    # `histories`) and mark the itunes_index entry processed.
    crawler = ItunesCrawler()
    while True:
        if len(APPS) == 0:
            return  # queue drained
        item = APPS.pop(0)
        mongo = db.connect_mongo()
        record = mongo.market.itunes.find_one({"trackId": item["trackId"]},
                                              projection={'histories': False})
        mongo.close()
        if record is not None:
            # Already stored: just mark the index row processed.
            mongo = db.connect_mongo()
            mongo.market.itunes_index.update({"_id": item["_id"]},
                                             {"$set": {
                                                 "processed": True
                                             }})
            mongo.close()
            continue
        # Step 1: structured metadata from the lookup API (retry forever
        # until the crawl itself succeeds).
        url = "https://itunes.apple.com/cn/lookup?id=%s" % item["trackId"]
        data = None
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                rjson = json.loads(result["content"])
                if rjson["resultCount"] > 0:
                    data = rjson["results"][0]
                break
        if data is None:
            # App unknown to the API: mark processed and move on.
            mongo = db.connect_mongo()
            mongo.market.itunes_index.update({"_id": item["_id"]},
                                             {"$set": {
                                                 "processed": True
                                             }})
            mongo.close()
            continue
        # Step 2: scrape the HTML store page for fields the API lacks.
        url = item["trackViewUrl"]
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(result["content"])
                developer = d(".product-header__identity> a").text()
                if developer is not None:
                    developer = developer.replace("开发商:", "")
                data["developer"] = developer
                # First link whose title ends with "支持" is the support URL.
                supportUrl = None
                links = d('li.t-subbody>a.targeted-link.link.icon')
                for i in links:
                    title = pq(i).text().strip()
                    if title.endswith("支持"):
                        supportUrl = pq(i).attr('href').strip()
                        break
                data["supportUrl"] = url_helper.url_normalize(supportUrl)
                # "Customers also bought": collect related track ids.
                relatedApps = []
                try:
                    apps = d('div.l-row.l-row--peek> a')
                    for app in apps:
                        appurl = pq(app).attr('href')
                        r = util.re_get_result('/id(\d*)', appurl)
                        if r is not None:
                            track_id, = r
                            try:
                                app_id = int(track_id)
                                relatedApps.append(int(app_id))
                            except:
                                pass
                except:
                    pass
                data["relatedApps"] = relatedApps
                # User reviews (new-style ember markup).
                userComments = []
                cdivs = d('div.l-row.l-row--peek> div.ember-view')
                for cdiv in cdivs:
                    c = pq(cdiv)
                    try:
                        c_title = c(
                            'div.we-customer-review> div.we-customer-review__header> h3'
                        ).eq(1).text().strip()
                        c_commentator = c('div.we-customer-review__user').eq(
                            1).text().replace("评论人:", "").strip()
                        c_content = c('p.we-customer-review__body').attr(
                            "aria-label")
                        comment = {
                            "title": c_title,
                            "commentator": c_commentator,
                            "content": c_content
                        }
                        userComments.append(comment)
                    except:
                        pass  # skip malformed review blocks
                logger.info(
                    json.dumps(userComments, ensure_ascii=False,
                               cls=util.CJsonEncoder))
                data["userComments"] = userComments
                break
            elif result['get'] == 'fail' and result["content"] is not None:
                # Permanent store-side error: give up on the page scrape.
                if result["content"].find(
                        "Your request produced an error.") >= 0:
                    break
        # Derive domains from the support and seller URLs when present.
        if data.has_key("supportUrl") and data["supportUrl"] is not None:
            flag, domain = url_helper.get_domain(data["supportUrl"])
            if flag:
                data["supportDomain"] = domain
            else:
                data["supportDomain"] = None
        if data.has_key("sellerUrl") and data["sellerUrl"] is not None:
            data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
            flag, domain = url_helper.get_domain(data["sellerUrl"])
            if flag:
                data["sellerDomain"] = domain
            else:
                data["sellerDomain"] = None
        short_name = name_helper.get_short_name(data["trackName"])
        data["trackShortName"] = short_name
        logger.info(json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))
        # Step 3: upsert — newer version replaces and archives the old doc.
        mongo = db.connect_mongo()
        record = mongo.market.itunes.find_one({"trackId": data["trackId"]},
                                              projection={'histories': False})
        if record:
            _id = record.pop("_id")
            if LooseVersion(data["version"]) > LooseVersion(record["version"]):
                data["createTime"] = record["createTime"]
                data["modifyTime"] = datetime.datetime.now()
                mongo.market.itunes.update_one({"_id": _id}, {
                    '$set': data,
                    '$addToSet': {
                        "histories": record
                    }
                })
        else:
            data["createTime"] = datetime.datetime.now()
            data["modifyTime"] = data["createTime"]
            mongo.market.itunes.insert(data)
        mongo.market.itunes_index.update({"_id": item["_id"]},
                                         {"$set": {
                                             "processed": True
                                         }})
        mongo.close()
def parse_artifact(item):
    # Build artifact dicts from a company_base record: website, weibo,
    # weixin, and iPhone/iPad/Android store links. sourceCompanyId is
    # left None for the caller to fill in. Returns a list of dicts.
    logger.info("parse_artifact")
    company_key = item["key"]
    c = item["content"]["company_base"]["data"]["company"]
    artifacts = []
    # --- website ---
    website = c.get("website", "").strip()
    website = url_helper.url_normalize(website)
    if website is not None and website != "":
        type, market, app_id = url_helper.get_market(website)
        if type == 4010:
            # Skip 36kr self-links unless the company itself is "36…".
            if website.find('36kr.com') > 0 and c["name"].find('36') == -1:
                pass
            else:
                artifact = {
                    "sourceCompanyId": None,
                    "name": c["name"],
                    "description": None,
                    "link": website,
                    "domain": app_id,
                    "type": type
                }
                artifacts.append(artifact)
        elif (type == 4040 or type == 4050) and app_id is not None:
            domain = get_android_domain(market, app_id)
            if (type == 4040 or type == 4050) and domain is not None:
                artifact = {
                    "sourceCompanyId": None,
                    "name": c["name"],
                    "description": None,
                    "link": website,
                    "domain": domain,
                    "type": type
                }
                artifacts.append(artifact)
    # --- weibo (4030) ---
    weibo = c.get("weibo", "").strip()
    if weibo is not None and weibo != "":
        artifact = {
            "sourceCompanyId": None,
            "name": c["name"],
            "description": None,
            "link": weibo,
            "domain": None,
            "type": 4030
        }
        artifacts.append(artifact)
    # --- weixin (4020) ---
    weixin = c.get("weixin", "").strip()
    if weixin is not None and weixin != "":
        artifact = {
            "sourceCompanyId": None,
            "name": c["name"],
            "description": None,
            "link": weixin,
            "domain": None,
            "type": 4020
        }
        artifacts.append(artifact)
    # --- app-store links ---
    # NOTE(review): get_android_domain is also applied to the iOS
    # appstore links below — confirm it handles iOS track ids too.
    iphoneAppstoreLink = c.get("iphoneAppstoreLink", "").strip()
    if iphoneAppstoreLink is not None and iphoneAppstoreLink != "":
        type, market, app_id = url_helper.get_market(iphoneAppstoreLink)
        domain = get_android_domain(market, app_id)
        if (type == 4040 or type == 4050) and domain is not None:
            artifact = {
                "sourceCompanyId": None,
                "name": c["name"],
                "description": None,
                "link": iphoneAppstoreLink,
                "domain": domain,
                "type": type
            }
            artifacts.append(artifact)
    ipadAppstoreLink = c.get("ipadAppstoreLink", "").strip()
    if ipadAppstoreLink is not None and ipadAppstoreLink != "":
        type, market, app_id = url_helper.get_market(ipadAppstoreLink)
        domain = get_android_domain(market, app_id)
        if (type == 4040 or type == 4050) and domain is not None:
            artifact = {
                "sourceCompanyId": None,
                "name": c["name"],
                "description": None,
                "link": ipadAppstoreLink,
                "domain": domain,
                "type": type
            }
            artifacts.append(artifact)
    androidLink = c.get("androidLink", "").strip()
    if androidLink is not None and androidLink != "":
        type, market, app_id = url_helper.get_market(androidLink)
        domain = get_android_domain(market, app_id)
        if (type == 4040 or type == 4050) and domain is not None:
            artifact = {
                "sourceCompanyId": None,
                "name": c["name"],
                "description": None,
                "link": androidLink,
                "domain": domain,
                "type": type
            }
            artifacts.append(artifact)
    return artifacts
def parse_artifact(source_company_id, item):
    # Build artifact dicts from BOTH the company_base and companyProduct
    # records; `links` de-duplicates across the two sources. Covers
    # website, weibo, weixin, and iOS/Android store links.
    logger.info("parse_artifact")
    company_key = item["key"]
    cc = item["content"]["company_base"]["data"]
    cp = item["content"]["product"]["data"]["companyProduct"]
    artifacts = []
    links = []  # already-emitted links, to avoid duplicates between cc/cp
    for c in [cc, cp]:
        # --- website ---
        website = c.get("website", "").strip()
        website = url_helper.url_normalize(website)
        if website is not None and website != "" and website not in links:
            type, market, app_id = url_helper.get_market(website)
            if type == 4010:
                # Skip 36kr self-links unless the company itself is "36…".
                if website.find('36kr.com') > 0 and c["name"].find('36') == -1:
                    pass
                else:
                    artifact = {
                        "sourceCompanyId": source_company_id,
                        "name": c["name"],
                        "description": None,
                        "link": website,
                        "domain": app_id,
                        "type": type
                    }
                    artifacts.append(artifact)
                    links.append(website)
            elif (type == 4040 or type == 4050) and app_id is not None:
                domain = get_android_domain(market, app_id)
                if (type == 4040 or type == 4050) and domain is not None:
                    artifact = {
                        "sourceCompanyId": source_company_id,
                        "name": c["name"],
                        "description": None,
                        "link": website,
                        "domain": domain,
                        "type": type
                    }
                    artifacts.append(artifact)
                    links.append(website)
        # --- weibo (4030): only accepted when it looks like a weibo URL ---
        weibo = c.get("weibo", "").strip()
        if weibo is not None and weibo != "" and weibo.find(
                "weibo") >= 0 and weibo not in links:
            artifact = {
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": weibo,
                "domain": None,
                "type": 4030
            }
            artifacts.append(artifact)
            links.append(weibo)
        # --- weixin (4020) ---
        # NOTE(review): here the weixin handle is stored as `domain`,
        # unlike the sibling parser that stores None — confirm intended.
        weixin = c.get("weixin", "").strip()
        if weixin is not None and weixin != "" and weixin not in links:
            artifact = {
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": weixin,
                "domain": weixin,
                "type": 4020
            }
            artifacts.append(artifact)
            links.append(weixin)
        # --- iOS store link ---
        # NOTE(review): get_android_domain applied to an iOS link —
        # confirm it resolves iOS track ids as well.
        iphoneAppstoreLink = c.get("ios", "").strip()
        if iphoneAppstoreLink is not None and iphoneAppstoreLink != "" and iphoneAppstoreLink not in links:
            type, market, app_id = url_helper.get_market(iphoneAppstoreLink)
            domain = get_android_domain(market, app_id)
            if (type == 4040 or type == 4050) and domain is not None:
                artifact = {
                    "sourceCompanyId": source_company_id,
                    "name": c["name"],
                    "description": None,
                    "link": iphoneAppstoreLink,
                    "domain": domain,
                    "type": type
                }
                artifacts.append(artifact)
                links.append(iphoneAppstoreLink)
        # --- Android store link ---
        androidLink = c.get("android", "").strip()
        if androidLink is not None and androidLink != "" and androidLink not in links:
            type, market, app_id = url_helper.get_market(androidLink)
            domain = get_android_domain(market, app_id)
            if (type == 4040 or type == 4050) and domain is not None:
                artifact = {
                    "sourceCompanyId": source_company_id,
                    "name": c["name"],
                    "description": None,
                    "link": androidLink,
                    "domain": domain,
                    "type": type
                }
                artifacts.append(artifact)
                links.append(androidLink)
    return artifacts
def process(crawler, url, apkname, content):
    # Parse one wandoujia app-detail page into an item dict and persist it
    # via android.save/android.merge. Always flips wandoujiaprocessed on
    # the index row; wandoujiafound records whether the page had content.
    if has_content(content, apkname):
        logger.info("hereherehere")
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        name = d('span.title').text()
        icon = d('div.app-icon> img').attr("src")
        brief = d('p.tagline').text()
        commentbyeditor = d('div.editorComment> div').text()
        screenshots = []
        imgs = d('div.overview> img')
        for img in imgs:
            imgurl = pq(img).attr("src")
            screenshots.append(imgurl)
        desc = d('div.desc-info> div').text()
        updates = d('div.change-info> div').text()
        # File size is either a plain byte count or a "KB"/"MB" string.
        try:
            size = int(d('meta[itemprop="fileSize"]').attr("content"))
        except:
            size = d('meta[itemprop="fileSize"]').attr("content")
            if size.find("KB") >= 0:
                size = int(float(size.replace("KB", "").strip()) * 1024)
            elif size.find("MB") >= 0:
                size = int(float(size.replace("MB", "").strip()) * 1024 * 1024)
            else:
                size = None
        tags = d('dd.tag-box >a').text().replace(" ", ",")
        datestr = d('time#baidu_time').text()
        updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")
        author = d('span.dev-sites').text()
        chinese, is_company = name_helper.name_check(author)
        if chinese and is_company:
            author = name_helper.company_name_normalize(author)
        try:
            website = d('a.dev-sites').attr("href")
            website = url_helper.url_normalize(website)
        except:
            website = None
        # Android compatibility string: the page layout differs depending
        # on whether the permissions block is present.
        compatibility = None
        if content.find("查看权限要求") == -1:
            r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
        else:
            r1 = "content=\"Android\">(.*?)<div>.*"
        result1 = util.re_get_result(r1, content)
        if result1:
            (compatibility,) = result1
            compatibility = compatibility.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "")
        # Version string, cleaned of whitespace and a leading "V".
        versionname = None
        r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
        result2 = util.re_get_result(r2, content)
        if result2:
            (versionname,) = result2
            versionname = versionname.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "").strip()
            try:
                versionname = versionname.split()[0]
                if versionname.startswith("V"):
                    versionname = versionname.replace("V", "")
            except:
                pass
        # Download count may be a plain int or use 万/亿 suffixes.
        dnum = d("i[itemprop='interactionCount']").attr("content").split(":")[1]
        download = None
        try:
            download = int(dnum)
        except:
            if dnum.find("万") >= 0:
                download = int(float(dnum.replace("万", "").strip()) * 10000)
            elif dnum.find("亿") >= 0:
                download = int(float(dnum.replace("亿", "").strip()) * 10000 * 10000)
            else:
                logger.info("********download :%s cannot get", dnum)
        item = {
            "link": url,
            "apkname": apkname,
            "appmarket": APPMARKET,
            "name": name,
            "brief": brief,
            "website": website,
            "description": desc,
            "commentbyeditor": commentbyeditor,
            "updateDate": updatedate,
            "language": None,
            "tags": tags,
            "version": versionname,
            "updates": updates,
            "size": size,
            "compatibility": compatibility,
            "icon": icon,
            "author": author,
            "screenshots": screenshots,
            "type": None,
            "key": apkname,
            "key_int": None,
            "download": download,
        }
        logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
        android.save(collection, APPMARKET, item)
        android.merge(item)
        collection_android.update_one({"apkname": apkname},
                                      {"$set": {"wandoujiaprocessed": True,
                                                "wandoujiafound": True}})
    else:
        logger.info("App: %s has no content", apkname)
        collection_android.update_one({"apkname": apkname},
                                      {"$set": {"wandoujiaprocessed": True,
                                                "wandoujiafound": False}})
def parse_base(item):
    # Parse an itjuzi company detail page into a normalized company dict.
    #
    # Params:
    #   item: dict with "key" (source id) and "content" (page HTML),
    #         or None.
    # Returns:
    #   dict of company fields + "artifacts" list, or None when item is None
    #   or no usable name could be extracted.
    #
    # Fix vs previous revision: `if brief.find("暂未收录"):` treated the -1
    # returned for "not found" as truthy, wiping `brief` on almost every
    # page; replaced with a membership test.  Local `str` was also renamed
    # to `txt` to stop shadowing the builtin.
    if item is None:
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)
    company_short_name = ""
    # Title may live in <h1> or <b>; strip child elements first.
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if product_name is None or product_name.strip() == "":
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    # Title format is "product/short-company-name".
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)
    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    if company_name is None or company_name.strip() == "":
        try:
            company_name = d('div.des-more> h2').text().strip()
        except:
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)
    if company_short_name == "" and company_name == "":
        return
    # Establish date, shown as "YYYY.MM"; bad months fall back to January.
    establish_date = None
    txt = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result('(\d*)\.(\d*)', txt)
    if result != None:
        (year, month) = result
        try:
            if int(month) > 12:
                month = "1"
        except:
            month = "1"
        establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
    logger.info("establish date: %s" % establish_date)
    # Location, shown as "province·city"; try city first, then province,
    # then fall back to guessing from the company name.
    locationId = 0
    txt = d('span.loca').text().strip()
    #logger.info(txt)
    result = util.re_get_result(u'(.*?)·(.*?)$', txt)
    if result != None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))
        locationId = 0
        result = parser_db_util.get_location(city)
        if result != None:
            locationId = result["locationId"]
        else:
            result = parser_db_util.get_location(province)
            if result != None:
                locationId = result["locationId"]
    if locationId == 0:
        loc1, loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            result = parser_db_util.get_location(loc1)
            if result != None:
                locationId = result["locationId"]
    logger.info("locationId: %d" % locationId)
    # Company status: 2010 = active, 2020 = closed ("已关闭").
    company_status = 2010
    txt = d('div.des-more> div').eq(2).text().strip()
    if txt == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)
    # Funding need flag; both "needs funding" and "seeking acquisition"
    # map to 8020.
    funding_type = 0
    txt = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % txt)
    if txt == "融资需求 · 需要融资":
        funding_type = 8020
    elif txt == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)
    try:
        brief = d("h2.seo-slogan").text().strip()
    except:
        brief = ""
    logger.info("brief: %s" % brief)
    # Drop the placeholder slogan ("暂未收录" = not yet recorded).
    if "暂未收录" in brief:
        brief = ""
    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)
    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)
    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s" % tags)
    # Description: strip itjuzi's embedded marketing boilerplate.
    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\
        replace("购买数据请联系", "").replace('*****@*****.**', "").replace("itjuzi是一家数据服务公司", "").strip()
    logger.info("********desc: %s" % desc)
    #logo
    logo = d("div.pic >img").attr("src")
    #if logo:
    #    logo = logo.replace("http://", "https://")
    logger.info("logo: %s", logo)
    # Collect link artifacts.  NOTE: ty==2 and ty==3 run the identical
    # selector, preserved from the original behavior (duplicates are
    # expected to be de-duped downstream — confirm before removing).
    artifacts = []
    for ty in [1, 2, 3]:
        if ty == 1:
            was = d('div.link-line> a')
        else:
            was = d('div.link-line> span.weblink,span.webTink> a')
        for wa in was:
            webs = []
            # Try both the href attribute and the anchor text; either may
            # hold the real URL.  "http://%e6%9a%82%e6%97%a0" is the
            # URL-encoded placeholder "暂无" (none).
            try:
                website = pq(wa).attr("href").strip()
                if website == "http://%e6%9a%82%e6%97%a0" or website == "http://tt":
                    website = ""
                website = url_helper.url_normalize(website)
                logger.info("website: %s" % website)
                webs.append(website)
            except:
                pass
            try:
                website = pq(wa).text().strip()
                if website == "http://%e6%9a%82%e6%97%a0" or website == "http://tt":
                    website = ""
                website = url_helper.url_normalize(website)
                logger.info("website: %s" % website)
                webs.append(website)
            except:
                pass
            for website in webs:
                # 4010=website, 4020/4030=weibo/wechat-like, 4040=iTunes,
                # 4050=Android market.
                type, app_market, app_id = url_helper.get_market(website)
                if type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })
                elif type == 4020:
                    domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })
                elif type == 4030:
                    domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })
                elif type == 4040:
                    domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })
                elif type == 4050:
                    # For Baidu/360 markets (16010/16020) resolve the
                    # market key to the apk package name.
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })
    # 获投状态 (funding status badge) -> numeric round + description.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)
    logger.info("")
    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }
def handle_lookup_result(response, app, date_num):
    # Async callback for an iTunes lookup API response.
    #
    # Params:
    #   response: HTTP response (tornado-style: .error, .body, .request.url).
    #   app:      artifact row; "domain" holds the iTunes trackId as a string.
    #   date_num: day-of-month selector; page crawling only happens when 6.
    #
    # On error the same URL is re-queued and the in-flight counter `total`
    # is left untouched (the early return skips the decrement below).
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        logger.info("Last Total number of current patch: %s", total)
        request(response.request.url,
                lambda r, app=app, date_num=date_num: handle_lookup_result(r, app, date_num))
        return
    else:
        logger.info("Getting result from url: %s", response.request.url)
        trackId = int(app["domain"])
        try:
            data = json.loads(response.body)
            if data["resultCount"] > 0:
                # Find the entry matching our trackId among the results.
                for result in data["results"]:
                    if result.get("trackId") == trackId:
                        score = result.get("averageUserRating")
                        comment = result.get("userRatingCount")
                        logger.info(
                            "companyId=%s, artifactId=%s, score=%s, comment=%s, date_num=%s"
                            % (app["companyId"], app["id"], score, comment, date_num))
                        if score is not None or comment is not None:
                            # NOTE(review): uses app["trackId"] although the
                            # match key above came from int(app["domain"]) —
                            # confirm the app row actually carries "trackId".
                            save_comment(app["trackId"], score, comment)
                        logger.info("Last Total number of current patch: %s", total)
                        # Normalize the seller URL and derive its domain.
                        if result.has_key("sellerUrl") and result["sellerUrl"] is not None:
                            result["sellerUrl"] = url_helper.url_normalize(result["sellerUrl"])
                            flag, domain = url_helper.get_domain(result["sellerUrl"])
                            if flag:
                                result["sellerDomain"] = domain
                            else:
                                result["sellerDomain"] = None
                        short_name = name_helper.get_short_name(result["trackName"])
                        result["trackShortName"] = short_name
                        record = collection_itunes.find_one(
                            {"trackId": result["trackId"]}, projection={'histories': False})
                        if record:
                            # Existing app: refresh checkTime, and flip it
                            # back online if it was marked offline before.
                            collection_itunes.update_one(
                                {"_id": record["_id"]},
                                {'$set': {"checkTime": datetime.datetime.now()}})
                            if record.get("offline_itunes", None) == 'Y':
                                offrecord = {
                                    "offlineDetectTime": datetime.datetime.now(),
                                    "offline_itunes": 'N'
                                }
                                collection_itunes.update_one(
                                    {"_id": record["_id"]},
                                    {
                                        '$set': {
                                            "offline_itunes": 'N',
                                            "offlineitunesDetectTime": datetime.datetime.now()
                                        },
                                        '$addToSet': {
                                            "offline_itunes_histories": offrecord
                                        }
                                    })
                            _id = record.pop("_id")
                            # New version available: either crawl the page
                            # (on day 6) or store the API result directly,
                            # archiving the old record in "histories".
                            if LooseVersion(result["version"]) > LooseVersion(record["version"]):
                                # if 1:
                                page_url = result.get("trackViewUrl").replace("&uo=4", "")
                                if date_num == 6 and page_url is not None and page_url.strip() != "":
                                    # only do it when date is 6/16/226
                                    logger.info("Need to crawler page data: %s", page_url)
                                    total += 1
                                    request(page_url,
                                            lambda r, appdata=result: save_itunes(r, appdata))
                                else:
                                    logger.info(
                                        json.dumps(result, ensure_ascii=False, cls=util.CJsonEncoder))
                                    result["createTime"] = record["createTime"]
                                    result["modifyTime"] = datetime.datetime.now()
                                    collection_itunes.update_one(
                                        {"_id": _id},
                                        {'$set': result, '$addToSet': {"histories": record}})
                        else:
                            # First sighting of this trackId: insert fresh.
                            result["createTime"] = datetime.datetime.now()
                            result["modifyTime"] = result["createTime"]
                            collection_itunes.insert(result)
                        break
            elif data["resultCount"] == 0:
                # The app is gone from iTunes: mark it offline (once) and
                # keep a history of offline/online transitions.
                record = collection_itunes.find_one(
                    {"trackId": trackId}, projection={'histories': False})
                logger.info("***********Offline************")
                if record:
                    if record.get("offline_itunes", None) is None or record.get("offline_itunes", None) == 'N':
                        offrecord = {
                            "offlineDetectTime": datetime.datetime.now(),
                            "offline_itunes": 'Y'
                        }
                        collection_itunes.update_one(
                            {"_id": record["_id"]},
                            {
                                '$set': {
                                    "offline_itunes": 'Y',
                                    "offlineitunesDetectTime": datetime.datetime.now(),
                                    "checkTime": datetime.datetime.now()
                                },
                                '$addToSet': {
                                    "offline_itunes_histories": offrecord
                                }
                            })
                    else:
                        collection_itunes.update_one(
                            {"_id": record["_id"]},
                            {'$set': {"checkTime": datetime.datetime.now()}})
        except:
            traceback.print_exc()
    # One request finished; when the batch drains, start the next batch.
    total -= 1
    if total <= 0:
        begin()
def parse_investor(item): logger.info("*** investfirm ***") investor_key = item["key"] html = item["content"] logger.info(investor_key) d = pq(html) logo = d('.logo-block > img').attr('src') if logo == "http://assets3.chuangyepu.com/chuangyepu/images/big-screenshot.png": logo = None basic_info = d('div.col-md-9> div> table> tr> td').eq(1) #logger.info(logo) name = pq(basic_info)('div.name').text().strip() if name is None: logger.info("No investor name!!!") return None desc = pq(basic_info)('div.desc').eq(0).text().strip() #logger.info(name+" "+desc) try: website = pq(basic_info)('div').eq(2)('a').text().strip() except: website = None if website is None or website.strip() == "暂无": website = None website = url_helper.url_normalize(website) flag, domain = url_helper.get_domain(website) if flag is None: website = None #logger.info(website) main_blocks = d('div.col-md-3> div.col-sm-12') #no js data # # for block in main_blocks: # info = pq(block) # h4 = info('h4.list_title').text().strip() # logger.info(h4) # # if h4 == "投资行业分布图": # field = info('g.highcharts-axis-labels').text().strip() source_investor = { "name": name, "website": website, "description": desc, "logo_url": logo, "stage": None, "field": None, "type": 10020, "source": SOURCE, "sourceId": investor_key } logger.info( json.dumps(source_investor, ensure_ascii=False, cls=util.CJsonEncoder)) return source_investor
def parse_company(item):
    # Parse a lagou.com company page into a source_company dict.
    #
    # Params:
    #   item: dict with "key" (company id), "content" (page HTML) and "url",
    #         or None.
    # Returns:
    #   None for a None item; {"status": "No_Name"} when the page is a stub
    #   or the name fails validation; otherwise the full company dict with
    #   "artifacts" and "members".
    if item is None:
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)
    # "这个公司的主页还在建设" = the company page is still under construction.
    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }
    name = d('.company_main > h1 > a').text()
    link = d('.company_main > h1 > a').attr('href')
    fullName = d('.company_main > h1 > a').attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    # Reject pages whose name mentions 拉勾 (Lagou itself), except Lagou's
    # own company page (key "147").
    if name is None or fullName is None or (name.find("拉勾") >= 0 and company_key != "147"):
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }
    # Logo may be protocol-relative ("//...") — prefix a scheme.
    # (The second startswith("https") test is redundant with "http".)
    logo = d('.top_info_wrap > img').attr('src')
    if logo.startswith("http") or logo.startswith("https"):
        pass
    else:
        logo = "http:" + logo
    if logo.find("logo_default") >= 0:
        logo = None
    brief = d('.company_word').text()
    # "该公司尚未添加公司介绍" = no company introduction added yet.
    desc_text = d('.company_intro_text').text()
    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10:
        desc = None
    else:
        desc = d('.company_intro_text > .company_content').html()
        desc = desc.replace('<span class="text_over">展开</span>', '')
        soup = BeautifulSoup(desc, "lxml")
        raw = soup.getText()
        desc = raw
    # Basic-info list: field / funding stage / head count / location,
    # plus the first office address.  Best-effort: any failure leaves the
    # defaults in place.
    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        headCount = headCount[0:headCount.index(u'人')]
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except:
        pass
    headCount = headCount.replace("people", "")
    # "少于15" = fewer than 15 staff.
    # NOTE(review): in the range branch min/max stay strings, while the
    # other branches yield ints/None — confirm downstream tolerates both.
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None
    # Map the Chinese funding-stage label to internal round codes.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0
    location_id = 0
    location_new = parser_mongo_util.get_location(location)
    if location_new != None:
        location_id = location_new["locationId"]
    #website = util.norm_url(link)
    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)
    # Classify the company link: 4010=website, 4040=iTunes, 4050=Android.
    artifacts = []
    type, app_market, app_id = url_helper.get_market(website)
    if type == 4010:
        # Skip self-links back to lagou.com.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif type == 4020 or type == 4030:
        # NOTE(review): domain is hard-coded to None so this branch never
        # appends an artifact — looks like intentionally disabled code;
        # confirm before re-enabling.
        domain = None
        if domain is not None:
            artifacts.append({
                "type": type,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4050:
        # Resolve Baidu/360 market ids (16010/16020) to the apk name.
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    #parser member
    # Team members: name, photo, role, description, and a weibo link when
    # the member link points at weibo.com.  Per-member failures are skipped.
    members = []
    lis = d('.manager_list > li')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('img').attr('src')
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                # member_key is computed but not stored on source_member.
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p.item_manager_name > span').text()
                member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p.item_manager_title').text()
                member_desc = mem('div.item_manager_content').text()
                weibo = None
                if member_link is not None:
                    if 'weibo.com' in member_link:
                        weibo = member_link
                source_member = {
                    'name': member_name,
                    'photo_url': logo_url,
                    'weibo': weibo,
                    'location': None,
                    'role': member_position,
                    'description': member_desc,
                    'education': None,
                    'work': None
                }
                members.append(source_member)
            except:
                pass
    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(location_id),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.lagou.com/gongsi/%s.html" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1
    }
    return source_company
def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler,
                          screenshot_crawler, test=False):
    # Iteratively expand a source company's artifacts / names / beianhaos.
    #
    # Runs up to 5 rounds.  Each round cross-references three data sources:
    #   A) ICP beian records (by company name, by domain, by main beianhao),
    #   B) iTunes apps (by artifact trackId, seller name, seller/support domain),
    #   C) Android apps (by artifact apkname, author, website/apkname domain),
    # then cleans website artifacts via live meta lookups and marks
    # everything processed.  When test=True, no external crawlers are hit.
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")
    expand_clean(source, sourceId)
    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})
    # exit()
    # Seed: make sure the normalized full name exists as a 12010 name entry.
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is not None and company_fullname.strip() != "":
        company_fullname = name_helper.company_name_normalize(company_fullname)
        scnames = sourcecompany["source_company_name"]
        check_fullname = False
        for scname in scnames:
            if scname["name"] == company_fullname:
                check_fullname = True
                break
        if check_fullname is False:
            (chinese, company) = name_helper.name_check(company_fullname)
            if chinese is True:
                chinese_type = "Y"
            else:
                chinese_type = "N"
            scname_data = {
                "name": company_fullname,
                "chinese": chinese_type,
                "type": 12010,
            }
            save_mongo_source_company_name(source, sourceId, scname_data)
    round = 1
    while True:
        # Hard cap at 5 rounds: mark expanded and stop.
        if round >= 6:
            collection_source_company.update_one(
                {"_id": sourcecompany["_id"]},
                {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break
        # Fetch the not-yet-expanded items discovered so far.
        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)
        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
        # Check if there are new stuff which need to do expansion
        if len(source_company_names) == 0 and len(artifacts) == 0 and len(main_beianhaos) == 0:
            collection_source_company.update_one(
                {"_id": sourcecompany["_id"]},
                {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break
        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, round)
        # Step A/1: beian lookup by company name (按公司名备案查询).
        logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
        for source_company_name in source_company_names:
            # Only check chinese company name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            if source_company_name["chinese"] is None:
                (chinese, companyName) = name_helper.name_check(source_company_name["name"])
            else:
                chinese = source_company_name["chinese"]
            if chinese != "Y":
                continue
            check_name = list(collection_beian.find({"organizer": source_company_name["name"]}))
            # Case that one company_name has multiple beian# : 上海汇翼->(today.ai/teambition.com)
            # If only one found in Mongo.beian(organizer) it is fine
            if len(check_name) == 0:
                if test:
                    items_beianlinks = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_company_name(source_company_name["name"])
                save_collection_beian(collection_beian, items_beianlinks)  # insert infos into Mongo.beian
            else:
                items_beianlinks = check_name
            save_beian_artifacts(items_beianlinks, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_beianlinks, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_beianlinks, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # beian # 发现更多的artifact(website)和公司名,主备案号
        # Step A/2: beian lookup by domain (按domain备案查询).
        logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
        for artifact in artifacts:
            # Only check is artifact is a website
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                link = url_helper.url_normalize(artifact["link"])
                (flag, domain) = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            check_domain = list(collection_beian.find({"domain": domain}))
            if len(check_domain) == 0:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_domain(domain)
                    items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
            else:
                items_merge = check_domain
            # filter by checked domain to avoid the sinaapp.cn case
            items_merge = filter_domain(items_merge, domain)
            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # beian # 发现更多的artifact(website)和公司名,主备案号
        # Step A/3: beian lookup by main beianhao (按主备案号查询).
        logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
        for main_beianhao in main_beianhaos:
            mainBeianhao = main_beianhao["mainBeianhao"]
            check_mainBeianhao = collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao})
            if check_mainBeianhao is None:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                    items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
                # if mainBeianhao could be found in two links
                if len(items_merge) > 0:
                    items_main_beianhao = [{"mainBeianhao": mainBeianhao}]
                    save_collection_mainBeianhao(collection_main_beianhao, items_main_beianhao)  # insert mainBeianhao into Mongo.main_beianhao
            else:
                items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))
            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # 发现更多的artifact(website)和公司名
        # ---- itunes expansion ----
        # Step B/1: resolve existing 4040 artifacts against mongo.itunes.
        logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)
        itunes_company_enames = {}
        app_by_name = {}
        for artifact in artifacts:
            if artifact["type"] != 4040:
                continue
            # Get trackid
            trackid = None
            if artifact["domain"] is None:
                (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
                if apptype != 4040:
                    continue
            else:
                try:
                    trackid = int(artifact["domain"])
                except:
                    pass
            if trackid is not None:
                app = collection_itunes.find_one({"trackId": trackid})
                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_itunes(app, artifact, source, sourceId)  # 存在: copy from mongo.itunes
                    if app.has_key("offline") and app["offline"] is True:
                        set_artifact_active(artifact, "Offline", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                    english, is_company = name_helper.english_name_check(app["sellerName"])
                    if english and is_company:
                        itunes_company_enames["sellerName"] = 1
                        app_by_name = app
            else:
                set_artifact_active(artifact, "N", source, sourceId)
        # save the only english name
        if len(itunes_company_enames) == 1:
            company_name = collection_source_company.find_one(
                {"source": source, "sourceId": sourceId,
                 "source_company_name": {"$elemMatch": {"type": 12010, "chinese": "N"}}})
            if company_name is None:
                save_company_name(app_by_name, "sellerName", source, sourceId)
        # Step B/2: find more iTunes apps by company (seller) name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            '''
            check_itunes_producers = list(collection_itunes.find({"developer": source_company_name["name"]}))
            if len(check_itunes_producers) > 0:
                for app in check_itunes_producers:
                    # Check if itunesId is already existed in artifacts
                    if find_itunesId(app["trackId"], source_company_id):
                        pass
                    else:
                        source_artifact_id = save_itunes_artifact(app, source_company_id)
                        #save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        save_company_name(app, "developer", source_company_id)
            '''
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            check_itunes_sellers = list(collection_itunes.find({"sellerName": source_company_name["name"]}))
            if len(check_itunes_sellers) > 0:
                '''
                domains = {}
                for app in check_itunes_sellers:
                    sellerUrl = app.get("sellerUrl")
                    flag ,domain = url_helper.get_domain(sellerUrl)
                    if flag is not None and domain is not None:
                        domains[domain] = 1
                '''
                # Only adopt the sellerUrl when all matched apps agree on
                # a single domain.
                lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
                artifact_status = check_source_artifact(source, sourceId)
                for app in check_itunes_sellers:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    if app.has_key("sellerUrl"):
                        # if find_link(app["sellerUrl"], source_company_id) or check_source_artifact(source_company_id):
                        if artifact_status:
                            pass
                        elif lens_domain == 1:
                            artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)
                            if artifact_id is not None:
                                artifact_status = True
                    # comment due to incorrect expand
                    '''
                    if app.has_key("supportUrl"):
                        if find_link(app["supportUrl"], source_company_id):
                            pass
                        else:
                            save_itunesSupportUrl_artifact(app, source_company_id)
                    '''
                    # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    # save_company_name(app, "sellerName", source_company_id)
        # Step B/3: find more iTunes apps by website domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            # Skip domains on the exclusion list.
            if domain in itunesDomainEx:
                continue
            check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
            if len(check_itunes_sellerDomains) > 0:
                # Only adopt a company name when all matched apps agree on one.
                lens_company_names = count_company_names(check_itunes_sellerDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_itunes_sellerDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
            # Support-domain matches are capped (<100) to avoid shared hosts.
            check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
            if len(check_itunes_supportDomains) > 0 and len(check_itunes_supportDomains) < 100:
                lens_company_names = count_company_names(check_itunes_supportDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_itunes_supportDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
        # 发现更多的artifact(website)和公司名,check if existed in source_art..and company_name
        # ---- android expansion ----
        # Step C/1: resolve existing 4050 artifacts against mongo.android.
        logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4050:
                continue
            # Get apkname
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                # Get apkname of baidu and 360 from android market
                if apptype != 4050:
                    continue
                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]
            if apkname is not None:
                app = collection_android.find_one({"apkname": apkname})
                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_android(app, artifact, source, sourceId)  # 存在: copy from mongo.android
                    set_artifact_active(artifact, "Y", source, sourceId)
                    # chinese, is_company = name_helper.name_check(app["author"])
                    # if is_company:
                    #     save_company_name(app, "author", source_company_id)
            else:
                set_artifact_active(artifact, "N", source, sourceId)
        # Step C/2: find more Android apps by company (author) name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            # Cap (<200) guards against generic author names.
            check_android_authors = list(collection_android.find({"author": source_company_name["name"]}))
            if len(check_android_authors) > 0 and len(check_android_authors) < 200:
                lens_domain = count_domains(check_android_authors, "website")
                artifact_status = check_source_artifact(source, sourceId)
                # check if author is consistent
                for app in check_android_authors:
                    # Check if AnId have one 4010
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    if artifact_status:
                        pass
                    elif lens_domain == 1:
                        artifact_id = save_androidWebsite_artifact(app, source, sourceId)
                        if artifact_id is not None:
                            artifact_status = True
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    # save_company_name(app, "author", source_company_id)
        # Step C/3: find more Android apps by website/apkname domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
            if len(check_android_websiteDomains) > 0:
                lens_company_names = count_company_names(check_android_websiteDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_android_websiteDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
            check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
            # add threshold to avoid case: domain: com.wowotuan
            if len(check_android_apknameDomains) > 0 and len(check_android_apknameDomains) < 100:
                lens_company_names = count_company_names(check_android_apknameDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_android_apknameDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
        # 发现更多的artifact(website)和公司名
        # 曾用名 TODO  (former names, not implemented)
        # ---- website artifact cleanup ----
        # 查询meta信息, 标记不能访问的website, 处理转跳的website
        # (fetch meta info, mark unreachable sites, follow redirects)
        logger.info("source: %s, sourceId: %s website meta", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["link"] is None or artifact["link"].strip() == "":
                # set_active("source_artifact", "N", artifact["id"])
                set_artifact_active(artifact, "N", source, sourceId)
                continue
            url = artifact["link"].strip()
            meta = collection_website.find_one({"url": url})
            if meta is None or meta["httpcode"] == 404:
                # Not cached (or previously 404): fetch live meta info.
                meta = website.get_meta_info(url)
                if meta:
                    websiteId = save_collection_website(collection_website, meta)
                    if websiteId is not None and not test:
                        #screenshot_wesbite(collection_website, websiteId, screenshot_crawler)
                        pass
                else:
                    meta = {
                        "url": artifact["link"],
                        "httpcode": 404
                    }
                    websiteId = save_collection_website(collection_website, meta)
                    set_artifact_active(artifact, "N", source, sourceId)
            if meta:
                # 发生转跳 (a redirect happened)
                # logger.info(meta)
                if meta["httpcode"] == 200:
                    redirect_url = meta.get("redirect_url")
                    if artifact["link"] != redirect_url:
                        url = url_helper.url_normalize(meta["redirect_url"])
                        (flag_new, domain_new) = url_helper.get_domain(url)
                        meta_new = {
                            "url": url,
                            "domain": domain_new if flag_new is True else None,
                            "redirect_url": url,
                            "title": meta["title"],
                            "tags": meta["tags"],
                            "description": meta["description"],
                            "httpcode": 200
                        }
                        websiteId_new = save_collection_website(collection_website, meta_new)
                        if websiteId_new is not None and not test:
                            #screenshot_wesbite(collection_website, websiteId_new, screenshot_crawler)
                            pass
                        flag, domain = url_helper.get_domain(artifact["link"])
                        if domain_new != domain:
                            # 跳出原域名 (redirected off the original domain)
                            set_artifact_active(artifact, "Redirect", source, sourceId)
                        else:
                            if flag is True:
                                # 这是个'好'地址 (the original URL is good)
                                set_artifact_active(artifact, "Y", source, sourceId)
                            else:
                                if flag_new is True:
                                    # 转跳后是个'好'地址 (good after redirect)
                                    set_artifact_active(artifact, "Redirect", source, sourceId)
                                    save_website_artifact(meta_new, source, sourceId)
                                else:
                                    set_artifact_active(artifact, "Y", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                elif meta["httpcode"] == 404:
                    set_artifact_active(artifact, "N", source, sourceId)
        # verify -> source_artifacts/source_company_name set verify
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)
        round += 1
def save_itunes(response, data):
    """Parse an iTunes App Store page response and upsert the app record.

    Extracts developer name, support URL, related app ids and user comments
    from the page HTML, normalizes URLs into domains, then inserts the
    document into ``collection_itunes`` or, when a newer ``version`` is
    seen, updates it and archives the previous record under ``histories``.
    Always decrements the global ``total`` counter and calls ``begin()``
    once it reaches zero, so the crawl round can restart.

    :param response: tornado-style HTTP response (``.error``, ``.body``,
        ``.request.url``).
    :param data: dict pre-filled by the caller (``trackId``, ``trackName``,
        ``version``, optionally ``sellerUrl``); enriched in place.
    """
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
    else:
        try:
            html = response.body
            d = pq(html)

            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer

            # Find the "...支持" (support) link among the targeted links.
            supportUrl = None
            links = d('li.t-subbody>a.targeted-link.link.icon')
            for i in links:
                title = pq(i).text().strip()
                if title.endswith("支持"):
                    supportUrl = pq(i).attr('href').strip()
            # BUGFIX: always set the key. Previously the assignment only
            # happened when a support link existed, so later reads of
            # data["supportUrl"] could fail (KeyError) / url_normalize
            # could be handed None for apps without a support link.
            if supportUrl is not None:
                data["supportUrl"] = url_helper.url_normalize(supportUrl)
            else:
                data["supportUrl"] = None
            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])

            # Related apps: ids pulled out of the "/id<digits>" URL pattern.
            relatedApps = []
            try:
                apps = d('div.l-row.l-row--peek> a')
                for app in apps:
                    appurl = pq(app).attr('href')
                    r = util.re_get_result('/id(\d*)', appurl)
                    if r is not None:
                        track_id, = r
                        try:
                            app_id = int(track_id)
                            relatedApps.append(int(app_id))
                        except:
                            # Non-numeric id fragment: skip this entry.
                            pass
            except:
                pass
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps

            # User reviews: title / reviewer / body text per review card.
            userComments = []
            cdivs = d('div.l-row.l-row--peek> div.ember-view')
            for cdiv in cdivs:
                c = pq(cdiv)
                try:
                    c_title = c(
                        'div.we-customer-review> div.we-customer-review__header> h3'
                    ).eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(
                        1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr(
                        "aria-label")
                    comment = {
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content
                    }
                    userComments.append(comment)
                except:
                    # Malformed review card: best-effort, skip it.
                    pass
            logger.info(
                json.dumps(userComments, ensure_ascii=False,
                           cls=util.CJsonEncoder))
            data["userComments"] = userComments

            # Derive domains from the normalized URLs.
            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                if flag:
                    data["supportDomain"] = domain
                else:
                    data["supportDomain"] = None
            if "sellerUrl" in data and data["sellerUrl"] is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                if flag:
                    data["sellerDomain"] = domain
                else:
                    data["sellerDomain"] = None

            short_name = name_helper.get_short_name(data["trackName"])
            data["trackShortName"] = short_name
            logger.info(
                json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))

            # Upsert keyed by trackId; only overwrite on a newer version,
            # archiving the previous record into the "histories" array.
            record = collection_itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                _id = record.pop("_id")
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert(data)
        except:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
def parse_company(item):
    """Parse a lagou.com company page into a source_company dict.

    :param item: dict with "key" (company id), "url" (page URL) and
        "content" (raw HTML); may be None.
    :returns: None for a None item; {"status": "No_Name"} when the page has
        no usable company name; otherwise the normalized company record,
        including any extracted website/app artifacts.
    """
    if item is None:
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)
    # logo_id processed in parser_db_util

    # Page still under construction -> nothing usable to parse.
    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    name = d('.company_main > h1 > a').text()
    link = d('.company_main > h1 > a').attr('href')
    fullName = d('.company_main > h1 > a').attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    # Prefer the shorter display name; fall back to the full name.
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    logo = d('.top_info_wrap > img').attr('src')
    # BUGFIX: .attr('src') returns None when the logo <img> is absent;
    # the original called logo.startswith(...) unguarded and crashed.
    if logo is not None:
        # startswith("http") already matches "https" as well.
        if not logo.startswith("http"):
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()
    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10:
        desc = None
    else:
        desc = d('.company_intro_text > .company_content').html()
        # BUGFIX: .html() may return None; guard before string operations.
        if desc is not None:
            desc = desc.replace('<span class="text_over">展开</span>', '')
            soup = BeautifulSoup(desc, "lxml")
            desc = soup.getText()

    # Basic-info panel: field / funding stage / head count / location.
    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        headCount = headCount[0:headCount.index(u'人')]
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except:
        # Any missing panel entry leaves the remaining fields empty.
        pass

    headCount = headCount.replace("people", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # CONSISTENCY FIX: store ints like the single-value branch
            # below (the original left the raw strings here).
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = int(staffarr[1].strip())
            except:
                min_staff = None
                max_staff = None
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None

    # Map the Chinese funding-stage label to internal round codes.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new != None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    # Classify the linked URL into an artifact (website / app store entry).
    artifacts = []
    type, app_market, app_id = url_helper.get_market(website)
    if type == 4010:
        # Skip self-links back to lagou and the page's own URL.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif type == 4020 or type == 4030:
        # NOTE: intentionally disabled in the original (domain stays None,
        # so this branch never appends); preserved as-is.
        domain = None
        if domain is not None:
            artifacts.append({
                "type": type,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            # Android markets: resolve the market id to the apk name.
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })

    source_company = {
        "name": name,
        "fullName": fullName
        if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }
    return source_company
while True: logger.info("investor aggregator start") #get source_investors conn = db.connect_torndb() #Check verify or processStatus source_investors = conn.query( "select * from source_investor where processStatus=0 order by id") conn.close() for source_investor in source_investors: logger.info(source_investor["id"]) #get Domain source_investor["domain"] = None if source_investor["website"] is not None: source_investor["website"] = url_helper.url_normalize( source_investor["website"]) type, market, website_domain = url_helper.get_market( source_investor["website"]) if type == 4010 and website_domain is not None: source_investor["domain"] = website_domain if source_investor["investorId"] is not None: investor = find_in_investor("id", source_investor["investorId"]) update_investor(investor, source_investor) set_processStatus(source_investor["id"]) continue else: #name check
def parse_artifact(item):
    """Extract website/app artifacts from a company's product-list HTML.

    :param item: dict with "key" and "content" (raw HTML); may be None.
    :returns: None for a None item, otherwise a list of artifact dicts
        (type/name/desc/link/domain), one per usable product entry.
    """
    if item is None:
        return None
    artifacts = []
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    logger.info("*** artifact ***")
    lis = d('ul.list-prod> li> div.on-edit-hide')
    for li in lis:
        l = pq(li)
        strtype = l('h4> span.tag').text().strip()
        # Only "网站" (website) and "app" rows carry a usable link.
        if strtype != u"网站" and strtype != "app":
            continue
        # BUGFIX: .attr("href") returns None when the anchor has no href;
        # the original called .strip() on it unguarded (AttributeError).
        link = l('h4> b> a').attr("href")
        if link is None:
            continue
        link = link.strip()
        if link == "":
            continue
        domain = None
        type = None
        if strtype == u"网站":
            type, app_market, app_id = url_helper.get_market(link)
            if type == 4010:
                link = url_helper.url_normalize(link)
                flag, domain = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    domain = None
        if type != 4010:
            # App rows (or website rows pointing at app stores): resolve
            # the store-specific id into a domain.
            type, app_market, app_id = url_helper.get_market(link)
            if type == 4040:
                domain = app_id
            elif type == 4050:
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_db_util.find_android_market(
                        app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id
        # Weibo/WeChat (4020/4030) are kept without a domain; everything
        # else must have resolved one.
        if domain is None and type != 4030 and type != 4020:
            continue
        name = l('h4> b').text().strip()
        desc = l('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s" %
                    (type, name, link, desc))
        artifact = {
            "type": type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": domain
        }
        artifacts.append(artifact)
    logger.info("")
    return artifacts