def get_location_from_company_name(unicode_name):
    # e.g. 上海测试公司 / 测试(上海)公司 / 北京测试上海分公司
    conn = db.connect_torndb()
    locations = list(conn.query("select locationName from location"))
    conn.close()
    m_location = None
    sub_location = None
    for location in locations:
        if unicode_name.startswith(location["locationName"]):
            m_location = location["locationName"]
            break
    if m_location is None:
        # fall back to the location written inside parentheses, e.g. 测试(上海)公司
        r = util.re_get_result(u"\((.*)\)", unicode_name)
        if r is not None:
            location, = r
            conn = db.connect_torndb()
            l = conn.query("select * from location where locationName=%s", location)
            conn.close()
            if l is not None:
                m_location = location
    # a 分公司 (branch office) suffix yields the sub location, e.g. 北京测试上海分公司
    r = util.re_get_result(u"(.*)分公司", unicode_name)
    if r is not None:
        name, = r
        for location in locations:
            if name.endswith(location["locationName"]):
                sub_location = location["locationName"]
                break
    return m_location, sub_location
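# NOTE: every function in this module goes through util.re_get_result, whose source is
# not included here. A minimal sketch consistent with how the call sites use it (the
# return value is unpacked as a tuple of captured groups and compared against None)
# could look like the hypothetical helper below; this is an illustrative assumption,
# not the actual util implementation.
import re

def re_get_result_sketch(pattern, text):
    """Return the tuple of captured groups for the first match of pattern, else None."""
    m = re.search(pattern, text)
    if m is None:
        return None
    return m.groups()

# Example: re_get_result_sketch(r"person/(\d+)$", "http://example.com/person/42") -> ("42",)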
def handle_app_result(response, app):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        if response.code in (301, 302, 500, 0):
            pass
        else:
            http_client.fetch(response.request.url,
                              lambda r, app=app: handle_app_result(r, app),
                              request_timeout=10)
            return
    else:
        logger.info(response.request.url)
        try:
            #html = unicode(response.body, encoding="utf-8", errors='replace')
            html = response.body
            (download, ) = util.re_get_result('downTimes:"(.*?)"', html)
            (score, ) = util.re_get_result('<div class="com-blue-star-num">(.*?)分</div>', html)
            score = float(score)
            save_download(app["companyId"], app["artifactId"], download, score)
            logger.info("download=%s, score=%s" % (download, score))
        except:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
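# The async handlers in this file share one bookkeeping convention: a module-level
# counter of outstanding fetches (`total`) that every finished handler decrements
# (retries return early instead of decrementing), with begin() kicking off the next
# crawl round once the counter reaches zero. A minimal, self-contained sketch of that
# pattern follows; begin()/request() themselves are not shown in this file, so the
# names prefixed with "sketch_" and the URL source are assumptions for illustration.
import tornado.ioloop
import tornado.httpclient

sketch_total = 0
sketch_client = tornado.httpclient.AsyncHTTPClient()

def sketch_begin(pending_urls, handler):
    """Hypothetical: schedule one batch of fetches and track them in sketch_total."""
    global sketch_total
    if not pending_urls:
        tornado.ioloop.IOLoop.current().stop()
        return
    sketch_total += len(pending_urls)
    for url in pending_urls:
        # each handler is expected to decrement sketch_total and, once it reaches
        # zero, call sketch_begin() again with the next batch
        sketch_client.fetch(url, handler, request_timeout=10)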
def handle_app_result(response, app, url, apkname, retry=0):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        if response.code in (301, 302, 500, 0, 403):
            logger.info("herereere")
            pass
        else:
            retry += 1
            if response.code == 403:
                if retry < 20:
                    http_client.fetch(
                        response.request.url,
                        lambda r, app=app, url=url, apkname=apkname, retry=retry:
                            handle_app_result(r, app, url, apkname, retry),
                        request_timeout=10)
                    return
                else:
                    pass
            else:
                http_client.fetch(
                    response.request.url,
                    lambda r, app=app, url=url, apkname=apkname, retry=retry:
                        handle_app_result(r, app, url, apkname, retry),
                    request_timeout=10)
                return
    else:
        logger.info(response.request.url)
        try:
            # Parse data for new updates:
            #logger.info("%s->%s", apkname, url)
            myapp_parser.process(None, url, apkname, response.body)
            #html = unicode(response.body, encoding="utf-8", errors='replace')
            html = response.body
            (download, ) = util.re_get_result('downTimes:"(.*?)"', html)
            (score, ) = util.re_get_result('<div class="com-blue-star-num">(.*?)分</div>', html)
            score = float(score)
            download = float(download)
            crawler_util.save_download(app["domain"], TYPE, download, score)
            logger.info("apkname=%s, download=%s, score=%s" % (app["domain"], download, score))
        except:
            traceback.print_exc()
    total -= 1
    logger.info("total: %s", total)
    if total <= 0:
        begin()
def handle_page(response):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        request(response.request.url, handle_page)
    else:
        #logger.info(response.body)
        d = pq(response.body)
        apps = d('div#selectedcontent> div> ul> li')
        for app in apps:
            name = pq(app).text()
            app_url = pq(app)('a').attr('href')
            (app_id, ) = util.re_get_result(r"id(\d*)", app_url)
            logger.info("%s %s %s" % (app_id, name, app_url))
            item = itunes_collection.find_one({"appId": app_id})
            if item is None:
                data = {
                    "appId": app_id,
                    "name": name,
                    "url": app_url,
                    "date": datetime.datetime.now()
                }
                itunes_collection.insert_one(data)
            if re.match(u'[\u4e00-\u9fa5]+', name):
                if item is None or not item.has_key("html"):
                    total += 1
                    request(app_url, handle_html)
                if item is None or not item.has_key("json"):
                    total += 1
                    api_url = "https://itunes.apple.com/cn/lookup?id=%s" % app_id
                    request(api_url, handle_json)
        if len(apps) > 10:
            #logger.info(response.request.url)
            result = util.re_get_result(r"page=(\d*)", response.request.url)
            if result is not None:
                (strPage, ) = result
                #logger.info(strPage)
                nextPage = str(int(strPage) + 1)
                url = response.request.url
                url = url.replace("page=" + strPage, "page=" + nextPage)
                logger.info(url)
                total += 1
                request(url, handle_page)
        total -= 1
        if total <= 0:
            exit(0)
def login():
    while True:
        try:
            idx = random.randint(0, len(login_users) - 1)
            login_user = login_users[idx]
            logger.info(login_user)
            data = {
                "backurl": " http://beian.links.cn",
                "bsave": "1",
                "opaction": "login",
                "username": login_user["name"],
                "password": login_user["pwd"],
            }
            s = my_request.get_http_session(new=True, agent=False)
            logger.info("proxies=%s" % s.proxies)
            r = s.post("http://my.links.cn/checklogin.asp", data=data, timeout=10)
            if r.status_code == 200:
                #html = util.html_encode_4_requests(r.text, r.content, r.encoding)
                r.encoding = r.apparent_encoding
                html = r.text
                #logger.info(html)
                if html is not None:
                    if util.re_get_result(r"(loaduserinfo)", html):
                        return True
        except Exception, ex:
            logger.exception(ex)
        time.sleep(10)
def parse_member(item):
    if item is None:
        return None
    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)
    members = []
    # members
    logger.info("*** member ****")
    lis = d('ul.list-prodcase> li')
    for li in lis:
        l = pq(li)
        member_name = l('h4> a> b> span.c').text().strip()
        position = l('h4> a> b> span.c-gray').text().strip()
        href = l('h4> a').attr("href").strip()
        (member_key, ) = util.re_get_result(r'person/(\d*?)$', href)
        logger.info("member_key: %s, member_name: %s, position: %s" %
                    (member_key, member_name, position))
        member = {"key": member_key, "name": member_name, "position": position}
        members.append(member)
    logger.info("")
    return members
def process(city, content):
    #logger.info(content)
    d = pq(html.fromstring(content.decode("utf-8")))
    lis = d('div.search-tab-content-item> div')
    for li in lis:
        key = None
        c = pq(li)
        url = c('a').eq(0).attr("href")
        result = util.re_get_result('/event/(\d+)', url)
        if result:
            key, = result
        if key is None:
            continue
        url = "http://www.huodongxing.com/event/%s" % key
        logger.info(url)
        maxretry = 0
        while True:
            result = crawler.crawl(url, agent=True)
            if result['get'] == 'success':
                break
            elif result['get'] == 'fail' and result["content"] is not None:
                logger.info(result["content"])
                if result["content"].find("系统载入中") > 0:
                    break
            if maxretry > 30:
                result["content"] = " "
                break
            maxretry += 1
        try:
            process_activity(key, result['content'])
        except Exception, ex:
            logger.exception(ex)
def process_page(url, content):
    global page_urls
    d = pq(content)
    apps = d('div#selectedcontent> div> ul> li')
    for app in apps:
        name = pq(app).text()
        app_url = pq(app)('a').attr('href')
        (app_id, ) = util.re_get_result(r"id(\d+)", app_url)
        try:
            trackId = int(app_id)
        except Exception, e:
            logger.info(traceback.format_exc())
            logger.info(app_url)
            # skip entries whose id cannot be parsed; otherwise trackId is unbound below
            continue
        logger.info("%s %s %s" % (trackId, name, app_url))
        #if util.isChineseString(name):
        if 1 == 1:
            item = collection.find_one({"trackId": trackId})
            if item is None:
                data = {
                    "trackId": trackId,
                    "trackName": name,
                    "trackViewUrl": app_url,
                    "createTime": datetime.datetime.now()
                }
                collection.insert_one(data)
            else:
                data = {
                    "trackName": name,
                    "trackViewUrl": app_url,
                    "modifyTime": datetime.datetime.now()
                }
                collection.update_one({"trackId": trackId}, {'$set': data})
def is_crawl_success(self, url, content):
    if content.find('操作成功') == -1:
        logger.info(content)
        return False
    r = "companyId=(.*?)&pageSize"
    result = util.re_get_result(r, url)
    (id, ) = result
    try:
        j = json.loads(content)
        rjobs = j['content']['data']['page']['result']
        if len(rjobs) == 0:
            logger.info("Failed due to 0 jobs under url: %s", url)
            return False
        if len(rjobs) > 0 and rjobs[0].has_key("companyId"):
            companyId = rjobs[0]["companyId"]
            logger.info("Url companyid: %s <-> lagou return companyId: %s", id, companyId)
            if str(companyId) != id:
                logger.info("Failed due to different companyId: got: %s from request: %s",
                            companyId, url)
                return False
        return True
    except:
        return True
def handle_alexa_cn_result(content, domain, crawler_cn):
    try:
        d = pq(content)
        data = d('script').text()
        data = ''.join(data)
        try:
            (ids, ) = util.re_get_result("showHint\('(\S*)'\);", data)
        except:
            traceback.print_exc()
            #logger.info(html)
            return None
        id_arr = ids.split(',')
        data = {"url": id_arr[0], "sig": id_arr[1], "keyt": id_arr[2]}
        body = urllib.urlencode(data)
        url = "http://www.alexa.cn/api_150710.php"
        result = crawler_cn.crawl(url, postdata=body)
        if result['get'] == 'success':
            #logger.info(result["content"])
            data_cn = handle_api_result(result["content"], domain)
            return data_cn
    except:
        traceback.print_exc()
    return None
def fetch_alexa(domain):
    alexa = trends_tool.get_alexa(domain)
    url = 'http://www.alexa.cn/index.php?url=' + domain
    proxy = {'type': 'http', 'anonymity': 'high', 'country': 'cn', 'ping': 5}
    while True:
        s = my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            break
    d = pq(r.text)
    data = d('script').text()
    data = ''.join(data)
    (ids, ) = util.re_get_result("showHint\('(\S*)'\);", data)
    id_arr = ids.split(',')
    domain = id_arr[0]
    while True:
        timeout = 10
        try:
            r = s.post("http://www.alexa.cn/api_150710.php",
                       data={"url": id_arr[0], "sig": id_arr[1], "keyt": id_arr[2]},
                       timeout=timeout)
            break
        except Exception, ex:
            logger.exception(ex)
            timeout = 20
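# Both fetch_alexa above and the surrounding alexa handlers rely on the same page
# detail: alexa.cn embeds a call like showHint('<url>,<sig>,<keyt>'), and splitting
# that single string on ',' yields the three fields posted to /api_150710.php.
# Illustrative (made-up) value: "example.com,abcd1234,efgh5678" would split into
# url="example.com", sig="abcd1234", keyt="efgh5678".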
def process(content, sourceId, source, key):
    r = "var = (.*?);"
    result = util.re_get_result(r, content)
    (b, ) = result
    logger.info(b)
    c = b.decode("gbk", "ignore")
    j = json.loads(c)
    infos = j["data"]
    mongo = db.connect_mongo()
    collection = mongo.stock.announcement
    for info in infos:
        ntitle = info["NOTICETITLE"]
        ndate = info["NOTICEDATE"]
        nurl = info["Url"]
        cleantitle = ntitle.replace(":", "").replace(str(sourceId), "").strip()
        logger.info("%s-%s-%s", ntitle, cleantitle, ndate)
        item = collection.find_one({"title": cleantitle})
        item1 = collection.find_one({"title": ntitle})
        if item is not None or item1 is not None:
            logger.info("******already exists")
        else:
            logger.info("******missing, get it")
            crawler_rp(nurl, cleantitle, ndate, sourceId, source)
    if j.has_key("TotalCount") and j["TotalCount"] > (50 * key):
        cnt = 1
    else:
        cnt = 0
    mongo.close()
    return cnt
def get_pages(session, cookies, fullname):
    page_result = {'status': None, 'pages': None, 'proxies': None}
    code = get_verify_code(session, cookies)
    if code is None:
        page_result['status'] = 'nocode'
        return page_result
    time.sleep(2)
    # search (搜索)
    res = 0
    while True:
        res += 1
        if res > 10:
            page_result['status'] = 'nopage'
            return page_result
        try:
            unitname = fullname.encode('gb2312', 'ignore')
            search_url = "http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_searchExecute.action"
            payload = {
                "siteName": "",
                "condition": "5",
                "siteDomain": '',
                "siteUrl": "",
                "mainLicense": "",
                "siteIp": "",
                "unitName": unitname,
                "mainUnitNature": "-1",
                "certType": "-1",
                "mainUnitCertNo": "",
                "verifyCode": code
            }
            proxies = get_proxy('http')
            r = session.post(search_url, data=payload, headers=headers,
                             cookies=cookies, proxies=proxies, timeout=10)
            content = r.text
            if content.find("备案信息查询") >= 0:
                if content.find('没有符合条件的记录') == -1:
                    result = util.re_get_result(r"1/(\d+)", content)
                    if result is not None:
                        pages, = result
                        if pages is not None:
                            page_result['status'] = 'got'
                            page_result['pages'] = pages
                            page_result['proxies'] = proxies
                            return page_result
                    else:
                        page_result['status'] = 'nofind'
                else:
                    page_result['status'] = 'nofind'
            else:
                page_result['status'] = 'nofind'
            return page_result
        except:
            pass
def fetch_bp(html, referer, cf_key):
    d = pq(html)
    script = d('script').text()
    script = ''.join(script)
    try:
        (pdf_key, ) = util.re_get_result("pptKey = \"(\S+)\"", script)
    except Exception, e:
        return None
def baidu_get_actual_link(url):
    r = requests.get(url)
    html = r.content
    r = util.re_get_result("URL='(.*?)'", html)
    if r is None:
        return None
    url, = r
    return url
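# baidu_get_actual_link appears to resolve Baidu's interstitial redirect pages: the
# fetched HTML typically carries the real target inside a meta-refresh-style
# URL='...' fragment, which the regex above extracts; if no such fragment is present
# the function returns None.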
def handle_json(response):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        request(response.request.url, handle_json)
    else:
        (app_id, ) = util.re_get_result(r"id=(\d*)", response.request.url)
        itunes_collection.update_one({"appId": app_id}, {'$set': {'json': response.body}})
        total -= 1
        if total <= 0:
            exit(0)
def process(search_name, from_doc_id, content):
    d = pq(util.html_encode(content))
    divs = d('div.app')
    for div in divs:
        e = pq(div)
        a = e('a.app-name')
        name = a.text().strip()
        #logger.info(name)
        href = a.attr("href")
        #logger.info(href)
        result = util.re_get_result("docid=(\d*)", href)
        if result:
            (docid_str, ) = result
            try:
                docid = long(docid_str)
            except:
                continue
        else:
            continue
        data = e('a.inst-btn')
        if len(data) == 0:
            data = e('a.inst-btn-big')
        if len(data) == 0:
            continue
        type = data.attr("data_detail_type")
        apkname = data.attr("data_package")
        version = data.attr("data_versionname")
        size = None
        try:
            size = long(data.attr("data_size"))
        except:
            pass
        item = {
            "key_int": docid,
            "search_name": search_name,
            "name": name,
            "link": "http://shouji.baidu.com/software/%s.html" % docid,
            "type": type,
            "apkname": apkname,
            "version": version,
            "size": size
        }
        #logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
        try:
            android.save_baidu_search(collection_search, item)
        except Exception, e:
            logger.info(e)
def get_id(session, cookies, domain):
    fid = None
    id_result = {'status': None, 'id': fid, 'proxies': None}
    code = get_verify_code(session, cookies)
    if code is None:
        id_result['status'] = 'wrong'
        return id_result
    res = 0
    while True:
        res += 1
        if res > 10:
            id_result['status'] = 'wrong'
            return id_result
        # search (搜索)
        search_url = "http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_searchExecute.action"
        payload = {
            "siteName": "",
            "condition": "1",
            "siteDomain": domain,
            "siteUrl": "",
            "mainLicense": "",
            "siteIp": "",
            "unitName": "",
            "mainUnitNature": "-1",
            "certType": "-1",
            "mainUnitCertNo": "",
            "verifyCode": code
        }
        proxies = get_proxy('http')
        r = session.post(search_url, data=payload, headers=headers,
                         cookies=cookies, proxies=proxies)
        content = r.text
        if content.find('备案信息查询') >= 0:
            if content.find('没有符合条件的记录') == -1:
                result = util.re_get_result(r"doDetail\('(.*?)'\)", content)
                if result is not None:
                    _id, = result
                    print("id: %s" % _id)
                    id_result['status'] = 'got'
                    id_result['id'] = _id
                    id_result['proxies'] = proxies
                    return id_result
            else:
                print('该 domain 未备案...')  # the domain has no ICP record
                id_result['status'] = 'noid'
                return id_result
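# get_pages and get_id hit the same icpMemoInfo_searchExecute.action endpoint with
# different search modes: condition "5" plus unitName searches by sponsor name
# (used by get_pages above), while condition "1" plus siteDomain searches by domain
# (used here). Both need a fresh verify code from get_verify_code and a proxy from
# get_proxy before each attempt.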
def is_crawl_success(self, url, content, redirect_url):
    if content.find("var") >= 0:
        r = "var = (.*?);"
        result = util.re_get_result(r, content)
        (b, ) = result
        logger.info(b)
        try:
            c = b.decode("gbk", "ignore")
            j = json.loads(c)
            if j.has_key("data"):
                return True
            else:
                return False
        except Exception, E:
            logger.info("here")
            logger.info(E)
def parse_footprint(item):
    if item is None:
        return []
    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)
    footprints = []
    # footprint
    logger.info("*** footprint ***")
    lis = d('ul.list-milestone> li')
    for li in lis:
        l = pq(li)
        footDesc = l('div> p').eq(0).text().strip()
        logger.info(footDesc)
        if footDesc is None or footDesc == "":
            continue
        footDateText = l('div> p> span.t-small').text().strip()
        logger.info(footDateText)
        if footDateText is None or footDateText == "":
            continue
        result = util.re_get_result('(\d*?)\.(\d*?)$', footDateText)
        if result is None:
            continue
        (year, month) = result
        year = int(year)
        try:
            month = int(month)
        except:
            month = 1
        if month <= 0 or month > 12:
            month = 1
        if year < 1970 or year > 3000:
            year = 1970
        footDate = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
        logger.info("%s: %s", footDate, footDesc)
        footprint = {"footDate": footDate, "footDesc": footDesc}
        footprints.append(footprint)
    logger.info("")
    return footprints
def process(content, keyword, link):
    # j = json.loads(content)
    # infos = j["value"]
    logger.info(content)
    cnt = 0
    d = pq(html.fromstring(content.decode("utf-8")))
    title = d('head> title').text().strip()
    logger.info("title: %s", title)
    ptype = None
    mongo = db.connect_mongo()
    collection = mongo.trend.index
    if link.find("MEDIA_WECHAT") >= 0:
        source = 13651  # Sogou WeChat popularity index (搜狗微信热度)
        sourceDesc = "搜狗微信热度"
        if collection.find_one({"source": source, "keyword": keyword}) is None:
            ptype = 1
        else:
            ptype = 2
    if ptype is None:
        return
    if ptype == 1:
        logger.info("here")
        r = "root.SG.wholedata\s=\s(.*)?\;.*\}\(this"
    else:
        r = "root.SG.data = (.*?);root.SG.wholedata"
    try:
        result = util.re_get_result(r, content)
    except:
        logger.info("wwwwwww")
        return
    logger.info(result)
    (b, ) = result
    logger.info(b)
    base = json.loads(b, strict=False)
    for pv in base["pvList"]:
        logger.info(json.dumps(pv, ensure_ascii=False, cls=util.CJsonEncoder))
    mongo.close()
    return cnt
def handle_alexa_cn_result(response, app):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        request(response.request.url, lambda r, app=app: handle_alexa_cn_result(r, app))
        return
    else:
        try:
            html = unicode(response.body, encoding="utf-8", errors='replace')
            d = pq(html)
            data = d('script').text()
            data = ''.join(data)
            try:
                (ids, ) = util.re_get_result("showHint\('(\S*)'\);", data)
            except:
                #logger.info(html)
                request(response.request.url, lambda r, app=app: handle_alexa_cn_result(r, app))
                return
            id_arr = ids.split(',')
            data = {"url": id_arr[0], "sig": id_arr[1], "keyt": id_arr[2]}
            body = urllib.urlencode(data)
            url = "http://www.alexa.cn/api_150710.php"
            total += 1
            #proxy_ip = get_proxy()
            proxy_ip = None
            request(url,
                    lambda r, app=app, body=body, proxy_ip=proxy_ip:
                        handle_api_result(r, app, body, proxy_ip),
                    body, proxy_ip)
        except:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
def fetch(url):
    (key, ) = util.re_get_result("https://itjuzi.com/album/(\d+)", url)
    logger.info("key=%s" % key)
    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        return r.status_code
    if r.url != url:
        logger.info("Page Redirect <--")
        return 302
    content = {
        "date": datetime.datetime.now(),
        "url": url,
        "key": key,
        "content": r.text
    }
    # save
    if collection.find_one({"key": key}) is not None:
        collection.delete_one({"key": key})
    collection.insert_one(content)
    # msg = {"type": "itjuzi_album", "key": key}
    # logger.info(json.dumps(msg))
    # kafka_producer.send_messages("itjuzi_album", json.dumps(msg))
    return 200
def fetch_project(url):
    (cf_key, ) = util.re_get_result("http://dj.jd.com/funding/details/(\d+).html", url)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        html = r.text
        support = fetch_support(cf_key)
        focus = fetch_focus(url, cf_key)
        team = fetch_team(cf_key)
        leader = fetch_leader(cf_key)
        bp = fetch_bp(html, url, cf_key)
        content = {'html': html, 'team': team, 'support': support, 'focus': focus}
        project = {
            "date": datetime.datetime.now(),
            "source": source,
            "url": url,
            "company_key": cf_key,
            "cf_key": cf_key,
            "content": content,
            'leader': leader,
            'bp': bp
        }
        result = cf_collection.find_one({"source": source, "company_key": cf_key, 'cf_key': cf_key})
        if result is not None:
            cf_collection.replace_one({'_id': result['_id']}, project)
        else:
            cf_collection.insert_one(project)
        msg = {"type": "cf", "source": source, "cf_key": cf_key}
        logger.info(json.dumps(msg))
        kafka_producer.send_messages("crawler_cf_jd_v2", json.dumps(msg))
def parse_member(item):
    if item is None:
        return []
    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)
    members = []
    # members
    logger.info("*** member ****")
    lis = d('ul.list-prodcase> li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('h4> a> b> span.c').text().strip()
            position = l('h4> a> b> span.c-gray').text().strip()
            href = l('h4> a').attr("href").strip()
            (member_key, ) = util.re_get_result(r'person/(\d*?)$', href)
            logger.info("member_key: %s, member_name: %s, position: %s" %
                        (member_key, member_name, position))
            memberId = parser_mongo_util.find_mongo_memberId(SOURCE, member_key)
            if memberId is None:
                continue
            type = name_helper.position_check(position)
            member = {
                "_memberId": memberId,
                "name": member_name,
                "position": position,
                "type": type
            }
            members.append(member)
        except Exception, ex:
            logger.exception(ex)
    return members
def getMoney(moneyStr):
    investment = 0
    currency = 3020
    precise = 'Y'
    investmentStr = ""
    # vague amounts in units of 万 (10,000), e.g. 数百万人民币
    if investment == 0:
        result = util.re_get_result(u'(数.*?)万人民币', moneyStr)
        if result is not None:
            (investmentStr, ) = result
            currency = 3020
            precise = 'N'
        else:
            result = util.re_get_result(u'(数.*?)万美元', moneyStr)
            if result is not None:
                (investmentStr, ) = result
                currency = 3010
                precise = 'N'
        if investmentStr != "":
            if investmentStr == u"数":
                investment = 1 * 10000
            elif investmentStr == u"数十":
                investment = 10 * 10000
            elif investmentStr == u"数百":
                investment = 100 * 10000
            elif investmentStr == u"数千":
                investment = 1000 * 10000
    # vague amounts in units of 亿 (100,000,000)
    if investment == 0:
        result = util.re_get_result(u'(数.*?)亿人民币', moneyStr)
        if result is not None:
            (investmentStr, ) = result
            currency = 3020
            precise = 'N'
        else:
            result = util.re_get_result(u'(数.*?)亿美元', moneyStr)
            if result is not None:
                (investmentStr, ) = result
                currency = 3010
                precise = 'N'
        if investmentStr != "":
            if investmentStr == u"数":
                investment = 1 * 10000 * 10000
            elif investmentStr == u"数十":
                investment = 10 * 10000 * 10000
            elif investmentStr == u"数百":
                investment = 100 * 10000 * 10000
            elif investmentStr == u"数千":
                investment = 1000 * 10000 * 10000
    # explicit amounts in 万
    if investment == 0:
        result = util.re_get_result(u'(\d*\.?\d*?)万人民币', moneyStr)
        if result is not None:
            (investmentStr, ) = result
            currency = 3020
            investment = int(float(investmentStr) * 10000)
        else:
            result = util.re_get_result(u'(\d*\.?\d*?)万美元', moneyStr)
            if result is not None:
                (investmentStr, ) = result
                currency = 3010
                investment = int(float(investmentStr) * 10000)
    # explicit amounts in 亿
    if investment == 0:
        result = util.re_get_result(u'(\d*\.?\d*?)亿人民币', moneyStr)
        if result is not None:
            (investmentStr, ) = result
            currency = 3020
            investment = int(float(investmentStr) * 100000000)
        else:
            result = util.re_get_result(u'(\d*\.?\d*?)亿美元', moneyStr)
            if result is not None:
                (investmentStr, ) = result
                currency = 3010
                investment = int(float(investmentStr) * 100000000)
    # "亿元及以上" (100 million and above); 美元 -> 3010, 人民币 -> 3020
    # (the original had these two currency codes swapped relative to the branches above)
    if investment == 0:
        result = util.re_get_result(u'亿元及以上美元', moneyStr)
        if result is not None:
            currency = 3010
            investment = 100000000
            precise = 'N'
        else:
            result = util.re_get_result(u'亿元及以上人民币', moneyStr)
            if result is not None:
                currency = 3020
                investment = 100000000
                precise = 'N'
    return currency, investment, precise
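# Illustrative expectations for getMoney, derived from the branches above (hypothetical
# checks, not part of the original module). The currency codes are read off the code
# itself: 3020 appears to mean RMB and 3010 USD.
def _demo_get_money():
    assert getMoney(u"数百万人民币") == (3020, 100 * 10000, 'N')          # "several million RMB", imprecise
    assert getMoney(u"500万美元") == (3010, 500 * 10000, 'Y')             # explicit amount, precise
    assert getMoney(u"1.2亿人民币") == (3020, int(1.2 * 100000000), 'Y')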
def process_news(item, url, content):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("gbk")))
        title = d('div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d(
            'div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin'
        ).text().strip()
        result = util.re_get_result('(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            news_time = None
        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        #posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if column is not None:
            tags = column.split()
        else:
            tags = []
        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time, news_time, brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        #logger.info(article)
        contents = extract.extractContents(url, article)
        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})
        #
        # for t in contents:
        #     logger.info(t["data"])
        #     logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            #"companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_posterId_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
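# Note on the timestamp handling above: the page shows Beijing time (UTC+8), and
# "news_time - datetime.timedelta(hours=8)" apparently stores the value as UTC; the
# same 8-hour shift is applied to the fallback datetime.now() when the parsed time
# lies in the future.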
def save_itunes(response, data):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        # request(response.request.url, lambda r, data=data: save_itunes(r, data))
        # return
    else:
        try:
            html = response.body
            d = pq(html)
            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer
            supportUrl = None
            links = d('li.t-subbody>a.targeted-link.link.icon')
            for i in links:
                title = pq(i).text().strip()
                if title.endswith("支持"):
                    supportUrl = pq(i).attr('href').strip()
            data["supportUrl"] = url_helper.url_normalize(supportUrl)
            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])
            relatedApps = []
            try:
                # divs = d('div.swoosh')
                # for div in divs:
                #     e = pq(div)
                #     if e('div.title').text().strip() == "Customers Also Bought" or e('div.title').text().strip() == "用户购买的还有":
                #         apps = e('div.content> div> div.application')
                #         for app in apps:
                #             app_id = pq(app).attr('adam-id')
                #             relatedApps.append(int(app_id))
                #             #logger.info("*********************%s", app_id)
                apps = d('div.l-row.l-row--peek> a')
                for app in apps:
                    appurl = pq(app).attr('href')
                    r = util.re_get_result('/id(\d*)', appurl)
                    if r is not None:
                        track_id, = r
                        try:
                            app_id = int(track_id)
                            relatedApps.append(int(app_id))
                        except:
                            pass
            except:
                pass
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps
            userComments = []
            cdivs = d('div.l-row.l-row--peek> div.ember-view')
            for cdiv in cdivs:
                c = pq(cdiv)
                try:
                    c_title = c('div.we-customer-review> div.we-customer-review__header> h3').eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr("aria-label")
                    comment = {
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content
                    }
                    userComments.append(comment)
                except:
                    pass
            logger.info(json.dumps(userComments, ensure_ascii=False, cls=util.CJsonEncoder))
            data["userComments"] = userComments
            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                if flag:
                    data["supportDomain"] = domain
                else:
                    data["supportDomain"] = None
            if data.has_key("sellerUrl") and data["sellerUrl"] is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                if flag:
                    data["sellerDomain"] = domain
                else:
                    data["sellerDomain"] = None
            short_name = name_helper.get_short_name(data["trackName"])
            data["trackShortName"] = short_name
            logger.info(json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))
            record = collection_itunes.find_one({"trackId": data["trackId"]},
                                                projection={'histories': False})
            if record:
                _id = record.pop("_id")
                if LooseVersion(data["version"]) > LooseVersion(record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one(
                        {"_id": _id},
                        {'$set': data, '$addToSet': {"histories": record}})
                # elif LooseVersion(data["version"]) == LooseVersion(record["version"]):
                #     data["modifyTime"] = datetime.datetime.now()
                #     collection_itunes.update_one({"_id": _id}, {'$set': data})
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert(data)
        except:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
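# The update-vs-insert decision above orders version strings with distutils'
# LooseVersion, which compares dotted components numerically rather than as strings:
from distutils.version import LooseVersion
assert LooseVersion("2.10.1") > LooseVersion("2.9")   # 10 > 9, not a lexicographic compare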
def fetch_company(url):
    (company_key, ) = util.re_get_result("http://rong.36kr.com/api/company/(\d+)", url)
    logger.info("company_key=%s" % company_key)
    company_content = None
    member_contents = []
    news_contents = []
    investor_contents = []
    member_ids = []

    # company base info
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        logger.info("status_code=%d" % r.status_code)
        return r.status_code
    company_base = r.json()
    logger.info(company_base)
    if company_base["code"] != 0:
        return 404
    logger.info(company_base["data"]["company"]["name"])

    # past-finance (investment events)
    url = "http://rong.36kr.com/api/company/%s/past-finance" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_finance = r.json()

    # past-investor
    url = "http://rong.36kr.com/api/company/%s/past-investor?pageSize=100" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_investor = r.json()

    # funds (not visible to non-investor accounts)
    url = "http://rong.36kr.com/api/company/%s/funds" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    funds = r.json()

    # product
    url = "http://rong.36kr.com/api/company/%s/product" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    product = r.json()

    # past-investment
    url = "http://rong.36kr.com/api/company/%s/past-investment" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_investment = r.json()

    # company-fa?
    url = "http://rong.36kr.com/api/fa/company-fa?cid=%s" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    company_fa = r.json()

    # founders
    url = "http://rong.36kr.com/api/company/%s/founder?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    founders = r.json()

    # employee
    url = "http://rong.36kr.com/api/company/%s/employee?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    employees = r.json()

    # former-member
    url = "http://rong.36kr.com/api/company/%s/former-member?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    former_members = r.json()

    company_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "company_key_int": int(company_key),
        "company_base": company_base,
        "past_finance": past_finance,
        "past_investor": past_investor,
        "funds": funds,
        "product": product,
        "past_investment": past_investment,
        "company_fa": company_fa,
        "founders": founders,
        "employees": employees,
        "former_members": former_members
    }

    # members
    for m in founders["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for m in employees["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for m in former_members["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for v in past_investor["data"]["data"]:
        if v["entityType"] == "INDIVIDUAL":
            m_id = v["entityId"]
            member_ids.append(m_id)

    for m_id in member_ids:
        member_key = str(m_id)
        if member_collection.find_one({"source": source, "member_key": member_key}):
            continue

        # basic
        url = "http://rong.36kr.com/api/user/%s/basic" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_base = r.json()

        # past-investment
        url = "http://rong.36kr.com/api/user/%s/past-investment" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_past_investment = r.json()

        #
        url = "http://rong.36kr.com/api/user/%s/company" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_company = r.json()

        #
        url = "http://rong.36kr.com/api/user/%s/work" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_work = r.json()

        #
        url = "http://rong.36kr.com/api/p/lead-investor/%s/financing" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_financing = r.json()

        member_content = {
            "date": datetime.datetime.now(),
            "source": source,
            "url": url,
            "member_key": member_key,
            "member_base": member_base,
            "member_past_investment": member_past_investment,
            "member_company": member_company,
            "member_work": member_work,
            "member_financing": member_financing
        }
        member_contents.append(member_content)

    # investor organizations
    for e in past_finance["data"]["data"]:
        for investor in e.get("participants", {}):
            investor_key = str(investor["entityId"])
            if investor_collection.find_one({"source": source, "investor_key": investor_key}):
                continue

            # base info
            url = "http://rong.36kr.com/api/organization/%s/basic" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            investor_base = r.json()

            # staffs
            url = "http://rong.36kr.com/api/organization/%s/user" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            staffs = r.json()

            # former-member
            url = "http://rong.36kr.com/api/organization/%s/former-member" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            former_members = r.json()

            investor_content = {
                "date": datetime.datetime.now(),
                "source": source,
                "url": url,
                "investor_key": investor_key,
                "investor_base": investor_base,
                "staffs": staffs,
                "former_members": former_members
            }
            investor_contents.append(investor_content)

    #logger.info(company_content)
    #logger.info("************")
    #logger.info(member_contents)
    #logger.info("************")
    #logger.info(investor_contents)

    # save
    if company_collection.find_one({"source": source, "company_key": company_key}) is not None:
        company_collection.delete_one({"source": source, "company_key": company_key})
    company_collection.insert_one(company_content)
    for member in member_contents:
        if member_collection.find_one({"source": source, "member_key": member["member_key"]}) is None:
            member_collection.insert_one(member)
    for news in news_contents:
        if news_collection.find_one({"source": source, "company_key": company_key,
                                     "news_key": news["news_key"]}) is None:
            news_collection.insert_one(news)
    for investor in investor_contents:
        if investor_collection.find_one({"source": source,
                                         "investor_key": investor["investor_key"]}) is None:
            investor_collection.insert_one(investor)

    msg = {"type": "company", "source": source, "company_key": company_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_kr36_v2", json.dumps(msg))
    return 200
def handle_app_result(response, app):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        #http_client.fetch(response.request.url, lambda r, app=app: handle_app_result(r, app), request_timeout=10)
        request(response.request.url, lambda r, app=app: handle_app_result(r, app))
        return
    else:
        logger.info(response.request.url)
        try:
            html = unicode(response.body, encoding="utf-8", errors='replace')
            #logger.info(html)
            d = pq(html)
            downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace("次", "").replace("+", "").strip()
            download = 0
            score = 0
            try:
                if downloadstr.endswith("千"):
                    download = float(downloadstr.replace("千", "")) * 1000
                elif downloadstr.endswith("万"):
                    download = float(downloadstr.replace("万", "")) * 10000
                elif downloadstr.endswith("亿"):
                    download = float(downloadstr.replace("亿", "")) * 10000 * 10000
                else:
                    download = int(downloadstr)
                score = float(d("span.s-1").text().replace("分", "").strip()) * 0.5
            except:
                pass
            r = "var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
            result = util.re_get_result(r, html)
            if result is not None:
                (b, ) = result
                base = json.loads(b.replace("'", '"'), strict=False)
                baike_name = base["baike_name"].strip()
                save_download(app["companyId"], app["artifactId"], download, score)
                logger.info("companyId=%s, artifactId=%s, download=%s, score=%s, baike_name=%s" %
                            (app["companyId"], app["artifactId"], download, score, baike_name))
                url = "http://zhushou.360.cn/search/index/?kw=%s" % urllib.quote(app["name"].encode("utf-8"))
                total += 1
                #http_client.fetch(url, lambda r, app=app: handle_search_result(r, app), request_timeout=10)
                request(url, lambda r, app=app: handle_search_result(r, app))
                url = "http://intf.baike.360.cn/index.php?name=%s&c=message&a=getmessagenum" % urllib.quote(baike_name.encode("utf-8"))
                total += 1
                #http_client.fetch(url, lambda r, app=app: handle_comment_result(r, app), request_timeout=10)
                request(url, lambda r, app=app: handle_comment_result(r, app))
            else:
                logger.info(html)
        except:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
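# The download-count normalisation above expands the Chinese magnitude suffixes,
# e.g. "1.2万" -> 12000.0 and "3亿" -> 300000000.0, and the "* 0.5" factor apparently
# maps 360's 10-point rating onto a 5-point scale.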