def run():
    global raw_urls
    while True:
        if len(raw_urls) == 0:
            return
        url = raw_urls.pop(0)
        item = collection.find_one({"url": url})
        if item is not None:
            continue
        flag, domain = url_helper.get_domain(url)
        result = website.get_meta_info(url)
        logger.info(url)
        logger.info(json.dumps(result, ensure_ascii=False, cls=util.CJsonEncoder))
        if result is None:
            result = {"url": url, "httpcode": 404}
        else:
            if result["url"] != result["redirect_url"]:
                new_url = url_helper.url_normalize(result["redirect_url"])
                flag1, domain1 = url_helper.get_domain(new_url)
                if domain != domain1:
                    raw_urls.append(new_url)
        result["createTime"] = datetime.datetime.now()
        result["modifyTime"] = result["createTime"]
        try:
            collection.insert(result)
        except:
            pass
def save(collection_market, appmarket, item):
    item["website"] = url_helper.url_normalize(item["website"])
    flag, domain = url_helper.get_domain(item["website"])
    if flag:
        item["website_domain"] = domain
    else:
        item["website_domain"] = None
    # Derive a pseudo-domain from the apk package name by reversing its
    # segments, e.g. "com.example.app" -> "http://app.example.com".
    temp = "http://" + ".".join(item["apkname"].split(".")[::-1])
    flag, domain = url_helper.get_domain(temp)
    item["apkname_domain"] = domain
    record = collection_market.find_one(
        {"appmarket": appmarket, "apkname": item["apkname"]},
        projection={'histories': False})
    if record:
        _id = record.pop("_id")
        record.pop("key")
        record.pop("key_int")
        #logger.info(json.dumps(record, ensure_ascii=False, cls=util.CJsonEncoder))
        if item["version"] is not None and item["version"].strip() != "":
            if record["version"] is not None and record["version"].strip() != "" and \
                    LooseVersion(item["version"]) > LooseVersion(record["version"]):
                # Newer version: overwrite the record and archive the old one.
                item["createTime"] = record["createTime"]
                item["modifyTime"] = datetime.datetime.now()
                if item["updateDate"] is None:
                    item["updateDate"] = datetime.datetime.now()
                collection_market.update_one(
                    {"_id": _id},
                    {'$set': item, '$addToSet': {"histories": record}})
            elif record["version"] is None or record["version"].strip() == "" or \
                    LooseVersion(item["version"]) == LooseVersion(record["version"]):
                item["modifyTime"] = datetime.datetime.now()
                collection_market.update_one({"_id": _id}, {'$set': item})
    else:
        item["createTime"] = datetime.datetime.now()
        item["modifyTime"] = item["createTime"]
        if item["updateDate"] is None:
            item["updateDate"] = datetime.datetime.now()
        try:
            collection_market.insert(item)
        except Exception, e:
            logger.info(e)
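# Illustrative sketch (not part of the original source): how the apkname
# reversal in save() behaves. apkname_to_url is a hypothetical helper name.
def apkname_to_url(apkname):
    # "com.example.app" -> "http://app.example.com"
    return "http://" + ".".join(apkname.split(".")[::-1])

assert apkname_to_url("com.example.app") == "http://app.example.com"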
def count_domains(apps, item_of_url):
    domains = {}
    for app in apps:
        url = app.get(item_of_url)
        flag, domain = url_helper.get_domain(url)
        # get_domain returns a boolean flag, so test its truth value rather
        # than comparing it against None (which would always pass).
        if flag and domain is not None:
            domains[domain] = 1
    return len(domains)
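# Illustrative sketch (hypothetical data): count_domains tallies distinct
# registered domains across a list of app dicts. Assuming url_helper.get_domain
# collapses both example.com URLs below to the same domain, this prints 2.
apps = [
    {"website": "http://www.example.com/a"},
    {"website": "http://example.com/b"},
    {"website": "http://other.org"},
]
print count_domains(apps, "website")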
def get_meta_info(url):
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                  'AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9')
    headers = {
        'User-Agent': user_agent,
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip'
    }
    try:
        request = urllib2.Request(url, None, headers)
    except:
        return None
    opener = urllib2.build_opener()
    retries = 0
    while True:
        try:
            r = opener.open(request, timeout=17)
            if r.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(r.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
            else:
                data = r.read()
            content = util.html_encode(data)
            redirect_url = url_helper.url_normalize(r.geturl())
            #logger.info(redirect_url)
            #logger.info(content)
            d = pq(html.fromstring(content))
            title = d("title").text()
            keywords = d("meta[name='keywords']").attr("content")
            if keywords is None:
                keywords = d("meta[name='Keywords']").attr("content")
            description = d("meta[name='description']").attr("content")
            if description is None:
                description = d("meta[name='Description']").attr("content")
            flag, domain = url_helper.get_domain(url)
            if flag is not True:
                domain = None
            return {
                "url": url,
                "redirect_url": redirect_url,
                "domain": domain,
                "title": title,
                "tags": keywords,
                "description": description,
                "httpcode": 200
            }
        except:
            retries += 1
            if retries >= 3:
                return None
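# Illustrative sketch (hypothetical URL): fetch page metadata and handle the
# None that get_meta_info returns after three failed attempts.
meta = get_meta_info("http://www.example.com")
if meta is None:
    print "fetch failed"
else:
    print meta["title"], meta["redirect_url"], meta["httpcode"]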
def find_link(link, source, sourceId):
    if link is None:
        return True
    if link.strip() == "":
        return True
    artifact = collection_source_company.find_one({
        "source": source,
        "sourceId": sourceId,
        "source_artifact": {"$elemMatch": {"type": 4010, "link": link}}
    })
    if artifact is None:
        flag, domain = url_helper.get_domain(link)
        if domain is not None:
            artifact = collection_source_company.find_one({
                "source": source,
                "sourceId": sourceId,
                "source_artifact": {"$elemMatch": {"type": 4010, "domain": domain}}
            })
    if artifact is None:
        return False
    else:
        return True
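# Illustrative sketch (hypothetical document): the $elemMatch queries above
# match a source_company whose source_artifact array contains a website
# artifact (type 4010) with the given link, or failing that, the same domain.
doc = {
    "source": 13050,
    "sourceId": "42",
    "source_artifact": [
        {"type": 4010, "link": "http://www.example.com", "domain": "example.com"},
    ],
}
# find_link("http://example.com/about", 13050, "42") would miss on the exact
# link but still return True via the domain fallback, assuming
# url_helper.get_domain maps both URLs to "example.com".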
def check_dup(websites, pattern):
    global Md
    global Ma
    global Mc
    linksmap = {}
    for website in websites:
        if website["link"] is not None and website["link"].strip() != "":
            linksmap.setdefault(website["link"].strip(), []).append(website)
            if website["domain"] is None or website["domain"].strip() == "":
                flag, domain = url_helper.get_domain(website["link"])
                website["domain"] = domain
                logger.info("Website Missing domain for :%s , %s",
                            website["id"], website["companyId"])
                fullFill(website)
                Md += 1
        else:
            logger.info("Website Missing link for :%s , %s",
                        website["id"], website["companyId"])
            remove_dup([website["id"]])
            Ma += 1

    dups = []
    for link in linksmap:
        if len(linksmap[link]) < 2:
            continue
        maxscore = 0
        remainId = None
        allIds = []
        Mc += 1
        for web in linksmap[link]:
            allIds.append(web["id"])
            logger.info("DUP: %s: %s /%s /%s",
                        web["id"], web["link"], web["createTime"], web["companyId"])
            # Score a record by how many tracked columns carry a non-empty,
            # non-zero value; the highest-scoring record survives.
            score = len([
                column for column in scores
                if web[column] is not None and str(web[column]).strip() != ""
                and str(web[column]).strip() != "0"
            ])
            if remainId is None:
                remainId = web["id"]
                maxscore = score
            elif score > maxscore:
                remainId = web["id"]
                maxscore = score  # keep the running maximum in sync
        logger.info("Remain: %s", remainId)
        dups.extend([id for id in allIds if id != remainId])
    if len(dups) > 0:
        logger.info("Remove: %s", dups)
        remove_dup(dups)
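# Illustrative sketch (hypothetical records and a hypothetical column list):
# two website rows share one link; the row with more populated columns wins
# and the other id would be passed to remove_dup.
scores = ["title", "description", "rank"]
websites = [
    {"id": 1, "companyId": 9, "link": "http://example.com", "domain": "example.com",
     "createTime": None, "title": "Example", "description": "", "rank": "0"},
    {"id": 2, "companyId": 9, "link": "http://example.com", "domain": "example.com",
     "createTime": None, "title": "Example", "description": "A site", "rank": "3"},
]
# check_dup(websites, None) keeps id 2 (score 3 vs. score 1) and removes [1].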
def process(item):
    logger.info("process: %s, %s", item["id"], item["name"])
    deal_id = item["dealId"]
    if deal_id is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return
    conn = db.connect_torndb()
    deal = conn.get("select * from deal where id=%s", deal_id)
    conn.close()
    if deal is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return
    company_id = deal["companyId"]
    conn = db.connect_torndb()
    sc = conn.get(
        "select * from source_company where companyId=%s and source=13001 and sourceId=%s",
        company_id, str(deal_id))
    if sc is None:
        source_company_id = conn.insert(
            "insert source_company(companyId,source,sourceId,createTime,processStatus) "
            "values(%s,%s,%s,now(),%s)",
            company_id, 13001, str(deal_id), 2)
    else:
        source_company_id = sc["id"]
    if item["sourceArtifactId"] is None:
        link = item["link"]
        domain = None
        if item["type"] == 4010:
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is False:
                domain = None
        sourceArtifactId = conn.insert(
            "insert source_artifact(sourceCompanyId,name,description,link,domain,type,createTime) "
            "values(%s,%s,%s,%s,%s,%s,now())",
            source_company_id, item["name"], item["description"], link, domain, item["type"])
        conn.update(
            "update deal_artifact_new set sourceArtifactId=%s, proceed='Y' where id=%s",
            sourceArtifactId, item["id"])
    conn.update("update source_company set processStatus=0 where id=%s", source_company_id)
    conn.close()
def parser(item):
    if item is None:
        return None
    investor_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)
    investor_name = d('div.picinfo> p> span.title').text()
    investor_name = name_helper.company_name_normalize(investor_name)
    # Check for a missing name before using it in string concatenation.
    if investor_name is None:
        logger.info("No investor name!!!")
        return None
    logger.info("investor_name: " + investor_name)
    logo = d('div.pic> img').attr("src")
    if logo is not None:
        logo = logo.strip()
    logger.info("Investor Logo: %s" % logo)
    website = d('span.links >a[target="_black"]').attr("href")
    if website is None or website.strip() == "暂无":
        website = None
    website = url_helper.url_normalize(website)
    flag, domain = url_helper.get_domain(website)
    # get_domain returns a boolean flag; treat anything but True as invalid.
    if flag is not True:
        website = None
    logger.info("Investor website: %s" % website)
    stageStr = d('div.pad.block> div.list-tags.yellow').text().replace(" ", ",").strip()
    logger.info("Investor rounds: %s" % stageStr)
    fieldsStr = d('div.pad.block> div.list-tags.darkblue').text().replace(" ", ",").strip()
    logger.info("Investor fields: %s" % fieldsStr)
    desc = d('div.des').text().strip()
    logger.info("Investor desc: %s" % desc)
    return investor_key, investor_name, logo, website, stageStr, fieldsStr, desc
def save_androidWebsite_artifact(app, source, sourceId):
    url = app["website"]
    flag, domain = url_helper.get_domain(url)
    if flag is not True:
        return None
    if find_link(url, source, sourceId):
        return None
    try:
        andwebsadata = {
            "name": app["name"],
            "description": app["description"],
            "link": app["website"],
            "type": 4010,
            "domain": app["website_domain"],
            "extended": 'Y',
        }
        save_mongo_source_artifact(source, sourceId, andwebsadata)
        return 1
    except:
        return None
def save_itunesSellerUrl_artifact(app, source, sourceId):
    url = app["sellerUrl"]
    flag, domain = url_helper.get_domain(url)
    if flag is not True:
        return None
    if find_link(app["sellerUrl"], source, sourceId):
        return None
    try:
        itunessellersadata = {
            "name": app["sellerName"],
            "description": app["description"],
            "link": app["sellerUrl"],
            "type": 4010,
            "domain": app["sellerDomain"],
            "extended": 'Y',
        }
        save_mongo_source_artifact(source, sourceId, itunessellersadata)
        return 1
    except:
        return None
def process(g, crawler, url, key, content):
    if has_content(content):
        #logger.info(content)
        main = pq(content)('div.article_content')
        d = pq(main)
        title = d('h1#article_title').text()
        brief = pq(content)("meta[name='description']").attr("content")
        # post_time = pq(content)("meta[property='article:published_time']").attr("content").split("+")[0]
        # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        result = util.re_get_result("var publishTime = new Date\(\"(.*?)\"\)", content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrect post time")
            logger.info(content)
            exit()
        contents = extract.extractContents(url, content)
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
        else:
            category = None
        tags = []
        articletags = pq(content)("meta[name='keywords']").attr("content")
        if articletags is None:
            logger.info(content)
        else:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        logger.info("%s, %s, %s, %s, %s", key, title, news_time, category, ":".join(tags))
        craw = True
        # 2016-10-01 pencilnews website upgrade, news keys changed!
        # Have to redownload article with new keys.
        if collection_news.find_one({"source": g.SOURCE, "key_int": int(key)}) is not None:
            cnews = collection_news.find_one({"source": g.SOURCE, "key_int": int(key)})
            logger.info("%s, %s", url, cnews["link"])
            if url == cnews["link"]:
                craw = False
            else:
                collection_news.delete_many({"source": g.SOURCE, "key_int": int(key)})
                logger.info("different link!")
        if craw:
            newses = list(collection_news.find({"title": title, "source": {"$ne": g.SOURCE}}))
            for news in newses:
                if news.has_key("type") and news["type"] > 0:
                    craw = False
                    break
        if craw:
            if collection_news.find_one({"title": title, "source": {"$ne": g.SOURCE}}) is not None:
                collection_news.delete_many({"title": title, "source": {"$ne": g.SOURCE}})
            flag, domain = url_helper.get_domain(url)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),
                "title": title,
                "link": url,
                "createTime": datetime.datetime.now(),
                "source": g.SOURCE,
                "key": key,
                "key_int": int(key),
                "type": TYPE,
                "original_tags": tags,
                "processStatus": 0,
                "companyId": None,
                "companyIds": [],
                "category": category,
                "domain": domain
            }
            dcontents = []
            rank = 1
            for c in contents:
                if c["data"] == "/The End/":
                    break
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents
            if brief is None or brief.strip() == "":
                brief = util.get_brief_from_news(dcontents)
            post = util.get_poster_from_news(dcontents)
            dnews["post"] = post
            dnews["brief"] = brief
            if news_time > datetime.datetime.now():
                logger.info("Time: %s is not correct with current time", news_time)
                dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
            collection_news.insert(dnews)
            logger.info("*************DONE**************")
    g.latestIncr()
def parse_investor(item):
    logger.info("*** investfirm ***")
    investor_key = item["key"]
    html = item["content"]
    logger.info(investor_key)
    d = pq(html)
    logo = d('.logo-block > img').attr('src')
    if logo == "http://assets3.chuangyepu.com/chuangyepu/images/big-screenshot.png":
        logo = None
    basic_info = d('div.col-md-9> div> table> tr> td').eq(1)
    #logger.info(logo)
    # .text() can return None, so check before stripping.
    name = pq(basic_info)('div.name').text()
    if name is None or name.strip() == "":
        logger.info("No investor name!!!")
        return None
    name = name.strip()
    desc = pq(basic_info)('div.desc').eq(0).text().strip()
    #logger.info(name + " " + desc)
    try:
        website = pq(basic_info)('div').eq(2)('a').text().strip()
    except:
        website = None
    if website is None or website.strip() == "暂无":
        website = None
    website = url_helper.url_normalize(website)
    flag, domain = url_helper.get_domain(website)
    # get_domain returns a boolean flag; treat anything but True as invalid.
    if flag is not True:
        website = None
    #logger.info(website)
    main_blocks = d('div.col-md-3> div.col-sm-12')
    # no js data
    # for block in main_blocks:
    #     info = pq(block)
    #     h4 = info('h4.list_title').text().strip()
    #     logger.info(h4)
    #     if h4 == "投资行业分布图":
    #         field = info('g.highcharts-axis-labels').text().strip()
    source_investor = {
        "name": name,
        "website": website,
        "description": desc,
        "logo_url": logo,
        "stage": None,
        "field": None,
        "type": 10020,
        "source": SOURCE,
        "sourceId": investor_key
    }
    logger.info(json.dumps(source_investor, ensure_ascii=False, cls=util.CJsonEncoder))
    return source_investor
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip().replace(".html", "").replace('detail_', '')
        type = TYPE
        category = None
        title = d('div.left.zb-n> h1').text().strip()
        tags = []
        postraw = newspost
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d('div.left.zb-n> p.gy').text().strip()
        news_time = datetime.datetime.now()
        article = d('div.left.zb-n').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s",
                    key, title, news_time, ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info('already exists %s', title)
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        start = False
        for c in contents:
            # Skip everything up to the block that repeats title and brief.
            if start is False and c["data"].find(brief) >= 0 and c["data"].find(title) >= 0:
                start = True
                continue
            if start is False:
                continue
            if c["data"].find("-END-") >= 0:
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
    else:
        logger.info('has no news content %s', newsurl)
    return
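# Illustrative sketch: the dnews["date"] assignments in these parsers subtract
# eight hours, presumably converting a China Standard Time (UTC+8) publish
# timestamp to UTC before storage; that interpretation is an assumption.
import datetime

local_time = datetime.datetime(2018, 1, 1, 8, 0, 0)  # 08:00, assumed CST
utc_time = local_time - datetime.timedelta(hours=8)  # 00:00 UTC
print utc_time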
def process_news(column, newsurl, content, newspost, download_crawler):
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".htm", "")
        title = d('h1.title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        brief = None
        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')
        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s",
                    key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        # Classify as funding news (60001) or general news (60010).
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)
        dnews['type'] = TYPE
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
    return
def process_news(newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        logger.info('here.')
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        category = None
        categoryNames = []
        Type = TYPE
        tags = []
        brief = None
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        key = d('article').attr('id').strip().split('-')[-1]
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            post_time = d("header> div> span> time").text().strip()
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2)
            if len(month) == 1:
                month = '0' + month
            day = res.group(3)
            if len(day) == 1:
                day = '0' + day
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": Type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info(json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        type = TYPE
        if sort.find("投融资") >= 0:
            type = 60001
        category = None
        title = d('div.mod-head> h1').text().strip()
        if title is None or title == "":
            return
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.replace(",", ",").split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            post_time = d('span.time> time').text()
            logger.info(post_time)
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M")
        except:
            news_time = datetime.datetime.now()
        article = d('div.mod-body> div.content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s",
                    key, title, news_time, ":".join(tags), type, category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip().replace(".shtml", "")
        type = TYPE
        category = None
        title = d('div.subject> h1').text().strip()
        tags = []
        post = newspost
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s",
                    key, title, news_time, ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    imgurl = parser_mysql_util.get_logo_id(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
def save_itunes(response, data):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        # request(response.request.url, lambda r, data=data: save_itunes(r, data))
        # return
    else:
        try:
            html = response.body
            d = pq(html)
            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer
            supportUrl = None
            links = d('li.t-subbody>a.targeted-link.link.icon')
            for i in links:
                title = pq(i).text().strip()
                if title.endswith("支持"):
                    supportUrl = pq(i).attr('href').strip()
            data["supportUrl"] = url_helper.url_normalize(supportUrl)
            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])
            relatedApps = []
            try:
                apps = d('div.l-row.l-row--peek> a')
                for app in apps:
                    appurl = pq(app).attr('href')
                    r = util.re_get_result('/id(\d*)', appurl)
                    if r is not None:
                        track_id, = r
                        try:
                            app_id = int(track_id)
                            relatedApps.append(int(app_id))
                        except:
                            pass
            except:
                pass
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps
            userComments = []
            cdivs = d('div.l-row.l-row--peek> div.ember-view')
            for cdiv in cdivs:
                c = pq(cdiv)
                try:
                    c_title = c('div.we-customer-review> div.we-customer-review__header> h3').eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr("aria-label")
                    comment = {
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content
                    }
                    userComments.append(comment)
                except:
                    pass
            logger.info(json.dumps(userComments, ensure_ascii=False, cls=util.CJsonEncoder))
            data["userComments"] = userComments
            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                if flag:
                    data["supportDomain"] = domain
                else:
                    data["supportDomain"] = None
            if data.has_key("sellerUrl") and data["sellerUrl"] is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                if flag:
                    data["sellerDomain"] = domain
                else:
                    data["sellerDomain"] = None
            short_name = name_helper.get_short_name(data["trackName"])
            data["trackShortName"] = short_name
            logger.info(json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))
            record = collection_itunes.find_one({"trackId": data["trackId"]},
                                                projection={'histories': False})
            if record:
                _id = record.pop("_id")
                if LooseVersion(data["version"]) > LooseVersion(record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one(
                        {"_id": _id},
                        {'$set': data, '$addToSet': {"histories": record}})
                # elif LooseVersion(data["version"]) == LooseVersion(record["version"]):
                #     data["modifyTime"] = datetime.datetime.now()
                #     collection_itunes.update_one({"_id": _id}, {'$set': data})
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert(data)
        except:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
def parse_company(item):
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    # company basic info
    c = item["data"]["basic"]
    tags = c["tags"]
    tags_str = tags.replace("|", ",")
    logo = c["icon"]
    if logo.find("product_default.png") >= 0:
        logo = None
    establish_date = None
    if c.has_key("open_time"):
        try:
            establish_date = datetime.datetime.strptime(c["open_time"], "%Y-%m-%d")
        except:
            pass
    address1 = None
    address2 = None
    if c.has_key("city"):
        address2 = c["city"]
    if c.has_key("province"):
        address1 = c["province"]
    location_id = 0
    if address2 is not None and address2.strip() != "":
        location = parser_db_util.get_location(address2)
        if location is not None:
            location_id = location["locationId"]
    if location_id == 0 and address1 is not None and address1.strip() != "":
        location = parser_db_util.get_location(address1)
        if location is not None:
            location_id = location["locationId"]

    fullName = c["company"]
    if fullName is None or fullName.strip() == "":
        fullName = None
    else:
        fullName = fullName.replace("_", "")
        idx = fullName.rfind(u"公司")
        if idx != -1:
            fullName = fullName[:(idx + len(u"公司"))]
        fullName = name_helper.company_name_normalize(fullName)
    name = c["product"]

    desc = ""
    brief = ""
    productDesc = None
    modelDesc = None
    operationDesc = None
    teamDesc = None
    marketDesc = None
    compititorDesc = None
    advantageDesc = None
    planDesc = None
    otherDesc = None
    if c.has_key("desc"):
        # other
        # otherDesc = c["intro"].strip()
        desc = c["desc"].strip()
    if c.has_key("yewu"):
        # other
        # otherDesc = c["intro"].strip()
        brief = c["yewu"].strip()

    if name is None or fullName is None:
        return {
            "status": "No_Name",
        }

    artifacts = []
    websites = []
    if c.has_key("gw_link") is True and c["gw_link"].strip() != "" and c["gw_link"] not in websites:
        websites.append(c["gw_link"])
    if c.has_key("source_gw_link") is True and c["source_gw_link"].strip() != "" and \
            c["source_gw_link"] not in websites:
        websites.append(c["source_gw_link"])
    if item["data"].has_key("productinfos") is True:
        for pi in item["data"]["productinfos"]:
            if pi.has_key("link") is True and pi["link"].strip() != "" and pi["link"] not in websites:
                websites.append(pi["link"])
    for website in websites:
        type, app_market, app_id = url_helper.get_market(website)
        if type == 4010:
            if item["url"] != website and website.find("qimingpian.com") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": brief,
                        "link": website,
                        "domain": domain
                    })
        elif type == 4020 or type == 4030:
            domain = None
            if domain is not None:
                artifacts.append({
                    "type": type,
                    "name": name,
                    "description": brief,
                    "link": website,
                    "domain": domain
                })
        elif type == 4040:
            domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4040,
                    "name": name,
                    "description": brief,
                    "link": website,
                    "domain": domain
                })
        elif type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_db_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4050,
                    "name": name,
                    "description": brief,
                    "link": website,
                    "domain": domain
                })

    return {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "productDesc": productDesc,
        "modelDesc": modelDesc,
        "operationDesc": operationDesc,
        "teamDesc": teamDesc,
        "marketDesc": marketDesc,
        "compititorDesc": compititorDesc,
        "advantageDesc": advantageDesc,
        "planDesc": planDesc,
        "otherDesc": otherDesc,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,
    }
def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        category = None
        title = d('.article_title p').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return
        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        postraw = newspost
        brief = None
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s",
                    key, title, news_time, ":".join(tags), category, brief, postraw)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl}, {'$set': dnews})
            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id'] = oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
def process_news(content, url):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d('div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon, post_Date.tm_mday,
                                      post_Date.tm_hour, post_Date.tm_min, post_Date.tm_sec)
        if collection_news.find_one({"link": url}) is not None:
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()
        if column is not None:
            tags = column.split()
        else:
            tags = []
        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip() not in tags and \
                        keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())
        postraw = d("link[rel='image_src']").attr("href")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s",
                    key, title, post_time, news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        contents = extract.extractContents(url, article)
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"].replace("?imageView2/2/w/750/q/90", ""),
                        download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def handle_lookup_result(response, app, date_num):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        logger.info("Last Total number of current patch: %s", total)
        # Retry the same lookup on error.
        request(response.request.url,
                lambda r, app=app, date_num=date_num: handle_lookup_result(r, app, date_num))
        return
    else:
        logger.info("Getting result from url: %s", response.request.url)
        trackId = int(app["domain"])
        try:
            data = json.loads(response.body)
            if data["resultCount"] > 0:
                for result in data["results"]:
                    if result.get("trackId") == trackId:
                        score = result.get("averageUserRating")
                        comment = result.get("userRatingCount")
                        logger.info("companyId=%s, artifactId=%s, score=%s, comment=%s, date_num=%s" %
                                    (app["companyId"], app["id"], score, comment, date_num))
                        if score is not None or comment is not None:
                            save_comment(app["trackId"], score, comment)
                        logger.info("Last Total number of current patch: %s", total)
                        if result.has_key("sellerUrl") and result["sellerUrl"] is not None:
                            result["sellerUrl"] = url_helper.url_normalize(result["sellerUrl"])
                            flag, domain = url_helper.get_domain(result["sellerUrl"])
                            if flag:
                                result["sellerDomain"] = domain
                            else:
                                result["sellerDomain"] = None
                        short_name = name_helper.get_short_name(result["trackName"])
                        result["trackShortName"] = short_name
                        record = collection_itunes.find_one({"trackId": result["trackId"]},
                                                            projection={'histories': False})
                        if record:
                            collection_itunes.update_one(
                                {"_id": record["_id"]},
                                {'$set': {"checkTime": datetime.datetime.now()}})
                            if record.get("offline_itunes", None) == 'Y':
                                # App is back online: clear the offline flag.
                                offrecord = {
                                    "offlineDetectTime": datetime.datetime.now(),
                                    "offline_itunes": 'N'
                                }
                                collection_itunes.update_one(
                                    {"_id": record["_id"]},
                                    {'$set': {"offline_itunes": 'N',
                                              "offlineitunesDetectTime": datetime.datetime.now()},
                                     '$addToSet': {"offline_itunes_histories": offrecord}})
                            _id = record.pop("_id")
                            if LooseVersion(result["version"]) > LooseVersion(record["version"]):
                                page_url = result.get("trackViewUrl").replace("&uo=4", "")
                                if date_num == 6 and page_url is not None and page_url.strip() != "":
                                    # only do it when date is 6/16/226
                                    logger.info("Need to crawler page data: %s", page_url)
                                    total += 1
                                    request(page_url, lambda r, appdata=result: save_itunes(r, appdata))
                                else:
                                    logger.info(json.dumps(result, ensure_ascii=False, cls=util.CJsonEncoder))
                                    result["createTime"] = record["createTime"]
                                    result["modifyTime"] = datetime.datetime.now()
                                    collection_itunes.update_one(
                                        {"_id": _id},
                                        {'$set': result, '$addToSet': {"histories": record}})
                        else:
                            result["createTime"] = datetime.datetime.now()
                            result["modifyTime"] = result["createTime"]
                            collection_itunes.insert(result)
                        break
            elif data["resultCount"] == 0:
                record = collection_itunes.find_one({"trackId": trackId},
                                                    projection={'histories': False})
                logger.info("***********Offline************")
                if record:
                    if record.get("offline_itunes", None) is None or \
                            record.get("offline_itunes", None) == 'N':
                        offrecord = {
                            "offlineDetectTime": datetime.datetime.now(),
                            "offline_itunes": 'Y'
                        }
                        collection_itunes.update_one(
                            {"_id": record["_id"]},
                            {'$set': {"offline_itunes": 'Y',
                                      "offlineitunesDetectTime": datetime.datetime.now(),
                                      "checkTime": datetime.datetime.now()},
                             '$addToSet': {"offline_itunes_histories": offrecord}})
                    else:
                        collection_itunes.update_one(
                            {"_id": record["_id"]},
                            {'$set': {"checkTime": datetime.datetime.now()}})
        except:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
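# Illustrative sketch: LooseVersion, used above to decide whether a fetched
# record is newer than the stored one, compares dotted version strings
# numerically rather than lexically.
from distutils.version import LooseVersion

assert LooseVersion("2.10.0") > LooseVersion("2.9.1")  # string compare would get this wrong
assert LooseVersion("1.0") == LooseVersion("1.0")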
    article = d('div.pfcng-row-02> div.pfcng-col-2> div.pos-0> div.pane-content').html()
    if article.find('<form') >= 0:
        form_str = re.search('<form(.*?)</form>', article).group(1)
        article = article.replace(form_str, '')
    # elif article.find('<iframe') >= 0:
    #     iframe_str = re.search('<iframe(.*?)</iframe>', article).group(1)
    #     article = article.replace(iframe_str, '')
    contents = extract.extractContents(newsurl, article, document=False)
    logger.info("%s, %s, %s, %s -> %s, %s",
                key, title, news_time, ":".join(tags), category, brief)
    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": None,
        "type": type,
        "original_tags": tags,
        "processStatus": 1,
        # "companyId": None,
        "companyIds": [],
        "category": category,
        "domain": domain,
def process_news(item, url, content):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("gbk")))
        title = d('div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d('div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin').text().strip()
        result = util.re_get_result('(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            news_time = None
        if news_time is None:
            # Fall back to the current time so the date arithmetic below
            # cannot fail on a missing publish date (mirrors the other parsers).
            news_time = datetime.datetime.now()
        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if column is not None:
            tags = column.split()
        else:
            tags = []
        logger.info("%s, %s, %s, %s, %s, %s",
                    key, title, post_time, news_time, brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        contents = extract.extractContents(url, article)
        if collection_news.find_one({"link": url}) is not None:
            return
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, j_content, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = j_content['id']
        type = TYPE
        title = j_content['title']
        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']
        try:
            date = j_content['date']
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s",
                    key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("gbk")))
        key = newsurl.split("/")[-1].replace(".htm", "")
        type = TYPE
        category = None
        categoryNames = []
        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and \
                        tag not in tags and tag != title:
                    tags.append(tag)
        post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
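# Several of these parsers deduplicate with a find_one on the title before
# saving. Under concurrent crawlers that check-then-insert can race; one
# hedged way to make it atomic is a unique index, sketched below with
# standard pymongo (not confirmed to be what this project actually does):
def ensure_title_index(mongo):
    # With this index a duplicate insert raises DuplicateKeyError instead
    # of silently storing a second copy of the same article.
    mongo.article.news.create_index("title", unique=True)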
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]
        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key
        type = TYPE
        category = None
        categoryNames = []
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")
        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103
        tags = []
        if content.has_key("keywordList") and len(content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag["keyword"] is not None and \
                        tag["keyword"].strip() != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])
        # The raw title may contain HTML-escaped quotes.
        title = content["news"]["title"].replace("&quot;", "\"")
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info("***************************News existed!!!***********************")
            mongo.close()
            return
        postraw = "http://pic.chinaventure.com.cn/" + content["news"]["coverImg"]
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = content["news"]["introduction"]
        post_time = content["news"]["updateAt"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()
        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def test_get_domain(self):
    # get_domain is consumed as ``flag, domain = get_domain(url)`` elsewhere
    # in this codebase, so compare against the domain element of the tuple
    # rather than the raw return value.
    cases = [
        ('md.openapi.360.cn', 'http://md.openapi.360.cn/list/get'),
        ('short.weixin.qq.com', 'http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns'),
        ('inews.test.com.cn', 'http://inews.test.com.cn/redisTool?type=get&key=downloadNews_158008435%2CdownloadVideo_158008435'),
        ('127.0.0.1', 'http://127.0.0.1/redisTool?type=get&key=downloadNews_158008435%2CdownloadVideo_158008435'),
    ]
    for expected, url in cases:
        flag, domain = uh.get_domain(url)
        self.assertEqual(expected, domain)
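# A hedged companion test for the flag element of the returned tuple.
# It assumes the flag is truthy when a registered domain was parsed, as
# the call sites suggest; adjust if the real contract differs.
def test_get_domain_flag(self):
    flag, domain = uh.get_domain('http://md.openapi.360.cn/list/get')
    self.assertTrue(flag)
    self.assertEqual('md.openapi.360.cn', domain)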
def process_news(content, news_key, url):
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        # Guard against a missing description meta tag before splitting.
        brief = d("meta[name='description']").attr("content")
        if brief is not None:
            brief = brief.split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon,
                                      post_date.tm_mday, post_date.tm_hour,
                                      post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time,
                    news_time, temp, category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        contents = extract.extractContents(url, article)
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
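# post_date above is a time.struct_time built from the page's epoch
# 'data-time' attribute; copying its six fields into datetime() is
# equivalent to the one-liner below (shown for clarity only, same
# local-time interpretation):
#
#     news_time = datetime.datetime.fromtimestamp(int(post_time))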
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            type = 60001
            category = 60101
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and \
                        tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            post_time = d("span.article-time").eq(0).text().strip()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("btm地址") >= 0 or c["data"].find("版权声明") >= 0:
                continue
            if c["data"].find("8btctest1/custom/images") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
def process(content, citykey, crawler):
    cnt = 0
    if has_content(content):
        DT = datetime.date.today()
        TODAY = datetime.datetime(DT.year, DT.month, DT.day)
        # The original referenced an undefined download_crawler here;
        # create one the same way the news parsers do.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        lis = d('div.wrap> div> div> ul.ativities> li.item')
        for li in lis:
            c = pq(li)
            title = c('h3.title> a').text()
            link = c('h3.title> a').attr("href")
            if link.find("http") == -1:
                continue
            key = link.split("/")[-1]
            key_int = int(key)
            # The poster lookup needs key, so parse the link before the
            # image; also check for a missing src before calling strip().
            img = c('a> img').attr("src")
            poster = None
            if img is not None:
                img = img.strip().replace("|130w", "")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    img, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    poster = str(posturl)
            location = c('div.intro> div.address').text()
            sponors = c('div.intro> div.sponors> span').text().replace(
                "，", " ").replace(",", " ").split()
            spans = c('div.intro> div.time> span')
            if len(spans) == 3:
                date = spans.eq(0).text()
                times = spans.eq(2).text().split("~")
                beginTime = date + " " + times[0]
                endTime = date + " " + times[1]
            elif len(spans) == 5:
                date = spans.eq(0).text()
                year = date.split("-")[0]
                times = spans.eq(2).text().split("~")
                beginTime = date + " " + times[0]
                endTime = year + "-" + times[1] + " " + spans.eq(4).text()
            else:
                continue
            try:
                beginDate = datetime.datetime.strptime(beginTime, "%Y-%m-%d %H:%M")
                endDate = datetime.datetime.strptime(endTime, "%Y-%m-%d %H:%M")
            except:
                beginDate = None
            if beginDate is None or beginDate < TODAY or key_int is None:
                # Skip activities that are over or unparsable.
                continue
            # The original retried forever on failure; cap the retries.
            result = crawler.crawl(link)
            retries = 0
            while result['get'] != 'success' and retries < 3:
                retries += 1
                result = crawler.crawl(link)
            if result['get'] != 'success':
                continue
            if has_content(result['content']):
                contents = extract.extractContents(link, result['content'])
                flag, domain = url_helper.get_domain(link)
                dact = {
                    "beginDate": beginDate - datetime.timedelta(hours=8),
                    "endDate": endDate - datetime.timedelta(hours=8),
                    "date": beginDate - datetime.timedelta(hours=8),
                    "title": title,
                    "link": link,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": key_int,
                    "type": TYPE,
                    "original_tags": [],
                    "processStatus": 0,
                    "companyIds": [],
                    "location": location,
                    "city": citymap[citykey],
                    "sponors": sponors,
                    "post": poster,
                    "domain": domain,
                    "categoryNames": []
                }
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        if c["data"].find("我要报名") >= 0:
                            logger.info("************************over")
                            break
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                            c["data"], download_crawler, SOURCE, key, "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue
                    dcontents.append(dc)
                    rank += 1
                dact["contents"] = dcontents
                dact["simhashValue"] = activity_simhash.get_simhash_value(dcontents)
                record = collection_news.find_one({"source": SOURCE, "key_int": key_int})
                if record is not None:
                    city = record["city"]
                    if record["beginDate"] == dact["beginDate"] and \
                            record["endDate"] == dact["endDate"] and \
                            record["title"] == dact["title"] and \
                            record["city"] == citymap[citykey] and \
                            record["location"] == dact["location"]:
                        logger.info("%s activity already existed", title)
                        cnt += 1
                        continue
                    else:
                        collection_news.delete_one({"source": SOURCE, "key_int": key_int})
                        if city != citymap[citykey]:
                            logger.info("%s has two cities: %s and %s with location %s, something is wrong",
                                        title, city, citymap[citykey], location)
                            cnt += 1
                            continue
                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title,
                                    beginDate, endDate, ":".join(sponors),
                                    location, link, img)
                else:
                    if activity_simhash.check_same_act(dact) is not True:
                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title,
                                    beginDate, endDate, ":".join(sponors),
                                    location, link, img)
                cnt += 1
    logger.info("************Done***************")
    logger.info("*******%s activities have been checked or recorded", cnt)
    return cnt