def crawler_rp(nurl, nctitle, ndate, sourceId, source):
    """Fetch an announcement page and store its PDF link into mongo.

    Retries the crawl up to 8 extra times on fetch failure; stops after
    the first successful fetch whether or not a PDF link was found.
    """
    retry = 0
    while True:
        result = espcralwer.crawl(nurl, agent=True)
        if result['get'] == 'success':
            if result["redirect_url"].find("pdf") >= 0:
                # The request was redirected straight to the PDF itself.
                logger.info("we got pdf : %s ", result["redirect_url"])
                # Drop any "+08:00"-style timezone suffix before parsing.
                fileTime = extract.extracttime(ndate.split("+")[0])
                content = {
                    # source 13401 maps to exchange 2, everything else to 3
                    'stockExchangeId': 2 if source == 13401 else 3,
                    'source': source,
                    'stockSymbol': str(sourceId),
                    'title': nctitle,
                    'link': result["redirect_url"],
                    # shift local time to UTC (assumes +8 offset) -- TODO confirm
                    "date": fileTime - datetime.timedelta(hours=8),
                    'createTime': fileTime - datetime.timedelta(hours=8),
                }
                mongo = db.connect_mongo()
                collection = mongo.stock.announcement
                collection.insert(content)
                mongo.close()
            else:
                # Not a direct PDF: parse the landing page (GBK encoded)
                # and look for the attachment link in the detail header.
                try:
                    d = pq(
                        html.fromstring(result["content"].decode(
                            'gbk', 'ignore')))
                    filelink = d('div.detail-header> h1> span> a').attr("href")
                    if filelink is not None and filelink.find("pdf") >= 0:
                        fileTime = extract.extracttime(ndate.split("+")[0])
                        content = {
                            'stockExchangeId': 2 if source == 13401 else 3,
                            'source': source,
                            'stockSymbol': str(sourceId),
                            'title': nctitle,
                            'link': filelink,
                            "date": fileTime - datetime.timedelta(hours=8),
                            'createTime': fileTime - datetime.timedelta(hours=8),
                        }
                        mongo = db.connect_mongo()
                        collection = mongo.stock.announcement
                        collection.insert(content)
                        mongo.close()
                    # exit()
                except Exception, ex:
                    logger.exception(ex)
            # success (with or without a PDF) ends the retry loop
            break
        retry += 1
        if retry > 8:
            break
def process(content, sourceId, source, key):
    """Scan an announcement list (JSON) and crawl items missing from mongo.

    content  -- raw JSON string whose "result" field lists announcements
    sourceId -- stock symbol, forwarded to crawler_rp
    source   -- numeric source id, forwarded to crawler_rp
    key      -- unused here; kept for interface compatibility
    """
    j = json.loads(content)
    infos = j["result"]
    mongo = db.connect_mongo()
    try:
        collection = mongo.stock.announcement
        for info in infos:
            ntitle = info["title"]
            ndate = info["date"].replace("T", " ")
            fileTime = extract.extracttime(ndate)
            nurl = info["attachUrl"]
            # Stored titles may carry one of several category prefixes.
            cleantitle = "[临时公告]" + ntitle
            cleantitle1 = "[定期报告]" + ntitle
            cleantitle2 = "[临时报告]" + ntitle
            logger.info("%s-%s-%s", ntitle, cleantitle, ndate)
            # Anything newer than the cutoff is assumed already collected.
            # (Bug fix: `01` is a py2-only octal-style literal; use 1.)
            if fileTime > datetime.datetime(2017, 1, 11):
                logger.info("******we have")
                continue
            item = collection.find_one(
                {"title": {
                    '$in': [cleantitle1, cleantitle2, cleantitle]
                }})
            item1 = collection.find_one({"title": ntitle})
            if item is not None or item1 is not None:
                logger.info("******already exists")
            else:
                logger.info("******missing, get it")
                crawler_rp(nurl, cleantitle, ndate, sourceId, source)
    finally:
        # Bug fix: the mongo connection was previously never closed.
        mongo.close()
def get_link(crawler, concurrent_num, contentcrawler):
    """Collect fresh article links from the lanxiongsports feed into the
    global URLS list, then spawn `concurrent_num` gevent workers that run
    process(contentcrawler).

    Walks at most 5 pages; stops early once a page after the first yields
    no unseen articles.
    """
    for page in xrange(5):
        url = 'http://lanxiongsports.com/mservice/?c=news&format=json&page=%s' % (
            page + 1)
        while True:  # retry the same page until the crawl succeeds
            result = crawler.crawl(url)
            if result['get'] == 'success':
                j = json.loads(result['content'])
                for item in j['items']:
                    if item.has_key('ad_code'):
                        # skip advertisement entries
                        continue
                    key = item['id']
                    title = item['title']
                    post = item['logo']
                    brief = item['summary']
                    category = item['_category']['name']
                    date = item['created_at']
                    if not isinstance(date, datetime.datetime):
                        logger.info('%s not datetime', date)
                        date = extract.extracttime(date)
                    href = 'http://lanxiongsports.com/?c=posts&a=view&id=%s' % key
                    linkDict = {
                        "href": href,
                        "title": title,
                        "post": post,
                        "brief": brief,
                        "category": category,
                        "date": date,
                    }
                    # queue only links not already stored in mongo
                    mongo = db.connect_mongo()
                    collection_news = mongo.article.news
                    item = collection_news.find_one({
                        "link": href,
                        'title': title
                    })
                    if item is None:
                        # logger.info( 'not exists %s ,%s '%(href,title))
                        URLS.append(linkDict)
                    else:
                        logger.info('already exists %s , %s', href, title)
                    mongo.close()
                break
        if len(URLS) == 0 and page > 0:
            logger.info('page %s got no fresh news,quiting............',
                        page + 1)
            break
    threads = [
        gevent.spawn(process, contentcrawler)
        for i in xrange(concurrent_num)
    ]
    gevent.joinall(threads)
def process(content):
    """Parse the SZSE announcement JSONP payload and insert unseen items.

    Returns the number of newly inserted announcements.
    (Bug fix: the original computed `cnt` but fell off the end without
    returning it -- callers only ever saw the early `return 0`.)
    """
    # strip the JS assignment wrapper ("var szzbAffiches=...;")
    res = content.replace('var szzbAffiches=', '')[:-2]
    # SECURITY NOTE(review): eval() on remote content is dangerous; kept
    # for compatibility with the feed's JS-literal format, but this should
    # move to a real JSON parse once the feed format is confirmed.
    infos = eval(res.decode("gbk").strip())
    cnt = 0
    if len(infos) == 0:
        return 0
    for info in infos:
        if len(info) < 4:
            # malformed row; expect [stockid, path, title, ..., time]
            continue
        try:
            stockid = info[0]
            filelink = "http://disclosure.szse.cn/" + info[1]
            filetitle = info[2]
            fileTime = extract.extracttime(info[-1])
            content = {
                'stockExchangeId': 3,
                'source': 13402,
                'stockSymbol': str(stockid),
                'title': filetitle,
                'link': filelink,
                # shift local time to UTC (assumes +8 offset)
                "date": fileTime - datetime.timedelta(hours=8),
                'createTime': datetime.datetime.now(),
            }
            # check mongo data if link is existed
            mongo = db.connect_mongo()
            collection = mongo.stock.announcement
            item = collection.find_one({"link": filelink})
            if item is None:
                collection.insert(content)
                cnt += 1
            else:
                logger.info("already exists file: %s", filelink)
            mongo.close()
            logger.info("Stock: %s, file: %s|%s|%s", stockid, filetitle,
                        fileTime, filelink)
        except Exception as e:  # py2.6+/py3-compatible except syntax
            logger.info(e)
            logger.info("cannot get info")
    return cnt
def process(content, flag, type):
    """Extract links/titles/dates from a Baidu news search result page and
    queue unseen ones into the global URLS list.

    flag == "all" queues links even when they already exist in mongo;
    `type` selects which selector layout ('title' vs default) holds the date.
    """
    if content.find("result") >= 0:
        # logger.info(content)
        # "bamy" is a sentinel injected for the whitespace separators so
        # the date can later be split out on "bamybamy".
        # NOTE(review): the replaced character is presumably a
        # non-breaking space from the page -- confirm against a live page.
        d = pq(html.fromstring(content.replace(" ", "bamy").decode("utf-8")))
        for a in d('div> div.result'):
            try:
                link = d(a)('h3> a').attr("href")
                # collapse all internal whitespace out of the title
                title = "".join(d(a)('h3> a').text().split())
                if title is not None and title.strip() != "":
                    if type == 'title':
                        ndate = d(a)('div.c-title-author').text().split("bamybamy")[1].replace("查看更多相关新闻>>", "").strip()
                    else:
                        ndate = d(a)('.c-author').text().split("bamybamy")[1].replace("查看更多相关新闻>>", "").strip()
                    newsdate = extract.extracttime(ndate)
                    # shift to UTC only when a date was actually parsed
                    newsdate = newsdate - datetime.timedelta(hours=8) if newsdate is not None else newsdate
                    logger.info("Link: %s is right news link %s|%s|%s", link, title, ndate, type)
                    # check both news collections for an existing entry
                    mongo = db.connect_mongo()
                    collection_news = mongo.article.news
                    item = collection_news.find_one({'$or': [{"link": link}, {'title': title}]})
                    collection_news_more = mongo.article.news_more
                    item2 = collection_news_more.find_one({'$or': [{"link": link}, {'title': title}]})
                    mongo.close()
                    if ((item is None and item2 is None) or flag == "all") and link not in URLS:
                        linkmap = {
                            "link": link,
                            "title": title,
                            "newsdate": newsdate
                        }
                        URLS.append(linkmap)
                    else:
                        logger.info('already exists %s', title)
                    # (commented-out legacy add_companyIds/add_newsdate
                    # back-fill block removed for clarity)
                else:
                    pass
            except Exception, e:
                logger.info(e)
                logger.info("cannot get link")
def process(content):
    """Parse the NEEQ announcement JSONP payload and insert unseen items.

    Returns the number of newly inserted announcements.
    (Bug fixes: `cnt` was computed but never returned on the normal path,
    and the mongo connection was never closed.)
    """
    # strip the JSONP wrapper ("null([...]);")
    res = content.replace('null([', '')[:-2]
    j = json.loads(res)
    infos = j["listInfo"]["content"]
    cnt = 0
    if len(infos) == 0:
        return cnt
    mongo = db.connect_mongo()
    try:
        collection = mongo.stock.announcement
        for info in infos:
            try:
                stockid = info["companyCd"]
                stockName = info["companyName"]
                filelink = "http://www.neeq.com.cn" + info["destFilePath"]
                filetitle = info["disclosureTitle"]
                # upDate.time holds a numeric timestamp; extracttime parses it
                fileTime = extract.extracttime(str(info["upDate"]["time"]))
                logger.info("Stock: %s|%s, file: %s|%s|%s", stockid,
                            stockName, filetitle, filelink, fileTime)
                item = collection.find_one({"link": filelink})
                if item is None:
                    item = {
                        'stockExchangeId': 1,
                        'source': 13400,
                        'stockSymbol': str(stockid),
                        'title': filetitle,
                        'link': filelink,
                        # shift local time to UTC (assumes +8 offset)
                        "date": fileTime - datetime.timedelta(hours=8),
                        'createTime': datetime.datetime.now(),
                    }
                    collection.insert(item)
                    cnt += 1
                    logger.info("Stock: %s|%s, file: %s|%s|%s", stockid,
                                stockName, filetitle, filelink, fileTime)
            except Exception as e:  # py2.6+/py3-compatible except syntax
                logger.info(e)
                logger.info("cannot get info")
    finally:
        # Bug fix: connection previously leaked.
        mongo.close()
    return cnt
def crawler_rp(nurl, nctitle, ndate, sourceId, source):
    """Store a PDF announcement link into mongo.stock.announcement.

    nurl     -- attachment URL; ignored unless it looks like a PDF link
    nctitle  -- announcement title
    ndate    -- date string, parsed via extract.extracttime
    sourceId -- stock symbol (stored as string)
    source   -- numeric source id
    """
    filelink = nurl
    if filelink is not None and filelink.find("pdf") >= 0:
        fileTime = extract.extracttime(ndate)
        # Robustness fix: extracttime returns None for unparseable dates
        # (other parsers in this file guard for exactly this); fall back
        # to "now" instead of raising TypeError on the subtraction below.
        if fileTime is None:
            fileTime = datetime.datetime.now()
        content = {
            'stockExchangeId': 1,
            'source': source,
            'stockSymbol': str(sourceId),
            'title': nctitle,
            'link': filelink,
            # shift local time to UTC (assumes +8 offset)
            "date": fileTime - datetime.timedelta(hours=8),
            'createTime': fileTime - datetime.timedelta(hours=8),
        }
        logger.info(
            json.dumps(content, ensure_ascii=False, cls=util.CJsonEncoder))
        mongo = db.connect_mongo()
        collection = mongo.stock.announcement
        collection.insert(content)
        mongo.close()
def process_news(content, download_crawler):
    """Build news metadata from a WordPress-API-style article dict.

    NOTE(review): this chunk appears truncated -- it stops right after
    computing news_time and never builds or stores a news document.
    """
    # parameter is shadowed: a fresh non-proxy crawler is always used
    download_crawler = download.DownloadCrawler(use_proxy=False)
    category = None
    categoryNames = []
    key = content['id']
    type = TYPE
    title = content['title']
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        # duplicate title: nothing to do
        mongo.close()
        return
    newspost = content.get('featured_image').get('source')
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    # logger.info(post)
    tags = []
    for tag in content['tags']:
        tags.append(tag['name'])
    brief = content['seo']['description']
    try:
        post_time = content['modified_gmt']
        news_time = None
        # NOTE(review): str.find() returns 0 when 'T' is at index 0, which
        # is falsy -- this was probably meant to be `find('T') >= 0`.
        if post_time.find('T'):
            post_time = post_time.replace('T', ' ')
        news_time = extract.extracttime(post_time)
        logger.info("news-time: %s", news_time)
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()
def process_news(column, newsurl, content, newspost):
    """Parse a chinaventure article payload (content["news"]) and persist
    it as a news document via parser_mongo_util.save_mongo_news.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]
        # rebuild the canonical detail URL from the article id
        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key
        type = TYPE
        category = None
        categoryNames = []
        # channel 52 == funding news
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")
        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103
        tags = []
        if content.has_key("keywordList") is True and len(
                content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag[
                        "keyword"] is not None and tag["keyword"].strip(
                        ) != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])
        # NOTE(review): the first replace() argument was garbled by entity
        # decoding in the paste; &quot; is the presumed original -- confirm.
        title = content["news"]["title"].replace("&quot;", "\"")
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info(
                "***************************News existed!!!***********************"
            )
            mongo.close()
            return
        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"][
            "coverImg"]
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw,
                                                     download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = content["news"]["introduction"]
        post_time = content["news"]["updateAt"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()
        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # (commented-out duplicate existence check removed for clarity)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # shift local time to UTC (assumes +8 offset)
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # drop third-party hotlinked images
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        dnews["contents"] = dcontents  # NOTE(review): duplicated assignment in original
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # guard against future-dated articles
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    # logger.info("*************DONE*************")
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a toutiao-style article page: the body is embedded as an
    escaped JS string literal (`content: '...'`) that must be regex-extracted
    and entity-decoded before normal content extraction.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        # article id, e.g. ".../i1234567" -> "1234567"
        key = newsurl.split("/")[-1].replace("i", "")
        type = TYPE
        category = None
        title = d('head> title').text().strip()
        # pull the escaped article body out of the inline JS blob
        r = "content: '(.*?)',.*groupId"
        result = util.re_get_result(r.strip()[:-1], content)
        (b,) = result
        logger.info(b)
        # exit()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # normalize full-width commas before splitting
            for tag in articletags.replace("，", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        post = None
        brief = None
        news_time = None
        try:
            r1 = "time: '(.*?)'.*},.*tagInfo"
            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            pass
        if news_time is None:
            news_time = datetime.datetime.now()
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # shift local time to UTC (assumes +8 offset)
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        # Undo the HTML-entity escaping of the embedded body.
        # NOTE(review): the entity names were garbled by decoding in this
        # paste; &lt;/&gt;/&quot;/&#x3D; are the presumed originals -- confirm.
        bb = b.replace('&lt;', "<").replace("&gt;", ">").replace("&quot;", "\"").replace("&#x3D;", "=")
        logger.info(bb)
        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        # (large commented-out legacy manual-split parsing block removed
        # for clarity)
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mid = None
        if title is not None and len(dcontents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE*************%s",mid)
    return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse a JSON article payload (obj.current) and persist it as news.

    topic carries the post-time string; newspost is the cover image URL.
    """
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        j = json.loads(content)
        d = j["obj"]["current"]
        category = None
        categoryNames = []
        key = newsurl.split("?")[0].split("/")[-1]
        type = TYPE
        title = d["title"].strip()
        if title.find("融资") >= 0:
            # funding-related article
            type = 60001
            category = 60101
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        articletags = d["tags"]
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        brief = d["synopsis"]
        try:
            post_time = topic
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        article = d["content"]
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # shift local time to UTC (assumes +8 offset)
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            # NOTE(review): stored as str here while sibling parsers use
            # int(key) -- confirm which the consumers expect.
            "key_int": str(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # drop boilerplate/reprint notices
            if c["data"].find("追究法律责任") >= 0 or \
                    c["data"].find("details") >= 0 or \
                    c["data"].find("转载") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse an .shtml article page (pyquery selectors) and persist it.

    `column["category"]` drives type/category; 60003 marks opinion pieces.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode('utf-8', "ignore")))
        key = newsurl.split("/")[-1].replace(".shtml", "")
        type = TYPE
        if column["category"] == 60003:
            type = 60003
            category = 60107
        else:
            category = column["category"]
        title = d('div.content> div.main_c> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("meta[name='description']").attr("content")
        # dates are rendered as "2017.01.01"; normalize to dashes
        post_time = d(
            'div.content> div.main_c> div.article_info> div.infos> span.time'
        ).text().replace(".", "-")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        # else: news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
        article = d('div.content> div.main_c> div#content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # shift local time to UTC (assumes +8 offset)
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            # "sectors": [20]
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a blog-style article page and insert it if the link is new;
    type/category are inferred from breadcrumb text and title keywords.
    """
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    # NOTE(review): the content check is disabled -- debug leftover?
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].replace('.html', '')
        # type = TYPE
        category = None
        title = d('.single-post-title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return
        # NOTE(review): tags are collected here but dnews stores
        # original_tags=None below -- confirm whether this is intended.
        tags = []
        articletags = d(".mb-2 a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        postraw = None
        # (commented-out legacy poster/brief extraction removed for clarity)
        brief = None
        # last whitespace-separated token of .post-meta is the date
        news_time = d('.post-meta').text().split()[-1]
        news_time = extract.extracttime(news_time)
        today = datetime.datetime.now()
        if news_time is None or news_time > today:
            news_time = datetime.datetime.now()
        article = d('.post-body').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        flag, domain = url_helper.get_domain(newsurl)
        typeNames = d('.breadcrumb-item a').text()
        # 60001 == funding/finance news, 60005 == other
        TYPE = 60001 if typeNames.find(u'金融') >= 0 or typeNames.find(
            u'融资') >= 0 else 60005
        if title.find(u'融资') >= 0 or title.find(u'投资') >= 0:
            TYPE = 60001
            category = 60101
        else:
            category = None
        dnews = {
            # shift local time to UTC (assumes +8 offset)
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw,
                                                     download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            # existing link: only logged; the delete/re-insert update path
            # below is commented out
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})
        mongo.close()
        # logger.info("*************DONE*************")
        return
def process_news(item, url, content):
    """Parse an article page and persist it; when a link-list `item` dict
    is given its post_date/key/columns/post are used, otherwise metadata
    is derived from the page itself.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        # page title is "article | site"; keep the article part
        titleraw = d('head> title').text().strip()
        temp = titleraw.split("|")
        title = temp[0].strip()
        # title = d('h1.article-title').text().strip()
        if item is None:
            news_time = extractArticlePublishedDate.extractArticlePublishedDate(
                url, content)
            if news_time is None:
                news_time = datetime.datetime.now()
            key = url.split("/")[-1].replace(".html", "")
        else:
            news_time = extract.extracttime(item["post_date"])
            if news_time is None:
                news_time = datetime.datetime.now()
            key = item["key"]
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # skip when this source already stored the key...
        if collection_news.find_one({
                "source": SOURCE,
                "key_int": int(key)
        }) is not None:
            mongo.close()
            return
        # ...or another source already stored the same title
        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            mongo.close()
            return
        mongo.close()
        # column = d('div.article-source> span.article-tag-top').text()
        if isinstance(item, dict):
            column = item["columns"]
        else:
            column = None
        brief = d('div.article-digest').text()
        if column is not None and column.strip() != "":
            tags = column.split()
        else:
            tags = []
        # map column tags onto type/category
        categoryNames = []
        if "课堂" in tags or "专栏" in tags:
            TYPE = 60003
            category = 60107
        else:
            TYPE = 60001
            if "融资汇" in tags:
                category = 60101
                categoryNames.append("融资")
            elif "早期项目" in tags:
                categoryNames.append("早期项目")
                if title.find("融资") >= 0:
                    category = 60101
                    categoryNames.append("融资")
                else:
                    category = 60102
            elif "A轮后" in tags and title.find("融资") >= 0:
                category = 60101
                categoryNames.append("融资")
            elif "大公司" in tags:
                category = 60105
                categoryNames.append("大公司")
            elif "投行" in tags:
                category = 60104
                categoryNames.append("投资人观点")
            else:
                category = None
        # (commented-out extra tag collection removed for clarity)
        if isinstance(item, dict):
            postraw = item["post"]
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                postraw, download_crawler, SOURCE, key, "news")
        else:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, TYPE,
                    category, ":".join(tags))
        article = d('div.article-main > div.main-text').html()
        contents = extract.extractContents(url, article, document=False)
        # (commented-out legacy article_img / duplicate existence-check
        # blocks removed for clarity)
        flag, domain = url_helper.get_domain(url)
        dnews = {
            # shift local time to UTC (assumes +8 offset)
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # drop the column label and the promo QR-code image
            if column is not None and c["data"].strip() == column.strip():
                continue
            if c["data"].find("default/images/theme/company_code.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(
                brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(newsurl, content, newspost, download_crawler):
    """Parse one news detail page (h1/article template) and save it to MongoDB.

    newsurl          -- URL of the article page; stored as the article link.
    content          -- raw HTML bytes of the page.
    newspost         -- poster-image URL candidate for the article.
    download_crawler -- image downloader; when None, remote image URLs are
                        kept as-is instead of being mirrored.
    """
    if has_news_content(content):
        logger.info('here.')
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        category = None
        categoryNames = []
        Type = TYPE
        tags = []
        brief = None
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        # Deduplicate by title before doing any further work.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        # Numeric article id: trailing segment of the <article> element's id.
        key = d('article').attr('id').strip().split('-')[-1]
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            # Publication date is rendered as "YYYY年M月D日"; normalize to
            # zero-padded "YYYY-MM-DD" before handing it to extracttime.
            post_time = d("header> div> span> time").text().strip()
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2)
            if len(month) == 1:
                month = '0' + month
            day = res.group(3)
            if len(day) == 1:
                day = '0' + day
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Stored time is shifted -8h (presumably local CST -> UTC; verify
            # against the rest of the pipeline).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": Type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(
                brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        # "post" holds a URL when images are not mirrored, otherwise an id.
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
def process_news(column, newsurl, content):
    """Parse one article from a column listing (post_content template).

    `column` is a dict describing the listing section; its "column" value
    selects the article type/category. The article is deduplicated by title
    and saved via parser_mongo_util.save_mongo_news.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        category = None
        categoryNames = []
        # Column name decides article type/category (Python 2 dict.has_key).
        if column.has_key("column") and column["column"] in [
                "guandian", "guancha"
        ]:
            type = 60003
            category = 60107
        else:
            type = TYPE
        if column.has_key("column") and column["column"] in ["touzirongzi"]:
            category = 60101
            categoryNames.append("融资")
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        title = d('div#post_content> div> div#post_title').text().strip()
        if title is None or title.strip() == "":
            # Some templates render the title as <h1> instead of <div>.
            title = d('div#post_content> div> h1#post_title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        postraw = d('div#post_thumbnail> img').attr("src")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div#post_info> div> div#post_date').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div#post_description').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            # Stop at the site's disclaimer / paywall marker paragraph.
            if c["data"].find("不代表亿欧对观点赞同或支持") != -1 or c["data"] == "5元":
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a tmtpost-style article page (<article> template) and store it.

    English-tagged pieces are skipped; "商业价值杂志"-tagged pieces get a
    dedicated type/category. Deduplicated by title, saved via
    parser_mongo_util.save_mongo_news.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        category = None
        title = d('article> h1').text().strip()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # English-language articles are out of scope for this crawler.
        if "English" in tags or "english" in tags:
            logger.info("Englis not needed, get out!")
            return
        if "商业价值杂志" in tags:
            type = 60003
            category = 60107
        # Poster image from the OpenGraph meta tag.
        postraw = d("meta[property='og:image']").attr("content")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("article> p.post-abstract").text().strip().replace(
            '摘要: ', "")
        post_time = d('article> div.post-info> span.time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('article> div.inner').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # Deduplicate by title.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            # Skip the site's QR-code footer image.
            if c["data"] == "http://www.tmtpost.com/public/css/images/wzny_ewm.jpg":
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
        return
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    """Parse an article page (m-sd-post-box template) and store it.

    `sort` is the listing-section name; funding sections ("投融资") map to
    the dedicated funding type code 60001. Poster and brief are intentionally
    left None and recovered from the extracted contents afterwards.
    """
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        type = TYPE
        if sort.find("投融资") >= 0:
            type = 60001
        category = None
        title = d('div.m-sd-post-box> h1').text().strip()
        if title is None or title == "":
            return
        tags = []
        post = None
        brief = None
        try:
            post_time = d('span.time').text()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            news_time = datetime.datetime.now()
        # NOTE(review): extracttime may return None here and there is no
        # `if news_time is None` fallback like in sibling parsers -- verify.
        article = d('div.g-post-content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), type, category, brief, post)
        # Deduplicate by title.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse an article page ('div.post' template) and store it in MongoDB.

    Keywords (full-width commas normalized) become original_tags; reprint /
    licensing boilerplate paragraphs are filtered out of the body.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        key = newsurl.split("/")[-1]
        type = TYPE
        category = None
        title = d('div.post> div.post-title> h1.title').text().strip()
        if title is None or title == "":
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # Normalize full-width commas before splitting the keyword list.
            for tag in articletags.replace(",", ",").split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("meta[name='description']").attr("content")
        news_time = None
        try:
            post_time = d(
                'div.post> div.post-title> div> span.postclock').text()
            logger.info("post-time: %s", post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            pass
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.post> div.post-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # Deduplicate by title.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            # Skip reprint / licensing boilerplate paragraphs.
            if c["data"].find("转载请联系原出处") >= 0 or c["data"].find(
                    "网页转载须在文首") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mid = None
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Store one article taken from a JSON API payload.

    Unlike the HTML-scraping siblings, `content` here is a dict with keys
    id / title / keywords / description / pubdate / content (HTML string).
    Deduplicated by title, saved via parser_mongo_util.save_mongo_news.
    """
    if has_news_content(content):
        key = content["id"]
        type = TYPE
        category = None
        title = content["title"]
        # Deduplicate by title before doing any further work.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        try:
            tags = content["keywords"].split(",")
        except:
            tags = []
        postraw = newspost
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = content["description"]
        post_time = content["pubdate"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()
        # Round-trip the HTML fragment through pyquery for normalization.
        article = pq(content["content"]).html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost):
    """Parse an article page that is served in either GBK or UTF-8 and store it.

    The site has two templates with different charsets and different CSS
    selectors; the meta charset declaration is sniffed to pick the decoder
    (utfflag) and, from it, the title / date / body selectors.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # Pick the decoder by sniffing the declared charset.
        if content.find("charset=GBK") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
            utfflag = False
        key = newsurl.split("?")[0].split("/")[-1].replace(".shtml", "")
        type = TYPE
        category = None
        categoryNames = []
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        # The two templates place the title differently.
        if utfflag is True:
            title = d('article> div> h1').text().strip()
        else:
            title = d('div.titleH> h1').text().strip()
        logger.info("title: %s", title)
        # Deduplicate by title.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        postraw = newspost
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("meta[name='description']").attr("content")
        # Publication time also lives in template-specific locations.
        if utfflag is True:
            post_time = d('p.source> span.f-right').eq(0).text()
        else:
            post_time = d('div.titleH> p.zsp> span').eq(2).text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        if utfflag is True:
            article = d('div.post-text').html()
        else:
            article = d('div.contdiv').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            # Everything after the site slogan is footer junk.
            if c["data"].find("电商资讯第一入口") != -1:
                break
            # Nocontents: module-level blacklist of boilerplate paragraphs.
            if c["data"] in Nocontents:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(c["data"],
                                                                 download_crawler,
                                                                 SOURCE, key,
                                                                 "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse an article page (post-inner template) and store it in MongoDB.

    The "YYYY年MM月DD日" publication date is rewritten to "YYYY-MM-DD";
    reference / QQ-group boilerplate paragraphs are filtered out.
    """
    if has_news_content(content):
        logger.info('here')
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('div.post-inner> h1').text().strip()
        if title is None or title == "":
            return
        # Deduplicate by title before doing any further work.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            post_time = d("p.post-byline> time.published").text().strip()
            logger.info('时间:%s' % post_time)
            # Convert "YYYY年MM月DD日" to "YYYY-MM-DD".
            p = re.compile(u'(年|月)')
            post_time = p.sub('-', post_time).replace('日', '')
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.entry-inner').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            # Skip the references / QQ-group boilerplate paragraphs.
            if c['data'].find('文章相关引用及参考') >= 0 or c['data'].find(
                    '读者QQ群') >= 0:
                continue
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse an article page (div.subject template) and insert it into MongoDB.

    NOTE(review): unlike the sibling parsers, this one inserts directly via
    collection_news.insert() instead of parser_mongo_util.save_mongo_news,
    and uses the legacy parser_mysql_util.get_logo_id helper (no image
    dimensions) -- verify whether this is intentional.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip().replace(".shtml", "")
        type = TYPE
        category = None
        title = d('div.subject> h1').text().strip()
        tags = []
        post = newspost
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # Deduplicate by title.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Legacy helper: returns only the image id, no dimensions.
                    imgurl = parser_mysql_util.get_logo_id(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                        }
                    else:
                        # Image could not be mirrored; drop this paragraph.
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
        return
def process_news(column, newsurl, content, newsposttime, download_crawler):
    """Parse an article page (article-wrap template) and store it in MongoDB.

    `newsposttime` is a timestamp string forwarded by the listing crawler and
    takes precedence over the on-page date. Articles tagged "投资并购" get
    the funding category; every stored article is marked sector 10.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip()
        type = TYPE
        title = d('div.article-wrap> div.article-head> h1').text().strip()
        # Deduplicate by title before doing any further work.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")
        post = None
        brief = d("meta[name='description']").attr("content")
        news_time = None
        # Prefer the timestamp handed over by the listing crawler.
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            # Pages published today show today's date (or nothing); use now().
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift -8h when storing (presumably CST -> UTC; verify).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        # Rebuild the article body as ranked text / image paragraph records.
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # Images are always mirrored here (no download_crawler check).
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    # Image could not be mirrored; drop this paragraph.
                    continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief
        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one article page and store it in mongo (article.news).

    This variant filters iheima.com boilerplate (avatar images, repeated
    title/brief, timestamp line, copyright footer) out of the body.

    :param column: channel/column identifier (unused in this variant)
    :param newsurl: article URL; last segment minus ``.shtml`` is the key
    :param content: raw HTML bytes (UTF-8, errors ignored)
    :param newspost: unused here (post stays None until derived from body)
    :param download_crawler: crawler handle used when mirroring images
    """
    if has_news_content(content):
        try:
            d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        except:
            # Content may already be unicode or not byte-decodable.
            d = pq(html.fromstring(content))
        key = newsurl.split("/")[-1].replace(".shtml", "")
        type = TYPE
        category = None
        title = d('div.main-content> div.title').text().strip()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # NOTE(review): whitespace split() here, unlike the comma split
            # used by sibling parsers -- presumably matches this site; confirm.
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # post = d('div#post_thumbnail> img').attr("src")
        post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.author> span.time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        try:
            article = d('div.main-content').html()
        except:
            article = content
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedup by title.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift UTC+8 local time to UTC (convention used file-wide).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip boilerplate: avatar images, title/brief repeated in the
            # body, the timestamp line, and inline base64 images.
            if c["data"].find("http://app.iheima.com/?app=member&controller=avatar") != -1 or \
                    c["data"] == title or c["data"] == brief or c["data"].find(post_time) != -1 or \
                    c["data"].find("data:image/png;base64") != -1:
                continue
            # Copyright / like-counter lines mark the end of the article body.
            if c["data"].find("未经授权,转载必究") != -1 or c["data"].find("赞(...)") != -1:
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # Mirror the image locally; drop the paragraph if that fails.
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against bogus future timestamps scraped from the page.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, download_crawler): if has_news_content(content): logger.info('here') d = pq(html.fromstring(content.decode("utf-8", 'ignore'))) if d.text().find('embed') >= 0: # 排除视频文章 logger.info('not article:%s' % newsurl) return category = None categoryNames = [] key = newsurl.split("/")[-1].replace(".html", "") type = TYPE title = d('h1').text().strip() if title is None or title == "": return mongo = db.connect_mongo() collection_news = mongo.article.news if collection_news.find_one({'title': title}) is not None: mongo.close() return try: (posturl, width, height) = parser_mysql_util.get_logo_id_new( newspost, download_crawler, SOURCE, key, "news") except: posturl = None if posturl is not None: post = str(posturl) else: post = None tags = [] articletags = d("meta[name='keywords']").attr('content') if articletags is not None: for tag in articletags.split(","): if tag is not None and tag.strip( ) != "" and tag not in tags and tag != title: tags.append(tag) try: brief = d("meta[name='description']").attr("content") except: brief = None try: post_time_1 = d("div.wyt-post-content-meta> div> p ").find( 'span').text().strip() post_time_2 = d("div.wyt-post-content-meta> div").find( 'p').next().text().strip() if post_time_1: post_time = post_time_1 else: post_time = post_time_2 if re.match('\d{2}-\d{2}', post_time): # 匹配 03-19格式 post_time = str(time.localtime()[0]) + '-' + post_time news_time = extract.extracttime(post_time) logger.info("news-time: %s", news_time) except Exception, e: logger.info(e) news_time = datetime.datetime.now() if news_time is None: news_time = datetime.datetime.now() article = d('article.wyt-post-content').html() contents = extract.extractContents(newsurl, article, document=True) logger.info("%s, %s, %s, %s -> %s, %s. 
%s", key, title, news_time, ":".join(tags), category, brief, post) flag, domain = url_helper.get_domain(newsurl) dnews = { "date": news_time - datetime.timedelta(hours=8), "title": title, "link": newsurl, "createTime": datetime.datetime.now(), "source": SOURCE, "key": key, "key_int": int(key), "type": type, "original_tags": tags, "processStatus": 0, "companyIds": [], "category": category, "domain": domain, "categoryNames": categoryNames, } dcontents = [] rank = 1 if contents[0]['type'] == 'img': del contents[0] for c in contents: # logger.info("%s-%s",c["type"],c["data"]) if c['type'] == 'text': if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \ or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \ or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0: continue # if c['data'].find('译者') >= 0: # c['data'] = c['data'].split(' ')[0] # # if c['data'].find('来源') >= 0: # c['data'] = c['data'].split('|')[0] if c['data'].find('| 未央网') >= 0: c['data'] = c['data'].replace('| 未央网', ' ') dc = { 'rank': rank, 'content': c['data'], 'image': '', 'image_src': '', } else: if download_crawler is None: dc = { "rank": rank, "content": "", "image": "", "image_src": c["data"], } else: (imgurl, width, height) = parser_mysql_util.get_logo_id_new( c["data"], download_crawler, SOURCE, key, "news") if imgurl is not None: dc = { "rank": rank, "content": "", "image": str(imgurl), "image_src": "", "height": int(height), "width": int(width) } else: continue # logger.info(c["data"]) dcontents.append(dc) rank += 1 dnews['contents'] = dcontents if brief is None or brief.strip() == "": brief = util.get_brief_from_news(dcontents) if post is None or post.strip() == "": post = util.get_posterId_from_news(dcontents) if download_crawler is None: dnews["post"] = post else: dnews["postId"] = post dnews["brief"] = brief if news_time > datetime.datetime.now(): logger.info("Time: 
%s is not correct with current time", news_time) dnews["date"] = datetime.datetime.now() - datetime.timedelta( hours=8) logger.info( json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder)) if title is not None and len(contents) > 0: nid = parser_mongo_util.save_mongo_news(dnews) logger.info("Done: %s", nid) pass mongo.close()
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse a gbk-encoded (QQ-style markup) article page and store it.

    Fix vs. previous version: decode with errors="ignore" so a single
    malformed byte in the gbk payload no longer aborts the parse with
    UnicodeDecodeError (all sibling parsers in this file already do this).

    :param column: channel identifier (unused)
    :param newsurl: article URL; last segment minus ``.htm`` is the key
    :param content: raw HTML bytes, gbk encoded
    :param newspost: poster candidate (unused; post stays None here)
    :param topic: topic identifier (unused)
    :param download_crawler: replaced below by a fresh non-proxy crawler
    """
    if has_news_content(content):
        logger.info('here')
        # A dedicated, proxy-less crawler is used for image downloads.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # "ignore" keeps the parse alive on malformed gbk bytes.
        d = pq(html.fromstring(content.decode("gbk", "ignore")))
        key = newsurl.split("/")[-1].replace(".htm", "")
        type = TYPE
        category = None
        categoryNames = []
        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedup by title.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift UTC+8 local time to UTC (convention used file-wide).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Drop known ad/banner fragments.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against bogus future timestamps scraped from the page.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, topic, download_crawler): if has_news_content(content): logger.info('here') download_crawler = download.DownloadCrawler(use_proxy=False) # logger.info(content) # d = pq(html.fromstring(content.decode("gbk","ignore"))) utfflag = False if content.find("gb2312") == -1: d = pq(html.fromstring(content.decode("utf-8", "ignore"))) utfflag = True else: d = pq(html.fromstring(content.decode("gbk", "ignore"))) category = None categoryNames = [] key = newsurl.split("/")[-1].replace(".shtml", "") type = TYPE post = None if utfflag is True: title = d('div#titsize> strong').text().strip() else: title = d('div.titmain> h1').text().strip() # logger.info("title: %s", title) if title is None or title.strip() == "": title = d('div.texttitbox> h1').text().strip() logger.info("title: %s", title) mongo = db.connect_mongo() collection_news = mongo.article.news if collection_news.find_one({"title": title}) is not None: mongo.close() return tags = [] # articletags = d("meta[name='keywords']").attr("content") # if articletags is not None: # for tag in articletags.split(","): # if tag is not None and tag.strip() != "" and tag not in tags and tag != title: # tags.append(tag) # try: # brief = d('div.daodu> p').text().strip().replace("【数据猿导读】","") # except: # brief = None brief = None try: if utfflag is True: post_time = d("p.time> span.mh-title").text().strip() else: post_time = d("meta[property='og:release_date']").attr( "content").split("+")[0] logger.info(post_time) news_time = extract.extracttime(post_time) logger.info("news-time: %s", news_time) except Exception, e: logger.info(e) news_time = datetime.datetime.now() if utfflag is True: article = d('div.tbox.content').html() else: article = d('div.texttit_m1').html() contents = extract.extractContents(newsurl, article) logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief) flag, domain = url_helper.get_domain(newsurl) dnews = { "date": news_time - 
datetime.timedelta(hours=20), "title": title, "link": newsurl, "createTime": datetime.datetime.now(), "source": SOURCE, "key": key, "key_int": int(key), "type": type, "original_tags": tags, "processStatus": 0, # "companyId": None, "companyIds": [], "category": category, "domain": domain, "categoryNames": categoryNames } dcontents = [] rank = 1 for c in contents: if c["data"].find("fromgeek.com/awards/") >= 0 or \ c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0: continue if c["type"] == "text": dc = { "rank": rank, "content": c["data"], "image": "", "image_src": "", } else: # dc = { # "rank": rank, # "content": "", # "image": "", # "image_src": c["data"], # } if download_crawler is None: dc = { "rank": rank, "content": "", "image": "", "image_src": c["data"], } else: (imgurl, width, height) = parser_mysql_util.get_logo_id_new( c["data"], download_crawler, SOURCE, key, "news") if imgurl is not None: dc = { "rank": rank, "content": "", "image": str(imgurl), "image_src": "", "height": int(height), "width": int(width) } else: continue logger.info(c["data"]) dcontents.append(dc) rank += 1 dnews["contents"] = dcontents if brief is None or brief.strip() == "": brief = util.get_brief_from_news(dcontents) if post is None or post.strip() == "": post = util.get_posterId_from_news(dcontents) if download_crawler is None: dnews["post"] = post else: dnews["postId"] = post dnews["brief"] = brief if news_time > datetime.datetime.now(): logger.info("Time: %s is not correct with current time", news_time) dnews["date"] = datetime.datetime.now() - datetime.timedelta( hours=8) # collection_news.insert(dnews) mongo.close() if title is not None and len(contents) > 0: # logger.info("*************DONE*************") nid = parser_mongo_util.save_mongo_news(dnews) logger.info("Done: %s", nid) pass
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse an article page (da-title/data-article markup) and store it.

    A title containing "融资" (funding) re-types the record as funding news
    (type 60001 / category 60101).

    :param column: channel identifier (unused)
    :param newsurl: article URL; last segment minus ``.html`` is the key
    :param content: raw HTML bytes (UTF-8, errors ignored)
    :param newspost: poster-image URL candidate, mirrored via mysql util
    :param topic: topic identifier (unused)
    :param download_crawler: replaced below by a fresh non-proxy crawler
    """
    if has_news_content(content):
        logger.info('here')
        # A dedicated, proxy-less crawler is used for image downloads.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('div.da-title> h2').text().strip()
        # "融资" in the title marks a funding article.
        if title.find("融资") >= 0:
            type = 60001
            category = 60101
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedup by title.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            post_time = d("span.article-time").eq(0).text().strip()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        # extracttime() may return None without raising.
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # Shift UTC+8 local time to UTC (convention used file-wide).
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            # NOTE(review): key_int left None (siblings use int(key)) --
            # presumably this source's keys are not numeric; confirm.
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Drop site boilerplate: donation address, copyright notice,
            # and test-environment images.
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue
            if c["data"].find("8btctest1/custom/images") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against bogus future timestamps scraped from the page.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
def process_news(column, newsurl, content):
    """Store a short news flash as a two-paragraph article document.

    The flash body is stored as a fixed header paragraph followed by the
    page's description text; the record is persisted via
    parser_mongo_util.save_mongo_news (no title dedup in this variant).
    """
    if not has_news_content(content):
        return
    doc = pq(content)
    headline = doc('div#post_title').text()
    link = newsurl
    news_key = link.split('/')[-1]
    raw_time = doc('div#post_date').text()
    logger.info(raw_time)
    published = extract.extracttime(raw_time)
    if published is None:
        published = datetime.datetime.now()
    logger.info("title:%s, date:%s", headline, published)
    flag, domain = url_helper.get_domain(link)
    # Shift UTC+8 local time to UTC (convention used file-wide).
    record = {
        "date": published - datetime.timedelta(hours=8),
        "title": headline,
        "link": link,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": news_key,
        "key_int": int(news_key),
        "type": TYPE,
        "original_tags": [],
        "processStatus": 0,
        "companyIds": [],
        "category": None,
        "domain": domain,
        "categoryNames": []
    }
    paragraphs = []
    summary = doc('div#post_description').text()
    if summary is not None:
        paragraphs.append({
            "rank": 1,
            "content": "亿欧快讯",
            "image": "",
            "image_src": "",
        })
        paragraphs.append({
            "rank": 2,
            "content": summary.replace("【消息来源】", ""),
            "image": "",
            "image_src": "",
        })
        logger.info(summary)
    record["contents"] = paragraphs
    short_brief = util.get_brief_from_news(paragraphs)
    poster = util.get_posterId_from_news(paragraphs)
    record["postId"] = poster
    record["brief"] = short_brief
    # Guard against bogus future timestamps scraped from the page.
    if published > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", published)
        record["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
    nid = parser_mongo_util.save_mongo_news(record)
    logger.info("Done: %s", nid)
    return