def process_news(content, url, key, col):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        title = d('div.cj_content> div.cj_top> div.cj_tit> h2').text().strip().replace("&quot;", "\"")
        datecontent = d('div.cj_content> div.cj_top> div.cj_tit> p.fa').text()
        result = util.re_get_result('(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrect post time")
            return
        try:
            key_int = int(key)
        except (ValueError, TypeError):
            key_int = None
        brief = d("meta[name='description']").attr("content").strip()
        if col["column"] == "view":
            type = 60003
        else:
            type = TYPE
        categoryNames = []
        category = col["category"]
        if category == 60105:
            categoryNames.append("大公司")
        if category == 60101:
            categoryNames.append("融资")
        tags = []
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip() not in tags:
                    tags.append(keyword.strip())
        postraw = d('div.cj_content> div.cj_top> img.gg').attr("src")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.para_ycont> div.col-xs-12').html()
        contents = extract.extractContents(url, article)
        if collection_news.find_one({"link": url}) is not None:
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "",
                          "image_src": c["data"].replace("?imageView2/2/w/750/q/90", "")}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"].replace("?imageView2/2/w/750/q/90", ""),
                        download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("gbk")))
        key = newsurl.split("/")[-1].replace(".htm", "")
        type = TYPE
        category = None
        categoryNames = []
        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
               c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".htm", "")
        title = d('h1.title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = None
        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')
        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
               c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        # classify the article as funding news or not before assigning a type
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)
        dnews['type'] = TYPE
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        category = None
        title = d('.article_title p').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return
        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        postraw = newspost
        brief = None
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
        mongo.close()
        logger.info("*************DONE*************")
    return
def process_news(item, url, content):
    if has_news_content(content):
        # download_crawler is referenced below but never created in the
        # original function; instantiating it here mirrors the sibling
        # parsers (assumption).
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("gbk")))
        title = d('div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d('div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin').text().strip()
        result = util.re_get_result('(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            # the original left news_time as None here, which would crash the
            # date arithmetic below; fall back to the crawl time instead
            news_time = datetime.datetime.now()
        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if column is not None:
            tags = column.split()
        else:
            tags = []
        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time, news_time,
                    brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        contents = extract.extractContents(url, article)
        if collection_news.find_one({"link": url}) is not None:
            return
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, j_content, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = j_content['id']
        type = TYPE
        title = j_content['title']
        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']
        try:
            date = j_content['date']
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
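# j_content['date'] above is a 13-digit millisecond epoch; the original strips
# the milliseconds with string slicing and round-trips through
# strftime/strptime. A more direct equivalent, as a sketch (the helper name is
# mine; it keeps the same local-time semantics as time.localtime):
def epoch_ms_to_datetime(ms):
    """Convert a millisecond epoch (e.g. 1493078400000) to a local datetime."""
    return datetime.datetime.fromtimestamp(int(ms) // 1000)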
def process_news(content, news_key, url):
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon, post_date.tm_mday,
                                      post_date.tm_hour, post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time, news_time,
                    temp, category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        contents = extract.extractContents(url, article)
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler, force):
    if has_news_content(content):
        main = pq(content)('div.article_content')
        d = pq(main)
        key = newsurl.split("/")[-1].replace(".html", "")
        title = pq(content)('head> title').text().strip()
        logger.info("title: %s", title)
        brief = pq(content)("meta[name='description']").attr("content")
        result = util.re_get_result("var publishTime = new Date\(\"(.*?)\"\)", content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrect post time")
            logger.info(content)
            return
        categoryNames = []
        contents = extract.extractContents(newsurl, d.html())
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
            categoryNames.append("融资")
        else:
            category = None
        tags = []
        articletags = pq(content)("meta[name='keywords']").attr("content")
        if articletags is None:
            logger.info(content)
        else:
            # the original called .replace() before the None check, which
            # would raise on pages without a keywords meta tag
            articletags = articletags.replace(";", ",")
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags), brief)
        if force is True:
            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            collection_news.delete_many({"source": SOURCE, "key_int": int(key)})
            collection_news.delete_many({"title": title})
            mongo.close()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        # project tables: flatten each row of div.proj_table into a text line
        pjcontents = []
        trs = pq(content)('div.proj_table> table> tr')
        logger.info("*****len of trs %s", len(trs))
        for tr in trs:
            logger.info(tr)
            co = pq(tr).text()
            logger.info(co)
            if co is not None and co.strip() != "":
                pjcontents.append(co.replace(" ", ":"))
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"] == "/The End/":
                break
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        for pjc in pjcontents:
            dc = {"rank": rank, "content": pjc, "image": "", "image_src": ""}
            dcontents.append(dc)
            logger.info(pjc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
    return
def process(crawler):
    while True:
        if len(URLS) == 0:
            return
        linkDict = URLS.pop(0)
        retry = 0
        while True:
            retry += 1
            if retry > 6:
                break
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(html.fromstring(result['content'].decode("utf-8")))
                title = linkDict['title']
                key = url.split('/')[-1]
                category = d('.al-crumbs a:nth-child(2)').text()
                if category in categoryDict:
                    TYPE = categoryDict[category]['type']
                    category = categoryDict[category]['category']
                else:
                    TYPE = 60001
                    category = None
                brief = linkDict['brief']
                postraw = linkDict['post']
                tags = []
                news_time = d('.article__published').eq(0).text()
                news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %H:%M')
                flag, domain = url_helper.get_domain(url)
                dnews = {
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": None,
                    "type": TYPE,
                    "original_tags": tags,
                    "processStatus": 0,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }
                article = d('.article__content').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
                    else:
                        if download_crawler is None:
                            dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                        else:
                            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                                c["data"], download_crawler, SOURCE, key, "news")
                            if imgurl is None:
                                continue
                            dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                                  "height": int(height), "width": int(width)}
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents
                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    postraw, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)
                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post
                dnews["brief"] = brief
                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})
                if len(dcontents) > 1:
                    if item is None:
                        nid = parser_mongo_util.save_mongo_news(dnews)
                        logger.info("Done: %s", nid)
                    else:
                        logger.info("update %s", url)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, news_time,
                            category, " ".join(tags), brief, post)
                logger.info("*************DONE*************")
                break
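# process() drains the module-level URLS list until it is empty, so several
# workers can run it concurrently. A minimal driver sketch follows; the thread
# count, the run_workers name, and the use of DownloadCrawler as the page
# crawler are all assumptions, since the original scheduling code is not
# shown. Note also that the len-check/pop pair inside process() is not atomic,
# so a worker can still hit an IndexError when the list empties under it.
import threading

def run_workers(num_workers=4):
    crawlers = [download.DownloadCrawler(use_proxy=False) for _ in range(num_workers)]
    threads = [threading.Thread(target=process, args=(c,)) for c in crawlers]
    for t in threads:
        t.start()
    for t in threads:
        t.join()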
def process_news(column, newsurl, content, newspost, download_crawler): if has_news_content(content): key = content["id"] type = TYPE category = None title = content["title"] mongo = db.connect_mongo() collection_news = mongo.article.news if collection_news.find_one({"title": title}) is not None: mongo.close() return try: tags = content["keywords"].split(",") except: tags = [] postraw = newspost # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news") (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news") if posturl is not None: post = str(posturl) else: post = None brief = content["description"] post_time = content["pubdate"] news_time = extract.extracttime(str(post_time)) if news_time is None: news_time = datetime.datetime.now() article = pq(content["content"]).html() contents = extract.extractContents(newsurl, article) logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post) # exit() # mongo = db.connect_mongo() # collection_news = mongo.article.news # if collection_news.find_one({"title": title}) is not None: # mongo.close() # return flag, domain = url_helper.get_domain(newsurl) dnews = { "date": news_time - datetime.timedelta(hours=8), "title": title, "link": newsurl, "createTime": datetime.datetime.now(), "source": SOURCE, "key": key, "key_int": int(key), "type": type, "original_tags": tags, "processStatus": 0, # "companyId": None, "companyIds": [], "category": category, "domain": domain, "categoryNames": [] } dcontents = [] rank = 1 for c in contents: if c["type"] == "text": dc = { "rank": rank, "content": c["data"], "image": "", "image_src": "", } else: if download_crawler is None: dc = { "rank": rank, "content": "", "image": "", "image_src": c["data"], } else: (imgurl, width, height) = parser_mysql_util.get_logo_id_new( c["data"], download_crawler, SOURCE, key, "news") if imgurl is not None: dc = { "rank": rank, "content": "", "image": str(imgurl), "image_src": "", "height": int(height), "width": int(width) } else: continue # logger.info(c["data"]) dcontents.append(dc) rank += 1 dnews["contents"] = dcontents if brief is None or brief.strip() == "": brief = util.get_brief_from_news(dcontents) if post is None or post.strip() == "": post = util.get_posterId_from_news(dcontents) if download_crawler is None: dnews["post"] = post else: dnews["postId"] = post dnews["brief"] = brief if news_time > datetime.datetime.now(): logger.info("Time: %s is not correct with current time", news_time) dnews["date"] = datetime.datetime.now() - datetime.timedelta( hours=8) # collection_news.insert(dnews) mongo.close() # logger.info("*************DONE*************") nid = parser_mongo_util.save_mongo_news(dnews) logger.info("Done: %s", nid) return
def process_news(self, newsurl, content, download_crawler):
    dnews = {}
    if self.has_news_content(content):
        try:
            d = pq(html.fromstring(content.decode("utf-8")))
        except:
            d = pq(html.fromstring(content))
        try:
            key = re.findall('sn=(.*)?&', newsurl)[0]
        except:
            key = newsurl
        try:
            key_int = int(key)
        except (ValueError, TypeError):
            key_int = None
        news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        if news_time is None:
            news_time = datetime.datetime.now()
        r = "var msg_title = \"(.*?)\".*var msg_desc"
        result = util.re_get_result(r, content)
        if result:
            title, = result
            logger.info(title)
        else:
            title = None
        logger.info("title: %s", title)
        article = d('#page-content .rich_media_content').html()
        contents = self.extractWechatContents(article)
        r = "var msg_desc = \"(.*?)\".*var msg_cdn_url"
        result = util.re_get_result(r, content)
        if result:
            brief, = result
            logger.info(brief)
        else:
            brief = None
        tags = []
        try:
            articletags = d("meta[name='keywords']").attr("content")
            if articletags is not None:
                for tag in articletags.split():
                    if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                        tags.append(tag)
        except:
            pass
        logger.info("News: %s, %s, %s", key, title, news_time)
        try:
            wechatId = d('span.profile_meta_value').eq(0).text().strip()
            wechatName = d('strong.profile_nickname').text().strip()
        except:
            wechatId = None
            wechatName = None
        logger.info("wechatId: %s, wechatName: %s", wechatId, wechatName)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": self.SOURCE,
            "key": key,
            "key_int": key_int,
            "type": self.TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": self.CATEGORY,
            "domain": domain,
            "categoryNames": [],
            "wechatId": wechatId,
            "wechatName": wechatName
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, self.SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None:
            brief = util.get_brief_from_news(dcontents)
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief.decode("utf-8")[:100]
        logger.info("*************DONE*************")
    return dnews
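# Unlike the module-level parsers in this file, this method returns the
# assembled document instead of saving it. A caller sketch (the wrapper name
# and the idea that the caller persists via save_mongo_news are assumptions
# grounded only in how the other parsers here save their results):
def save_wechat_article(parser, newsurl, content, download_crawler=None):
    dnews = parser.process_news(newsurl, content, download_crawler)
    if dnews.get("contents"):
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return nid
    return None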
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # pages are served in either UTF-8 or GBK; sniff the charset first
        if content.find("charset=GBK") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
            utfflag = False
        key = newsurl.split("?")[0].split("/")[-1].replace(".shtml", "")
        type = TYPE
        category = None
        categoryNames = []
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        if utfflag is True:
            title = d('article> div> h1').text().strip()
        else:
            title = d('div.titleH> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        postraw = newspost
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("meta[name='description']").attr("content")
        if utfflag is True:
            post_time = d('p.source> span.f-right').eq(0).text()
        else:
            post_time = d('div.titleH> p.zsp> span').eq(2).text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        if utfflag is True:
            article = d('div.post-text').html()
        else:
            article = d('div.contdiv').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("电商资讯第一入口") != -1:
                break
            if c["data"] in Nocontents:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, d_map, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        type = TYPE
        key = d('div#sb-site> article').attr('data-id')
        title = d('div#sb-site> article> section#article-header> h2> strong> a').text().strip()
        newspost = d('div#sb-site> article> section#article-image> div> figure> img').attr('src')
        logger.info('newspost:%s' % newspost)
        newsurl = d_map['link']
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = d("meta[name='description']").attr('content')
        post_time = d('div#sb-site> article> section#article-meta> span> em').text().strip()
        news_time = None
        is_re = re.search('(\d{2}-\d{2}-\d{4})', post_time)
        is_re2 = re.search('(\d) hours ago', post_time)
        if is_re:
            news_time = datetime.datetime.strptime(is_re.group(1), "%d-%m-%Y")
        elif is_re2:
            news_time = datetime.datetime.now() - datetime.timedelta(hours=int(is_re2.group(1)))
        elif post_time.find('a moment') >= 0:
            news_time = datetime.datetime.now()
        if news_time is None:
            # no recognizable timestamp: the original left news_time as None,
            # which would crash the date arithmetic below; fall back to now
            news_time = datetime.datetime.now()
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        article = d('div#sb-site> article> section#article-content> div.post-content> div.row').html()
        is_re3 = re.search("(<strong>DailySocial\.id.*?</p>)", article, re.S)
        if is_re3:
            article = article.replace(is_re3.group(1), '')
        contents = extract.extractContents(newsurl, article, document=False)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("Also Read") >= 0 or c['data'].find('function()') >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
    return
def run(url):
    proxy = get_proxy()
    if proxy is None:
        logger.info("Error: no proxy!")
        return
    logger.info(proxy)
    response = request(proxy, url)
    if response is None:
        return
    text = response.text
    key = url.split("/")[-1].strip()
    download_crawler = None
    d = pq(html.fromstring(text.decode("utf-8")))
    title = d('h1.article-title').text().strip()
    str_time = d('span.time').text().strip()
    str_content = d('div.article-content').html()
    brief = d("meta[name='description']").attr("content")
    logger.info(title)
    logger.info(str_time)
    contents = extract.extractContents(url, str_content)
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return
    flag, domain = url_helper.get_domain(url)
    news_time = datetime.datetime.strptime(str_time, "%Y-%m-%d %H:%M")
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),
        "title": title,
        "link": url,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": None,
        "type": 60001,
        "original_tags": [],
        "processStatus": 0,
        "companyIds": [],
        "domain": domain,
        "category": None,
        "categoryNames": []
    }
    dcontents = []
    rank = 1
    for c in contents:
        if c["type"] == "text":
            dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
        else:
            if download_crawler is None:
                dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
            else:
                imgurl = parser_mysql_util.get_logo_id(c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": ""}
        logger.info(c["data"])
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents
    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief
    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
    id = collection_news.insert(dnews)
    mongo.close()
    logger.info("*************DONE************* %s", id)
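# run() quietly returns when no proxy is available or the proxied request
# fails, so a batch driver just iterates and logs. A sketch only: crawl_urls
# and the notion of a URL batch are my assumptions; run() is from this file.
def crawl_urls(urls):
    for url in urls:
        try:
            run(url)
        except Exception:
            # one bad article page should not stop the whole batch
            logger.exception("failed to process %s", url)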
def process_news(content, url):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d('div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon, post_Date.tm_mday,
                                      post_Date.tm_hour, post_Date.tm_min, post_Date.tm_sec)
        if collection_news.find_one({"link": url}) is not None:
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except (ValueError, TypeError):
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()
        if column is not None:
            tags = column.split()
        else:
            tags = []
        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip() not in tags and \
                   keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())
        postraw = d("link[rel='image_src']").attr("href")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        contents = extract.extractContents(url, article)
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "",
                          "image_src": c["data"].replace("?imageView2/2/w/750/q/90", "")}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"].replace("?imageView2/2/w/750/q/90", ""),
                        download_crawler, SOURCE, key, "news")
                    if imgurl is None:
                        continue
                    dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                          "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler): if has_news_content(content): d = pq(html.fromstring(content.decode("utf-8", "ignore"))) key = newsurl.split("/")[-1].replace(".html", "") type = TYPE category = None title = d('article> h1').text().strip() tags = [] articletags = d("meta[name='keywords']").attr("content") if articletags is not None: for tag in articletags.split(","): if tag is not None and tag.strip( ) != "" and tag not in tags and tag != title: tags.append(tag) if "English" in tags or "english" in tags: logger.info("Englis not needed, get out!") return if "商业价值杂志" in tags: type = 60003 category = 60107 # post = d('div#post_thumbnail> img').attr("src") # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news") # if posturl is not None: # post = str(posturl) # else: # post = None postraw = d("meta[property='og:image']").attr("content") # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news") (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news") if posturl is not None: post = str(posturl) else: post = None brief = d("article> p.post-abstract").text().strip().replace( '摘要: ', "") post_time = d('article> div.post-info> span.time').text() logger.info(post_time) news_time = extract.extracttime(post_time) if news_time is None: news_time = datetime.datetime.now() article = d('article> div.inner').html() contents = extract.extractContents(newsurl, article) logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post) # exit() mongo = db.connect_mongo() collection_news = mongo.article.news if collection_news.find_one({"title": title}) is not None: mongo.close() return flag, domain = url_helper.get_domain(newsurl) dnews = { "date": news_time - datetime.timedelta(hours=8), "title": title, "link": newsurl, "createTime": datetime.datetime.now(), "source": SOURCE, "key": key, "key_int": int(key), "type": type, "original_tags": tags, "processStatus": 0, # "companyId": None, "companyIds": [], "category": category, "domain": domain, "categoryNames": [] # "sectors": [20] } dcontents = [] rank = 1 for c in contents: if c["data"] == "http://www.tmtpost.com/public/css/images/wzny_ewm.jpg": continue if c["type"] == "text": dc = { "rank": rank, "content": c["data"], "image": "", "image_src": "", } else: if download_crawler is None: dc = { "rank": rank, "content": "", "image": "", "image_src": c["data"], } else: (imgurl, width, height) = parser_mysql_util.get_logo_id_new( c["data"], download_crawler, SOURCE, key, "news") if imgurl is not None: dc = { "rank": rank, "content": "", "image": str(imgurl), "image_src": "", "height": int(height), "width": int(width) } else: continue # logger.info(c["data"]) dcontents.append(dc) rank += 1 dnews["contents"] = dcontents if brief is None or brief.strip() == "": brief = util.get_brief_from_news(dcontents) if post is None or post.strip() == "": post = util.get_posterId_from_news(dcontents) if download_crawler is None: dnews["post"] = post else: dnews["postId"] = post dnews["brief"] = brief if news_time > datetime.datetime.now(): logger.info("Time: %s is not correct with current time", news_time) dnews["date"] = datetime.datetime.now() - datetime.timedelta( hours=8) if title is not None and len(contents) > 0: # collection_news.insert(dnews) nid = parser_mongo_util.save_mongo_news(dnews) logger.info("Done: %s", nid) mongo.close() # logger.info("*************DONE*************") 
return
"image": str(imgurl), "image_src": "", "height": int(height), "width": int(width) } else: continue # logger.info(c["data"]) dcontents.append(dc) rank += 1 dnews["contents"] = dcontents if brief is None or brief.strip() == "": brief = util.get_brief_from_news(dcontents) if post is None or post.strip() == "": post = util.get_posterId_from_news(dcontents) if download_crawler is None: dnews["post"] = post else: dnews["postId"] = post dnews["brief"] = brief if news_time > datetime.datetime.now(): logger.info("Time: %s is not correct with current time", news_time) dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8) # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder)) if title is not None and len(contents) > 0: nid = parser_mongo_util.save_mongo_news(dnews) logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newsposttime, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip()
        type = TYPE
        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")
        post = None
        brief = d("meta[name='description']").attr("content")
        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            dt = datetime.date.today()
            post_time = d('div.article-wrap> div.article-head> p> span.article-time').text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                      "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief
        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]
        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key
        type = TYPE
        category = None
        categoryNames = []
        if "newsChannelId" in content["news"]:
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")
        if "tagName" in content["news"]:
            if content["news"]["tagName"] == '人物':
                category = 60103
        tags = []
        if len(content.get("keywordList", [])) > 0:
            for tag in content["keywordList"]:
                if tag.get("keyword") is not None and tag["keyword"].strip() != "" \
                        and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])
        # the "&quot;" entity was mangled in the dump; restored here
        title = content["news"]["title"].replace("&quot;", "\"")
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info("News existed!!!")
            mongo.close()
            return
        postraw = "http://pic.chinaventure.com.cn/" + content["news"]["coverImg"]
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = content["news"]["introduction"]
        post_time = content["news"]["updateAt"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()
        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # skip hotlinked Sohu images, which cannot be mirrored
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        if d.text().find('embed') >= 0:
            # skip video pages, which are not articles
            logger.info('not article: %s' % newsurl)
            return
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except Exception:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None
        try:
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find('span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find('p').next().text().strip()
            post_time = post_time_1 if post_time_1 else post_time_2
            if re.match('\d{2}-\d{2}', post_time):
                # "03-19"-style dates lack a year; prepend the current one
                post_time = str(time.localtime()[0]) + '-' + post_time
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('article.wyt-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        if contents and contents[0]['type'] == 'img':
            del contents[0]
        for c in contents:
            if c['type'] == 'text':
                # drop boilerplate lines (share counters, WeChat plugs, copyright notes)
                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 \
                        or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 \
                        or c['data'].find('微信扫描') >= 0 or c['data'].find('点击获取完整版报告') >= 0 \
                        or c['data'].find('作者原创,微信号') >= 0:
                    continue
                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')
                dc = {'rank': rank, 'content': c['data'], 'image': '', 'image_src': ''}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info(json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip().replace(".html", "").replace('detail_', '')
        type = TYPE
        category = None
        title = d('div.left.zb-n> h1').text().strip()
        tags = []
        postraw = newspost
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d('div.left.zb-n> p.gy').text().strip()
        news_time = datetime.datetime.now()
        article = d('div.left.zb-n').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info('already exists %s', title)
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        start = False
        for c in contents:
            # skip page chrome until the block containing both title and brief
            if start is False and c["data"].find(brief) >= 0 and c["data"].find(title) >= 0:
                start = True
                continue
            if start is False:
                continue
            if c["data"].find("-END-") >= 0:
                break
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
    else:
        logger.info('has no news content %s', newsurl)
    return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # pages are served as either utf-8 or gb2312; sniff the charset first
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".shtml", "")
        type = TYPE
        post = None
        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
        if title is None or title.strip() == "":
            title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        brief = None
        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr("content").split("+")[0]
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=20),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # drop site promo images and award banners
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip().replace(".shtml", "")
        type = TYPE
        category = None
        title = d('div.subject> h1').text().strip()
        tags = []
        post = newspost
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyId": None, "companyIds": [], "category": category,
            "domain": domain
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    imgurl = parser_mysql_util.get_logo_id(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": ""}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-2].replace(".html", "")
        type = TYPE
        title = d('h1.single-title').text().strip()
        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None
        news_time = datetime.datetime.now()
        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": None,
            "type": type, "original_tags": tags, "processStatus": 1,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            # paywalled stories are flagged so they can be handled downstream
            if c["data"].find("Continue reading this story with a subscription to DealStreetAsia") >= 0:
                processStatus = -5
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
    return
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        type = TYPE
        if sort.find("投融资") >= 0:
            type = 60001
        category = None
        title = d('div.mod-head> h1').text().strip()
        if title is None or title == "":
            return
        tags = []
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None
        try:
            post_time = d('span.time> time').text()
            logger.info(post_time)
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M")
        except Exception:
            news_time = datetime.datetime.now()
        article = d('div.mod-body> div.content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), type, category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyId": None, "companyIds": [], "category": category,
            "domain": domain, "categoryNames": [], "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
def process_news(column, newsurl, content, newspost, download_crawler): if has_news_content(content): d = pq(html.fromstring(content.decode("utf-8","ignore"))) key = newsurl.split("/")[-1].replace("i","") type = TYPE category = None title = d('head> title').text().strip() r = "content: '(.*?)',.*groupId" result = util.re_get_result(r.strip()[:-1], content) (b,) = result logger.info(b) # exit() tags = [] articletags = d("meta[name='keywords']").attr("content") if articletags is not None: for tag in articletags.replace(",", ",").split(","): if tag is not None and tag.strip() != "" and tag not in tags and tag != title: tags.append(tag) post = None brief = None news_time = None try: r1 = "time: '(.*?)'.*},.*tagInfo" result = util.re_get_result(r1, content) (post_time,) = result logger.info(post_time) news_time = extract.extracttime(post_time) logger.info("news-time: %s", news_time) except: pass if news_time is None: news_time = datetime.datetime.now() # exit() # article = d('div.post> div.post-content').html() # contents = extract.extractContents(newsurl, article) logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post) # exit() mongo = db.connect_mongo() collection_news = mongo.article.news if collection_news.find_one({"title": title}) is not None: mongo.close() return flag, domain = url_helper.get_domain(newsurl) dnews = { "date": news_time - datetime.timedelta(hours=8), "title": title, "link": newsurl, "createTime": datetime.datetime.now(), "source": SOURCE, "key": key, "key_int": int(key), "type": type, "original_tags": tags, "processStatus": 0, # "companyId": None, "companyIds": [], "category": 60101, "domain": domain, "categoryNames": [], # "sectors": [20] } dcontents = [] rank = 1 bb = b.replace('<', "<").replace(">",">").replace(""","\"").replace("=","=") logger.info(bb) contents = extract.extractContents(newsurl, bb, document=False) for c in contents: logger.info(c["data"]) if c["type"] == "text": dc = { "rank": rank, "content": c["data"], "image": "", "image_src": "", } else: # dc = { # "rank": rank, # "content": "", # "image": "", # "image_src": c["data"], # } if download_crawler is None: dc = { "rank": rank, "content": "", "image": "", "image_src": c["data"], } else: (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news") if imgurl is not None: dc = { "rank": rank, "content": "", "image": str(imgurl), "image_src": "", "height": int(height), "width": int(width) } else: continue dcontents.append(dc) rank += 1 # for c in b.replace("<div><p>",'').replace("</p></div>","").split('</p><p>'): # logger.info(c) # if c.find("转载务必署名来源")>=0 or c.find("</p></div>")>=0 or c.find("<div><p> ")>=0: # continue # if c.find("img") >= 0: # c = re.sub(r'<(.*)?img.*"0">',"",c) # dc = { # "rank": rank, # "content": c, # "image": "", # "image_src": "", # } # else: # dc = { # "rank": rank, # "content": c, # "image": "", # "image_src": "", # } # # else: # # if download_crawler is None: # # dc = { # # "rank": rank, # # "content": "", # # "image": "", # # "image_src": c, # # } # # else: # # (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c, download_crawler, SOURCE, key, "news") # # if imgurl is not None: # # dc = { # # "rank": rank, # # "content": "", # # "image": str(imgurl), # # "image_src": "", # # "height": int(height), # # "width": int(width) # # } # # else: # # continue # # logger.info(c) # dcontents.append(dc) # rank += 1 dnews["contents"] = dcontents if brief is None or brief.strip() == 
"": brief = util.get_brief_from_news(dcontents) if post is None or post.strip() == "": post = util.get_posterId_from_news(dcontents) if download_crawler is None: dnews["post"] = post else: dnews["postId"] = post dnews["brief"] = brief if news_time > datetime.datetime.now(): logger.info("Time: %s is not correct with current time", news_time) dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8) mid = None if title is not None and len(dcontents) > 0: # mid = collection_news.insert(dnews) nid = parser_mongo_util.save_mongo_news(dnews) logger.info("Done: %s", nid) pass mongo.close() # logger.info("*************DONE*************%s",mid) return
def process_news(newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        category = None
        categoryNames = []
        Type = TYPE
        tags = []
        brief = None
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        key = d('article').attr('id').strip().split('-')[-1]
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except Exception:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            post_time = d("header> div> span> time").text().strip()
            # dates are rendered as "YYYY年M月D日"; rebuild a zero-padded Y-M-D string
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2).zfill(2)
            day = res.group(3).zfill(2)
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": Type, "original_tags": tags, "processStatus": 0,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c['type'] == 'text':
                dc = {'rank': rank, 'content': c['data'], 'image': '', 'image_src': ''}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info(json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
def process_news(content, news_key, url, news_posttime):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") \
            + datetime.timedelta(hours=15)
        key = news_key
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            if postraw.find("techcrunch.opengraph.default.png") >= 0:
                # placeholder og:image; treat as no poster
                postraw = None
        except Exception:
            postraw = None
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        divtags = d('div.tags> div.tag-item')
        # keep only non-empty tag labels (the original tested "is not None",
        # which never filters out a stripped string)
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags
                if pq(divtag)('a.tag').text().strip()]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time,
                    news_time, ":".join(tags), category)
        article = d('div.article-entry.text').html()
        contents = extract.extractContents(url, article)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
        mongo.close()
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": TYPE, "original_tags": tags, "processStatus": 1,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
    logger.info("Done")
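# Every parser above ends with the same two steps: clamp timestamps that lie
# in the future, then shift to the stored timezone by subtracting 8 hours. A
# minimal sketch of that logic as one helper -- `normalize_news_date` is a
# hypothetical name, not an existing util; it assumes the module-level logger
# and the 8-hour convention used throughout this file.
def normalize_news_date(news_time):
    """Return the value these parsers store in dnews["date"] (sketch)."""
    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        news_time = datetime.datetime.now()
    return news_time - datetime.timedelta(hours=8)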
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            type = 60001
            category = 60101
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None
        try:
            post_time = d("span.article-time").eq(0).text().strip()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": None,
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # drop wallet-address plugs, copyright notices and site chrome
            if c["data"].find("btm地址") >= 0 or c["data"].find("版权声明") >= 0:
                continue
            if c["data"].find("8btctest1/custom/images") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('div.post-inner> h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None
        try:
            post_time = d("p.post-byline> time.published").text().strip()
            logger.info('raw time: %s' % post_time)
            # convert Chinese date units (年/月/日) into Y-M-D so extracttime can parse it
            p = re.compile(u'(年|月)')
            post_time = p.sub('-', post_time).replace('日', '')
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.entry-inner').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title, "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE, "key": key, "key_int": int(key),
            "type": type, "original_tags": tags, "processStatus": 0,
            "companyIds": [], "category": category, "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c['data'].find('文章相关引用及参考') >= 0 or c['data'].find('读者QQ群') >= 0:
                continue
            if c['type'] == 'text':
                dc = {'rank': rank, 'content': c['data'], 'image': '', 'image_src': ''}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl),
                              "image_src": "", "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info(json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
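# Duplicate detection is repeated in nearly every parser above: a story is
# skipped when its title already exists in mongo.article.news. A hedged
# sketch of that check as one helper -- `is_duplicate_title` is a hypothetical
# name; it assumes the same db.connect_mongo() used throughout this file and
# closes the connection even if the query raises.
def is_duplicate_title(title):
    """True if an article with this title is already stored (sketch)."""
    mongo = db.connect_mongo()
    try:
        return mongo.article.news.find_one({"title": title}) is not None
    finally:
        mongo.close()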