def process_news(column, content, msg, download_crawler):
    """Parse one Vulcan Post article message into news fields.

    :param column: section/column name (unused in the visible body)
    :param content: raw HTML bytes of the article page
    :param msg: dict with 'title', 'link', 'brief', 'post', 'newsDate'
    :param download_crawler: ignored — immediately replaced below
    NOTE(review): the visible body computes title/post/news_time but never
    builds or saves a news document — confirm the rest of this function was
    not lost when the file was chunked.
    """
    # The passed-in crawler is discarded and a fresh non-proxy one is used.
    download_crawler = download.DownloadCrawler(use_proxy=False)
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))
    title = msg['title']
    newsurl = msg['link']
    brief = msg['brief']
    newspost = msg['post']
    post_time = msg['newsDate']
    category = None
    categoryNames = []
    # NOTE(review): raises AttributeError if the link does not match this
    # pattern — presumably callers only pass vulcanpost.com URLs; confirm.
    key = re.search(r'https://vulcanpost.com/(\d+)/.*', newsurl).group(1)
    type = TYPE
    # Download/store the poster image; returns (id, width, height).
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        newspost, download_crawler, SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    tags = []
    try:
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        logger.info("news-time: %s", news_time)
    except Exception as e:  # fixed: Py2-only "except Exception, e" syntax
        logger.info(e)
        news_time = datetime.datetime.now()
def save_member(r, SOURCE, download_crawler):
    """Upsert one team member into source_member and, when a company key is
    present, link it to the matching source_company via
    source_company_member_rel.

    :param r: tuple of (member_key, name, weibo, introduction, education,
              work, location, role, pictureUrl, company_key, position)
    :param SOURCE: integer crawler-source id
    :param download_crawler: crawler used to fetch the member photo
    """
    member_key, name, weibo, introduction, education, work, location, role, \
        pictureUrl, company_key, position = r
    conn = db.connect_torndb()
    source_member = conn.get(
        "select * from source_member where source=%s and sourceId=%s order by id limit 1",
        SOURCE, member_key)
    logo_id = None
    # Only fetch the photo when we have no stored one yet.
    # Fixed: "== None" -> "is None" (PEP 8 identity comparison).
    if source_member is None or source_member["photo"] is None \
            or source_member["photo"] == "":
        if pictureUrl is not None and pictureUrl != "":
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                pictureUrl, download_crawler, SOURCE, member_key, "member")
    else:
        logo_id = source_member["photo"]
    if source_member is None:
        sql = "insert source_member(name,photo,weibo,location,role,description,\
                education,work,source,sourceId,createTime,modifyTime,processStatus) \
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),0)"
        source_member_id = conn.insert(sql, name, logo_id, weibo, location, role,
                                       introduction, education, work, SOURCE,
                                       member_key)
    else:
        source_member_id = source_member["id"]
        sql = "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\
                education=%s,work=%s,modifyTime=now(),processStatus=0 where id=%s"
        conn.update(sql, name, logo_id, weibo, location, role, introduction,
                    education, work, source_member_id)
    if company_key is not None:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)
        if source_company is not None:
            source_company_id = source_company["id"]
            source_company_member_rel = conn.get(
                "select * from source_company_member_rel where \
                sourceCompanyId=%s and sourceMemberId=%s",
                source_company_id, source_member_id)
            # Only insert the relation once.
            if source_company_member_rel is None:
                # Renamed from "type" to avoid shadowing the builtin.
                rel_type = name_helper.position_check(position)
                logger.info("position %s, type %s", position, rel_type)
                conn.insert(
                    "insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                    position,type,createTime,modifyTime) \
                    values(%s,%s,%s,%s, now(),now())",
                    source_company_id, source_member_id, position, rel_type)
    conn.close()
def save_company_standard(source_company, download_crawler):
    """Upsert a standardized company dict into source_company.

    Existing rows (matched on source + sourceId) are delegated to
    update_source_company(); new rows are inserted with the full column set.

    :param source_company: dict with company fields ("source", "sourceId",
        "logo", "name", ... — see the insert below for the full list)
    :param download_crawler: crawler used to fetch the logo image
    :returns: the source_company row id (new or existing)
    """
    conn = db.connect_torndb()
    s = source_company
    result = conn.get(
        "select * from source_company where source=%s and sourceId=%s order by id limit 1",
        s["source"], str(s["sourceId"]))
    logo_id = None
    # Fetch/store the logo only when the DB has no usable logo yet.
    if result is None or result["logo"] is None or result["logo"] == "":
        log_url = s["logo"]
        if log_url is not None and len(log_url.strip()) > 0:
            # logger.info(log_url)
            # image_value = download_crawler.get_image(log_url)
            # if image_value is not None:
            #     logo_id = imgfs.put(image_value, content_type='jpeg', filename='company_%s_%s.jpg' % (s["source"], s["sourceId"]))
            # Returns (stored-logo id, width, height); only the id is used here.
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                log_url, download_crawler, s["source"], s["sourceId"], "company")
    else:
        logo_id = result["logo"]
    logger.info("gridfs logo_id=%s" % logo_id)
    # The dict is mutated in place so update_source_company sees the logo id.
    s["logo"] = logo_id
    if result is not None:
        source_company_id = result["id"]
        s["id"] = source_company_id
        update_source_company(s)
    else:
        # NOTE: argument order below must match this column order exactly.
        sql = "insert source_company(name,fullName,description,brief,round, \
            productDesc, modelDesc, operationDesc, teamDesc, marketDesc, compititorDesc, advantageDesc, planDesc, \
            roundDesc,companyStatus,fundingType,locationId, address, \
            phone, establishDate, logo,source,sourceId, \
            createTime,modifyTime, \
            field,subField,tags, headCountMin, headCountMax,processStatus) \
            values \
            (%s,%s,%s,%s,%s, \
            %s,%s,%s,%s,%s,%s,%s,%s, \
            %s,%s,%s,%s,%s, \
            %s,%s,%s, %s, %s, \
            now(),now(), \
            %s,%s,%s, %s, %s,0)"
        source_company_id = conn.insert(
            sql, s["name"], s["fullName"], s["description"], s["brief"],
            s["round"],
            s.get("productDesc"), s.get("modelDesc"), s.get("operationDesc"),
            s.get("teamDesc"), s.get("marketDesc"), s.get("compititorDesc"),
            s.get("advantageDesc"), s.get("planDesc"),
            s["roundDesc"], s["companyStatus"], s["fundingType"],
            s["locationId"], s["address"],
            s["phone"], s["establishDate"], s["logo"], s["source"],
            s["sourceId"],
            s["field"], s["subField"], s["tags"], s["headCountMin"],
            s["headCountMax"])
    conn.close()
    return source_company_id
def save_blockchain_standard_feixiaohao(source_feixiaohao, download_crawler):
    """Upsert a digital-token record (feixiaohao source) into digital_token.

    Lookup prefers symbol+name, then symbol+enname, then symbol alone.

    :param source_feixiaohao: token dict ("symbol", "name", "enname",
        "logo", "companyId", "publishDate", "websites", "browsers",
        "description", "whitepaper")
    :param download_crawler: crawler used to fetch the token logo
    :returns: digital_token row id
    """
    s = source_feixiaohao
    logo_url = s["logo"]
    conn = db.connect_torndb()
    source = None
    if s["name"] is not None and s["name"].strip() != "":
        source = conn.get(
            "select * from digital_token where symbol=%s and name=%s",
            s["symbol"], s["name"])
    elif s["enname"] is not None and s["enname"].strip() != "":
        source = conn.get(
            "select * from digital_token where symbol=%s and enname=%s",
            s["symbol"], s["enname"])
    else:
        source = conn.get("select * from digital_token where symbol=%s",
                          s["symbol"])
    logo_id = None
    # Only fetch the logo when no usable one is stored yet.
    # 13511 is used as the source id for blockchain logos here — a magic
    # constant; presumably the feixiaohao crawler's SOURCE. TODO confirm.
    if source is None or source["logo"] is None or source["logo"] == "":
        if logo_url is not None and len(logo_url.strip()) > 0:
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                logo_url, download_crawler, 13511, s["symbol"], "blockchain")
    else:
        logo_id = source["logo"]
    if source is None:
        sql = "insert digital_token(" \
              "companyId,symbol,name,enname,publishDate," \
              "websites,browsers,description,whitepaper,logo,createTime," \
              "modifyTime)" \
              " values" \
              "(%s,%s,%s,%s,%s," \
              "%s,%s,%s,%s,%s, now()," \
              "now())"
        source_d_id = conn.insert(sql, s["companyId"], s["symbol"], s["name"],
                                  s["enname"], s["publishDate"], s["websites"],
                                  s["browsers"], s["description"],
                                  s["whitepaper"], logo_id)
    else:
        source_d_id = source["id"]
        # symbol/companyId are intentionally not updated on existing rows.
        sql = "update digital_token set name=%s,enname=%s,publishDate=%s," \
              "websites=%s,browsers=%s,description=%s,whitepaper=%s,logo=%s,modifyTime=now() where id=%s"
        conn.update(sql, s["name"], s["enname"], s["publishDate"],
                    s["websites"], s["browsers"], s["description"],
                    s["whitepaper"], logo_id, source_d_id)
    conn.close()
    return source_d_id
def save_company(r, SOURCE, download_crawler):
    """Upsert a crawled company dict into source_company.

    :param r: dict with company fields ("sourceId", "logo", "productName",
        "fullName", "description", "brief", "round", "roundDesc",
        "companyStatus", "fundingType", "locationId", "establishDate",
        "field", "subField", "tags", "type")
    :param SOURCE: integer crawler-source id
    :param download_crawler: crawler used to fetch the logo image
    :returns: source_company row id (new or existing)
    """
    company_key = r["sourceId"]
    conn = db.connect_torndb()
    logo_id = None
    source_company = conn.get(
        "select * from source_company where source=%s and sourceId=%s",
        SOURCE, str(company_key))
    # Only fetch the logo when no usable one is stored yet.
    if source_company is None or source_company["logo"] is None \
            or source_company["logo"] == "":
        log_url = r["logo"]
        if log_url is not None and len(log_url.strip()) > 0:
            logger.info(log_url)
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                log_url, download_crawler, SOURCE, company_key, "company")
    else:
        logo_id = source_company["logo"]
    logger.info("gridfs logo_id=%s" % logo_id)
    # Fixed: was "== None", inconsistent with the "is None" checks above.
    if source_company is None:
        source_company_id = conn.insert(
            "insert source_company(name,fullName,description,brief,\
            round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,\
            source,sourceId,createTime,modifyTime,\
            field,subField,tags,type,processStatus) \
            values(%s,%s,%s,%s,\
            %s,%s,%s,%s,%s,%s,%s,\
            %s,%s,now(),now(),\
            %s,%s,%s,%s,0)",
            r["productName"], r["fullName"], r["description"], r["brief"],
            r["round"], r["roundDesc"], r["companyStatus"], r["fundingType"],
            r["locationId"], r["establishDate"], logo_id,
            SOURCE, company_key,
            r["field"], r["subField"], r["tags"], r["type"])
    else:
        source_company_id = source_company["id"]
        # active=null re-flags the row for downstream reprocessing.
        conn.update(
            "update source_company set \
            name=%s,fullName=%s,description=%s, brief=%s, \
            round=%s,roundDesc=%s,companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, \
            field=%s,subField=%s,tags=%s,type=%s, \
            modifyTime=now(),processStatus=0,active=null \
            where id=%s",
            r["productName"], r["fullName"], r["description"], r["brief"],
            r["round"], r["roundDesc"], r["companyStatus"], r["fundingType"],
            r["locationId"], r["establishDate"], logo_id,
            r["field"], r["subField"], r["tags"], r["type"],
            source_company_id)
    conn.close()
    return source_company_id
def save_investor_standard_new(source_investor, download_crawler):
    """Insert or refresh one investor in source_investor (extended schema
    including wechatId/weibo/enName/fullName/enFullName/establishDate).

    :param source_investor: investor dict keyed by column name
    :param download_crawler: crawler used to fetch the investor logo
    :returns: source_investor row id (new or existing)
    """
    inv = source_investor
    source = inv["source"]
    sourceId = inv["sourceId"]
    logo_url = inv["logo"]

    conn = db.connect_torndb()
    existing = conn.get(
        "select * from source_investor where source=%s and sourceId=%s",
        source, sourceId)

    # Fetch the logo only when the stored row lacks a usable one.
    logo_id = None
    logo_missing = (existing is None or existing["logo"] is None
                    or existing["logo"] == "")
    if logo_missing:
        if logo_url is not None and len(logo_url.strip()) > 0:
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                logo_url, download_crawler, source, sourceId, "investor")
    else:
        logo_id = existing["logo"]

    if existing is None:
        sql = "insert source_investor(" \
              "name,website,description,logo,stage," \
              "field,type, source,sourceId,createTime," \
              "modifyTime,processStatus,wechatId,weibo,enName,fullName,enFullName,establishDate)" \
              " values" \
              "(%s,%s,%s,%s,%s," \
              "%s,%s,%s,%s,now()," \
              "now(),0,%s,%s,%s,%s,%s,%s)"
        source_investor_id = conn.insert(
            sql,
            inv["name"], inv["website"], inv["description"], logo_id,
            inv.get("stage"), inv.get("field"), inv.get("type"),
            source, sourceId,
            inv.get("wechatId"), inv.get("weibo"), inv.get("enName"),
            inv.get("fullName"), inv.get("enFullName"),
            inv.get("establishDate"))
    else:
        source_investor_id = existing["id"]
        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s," \
              "field=%s,type=%s,wechatId=%s,weibo=%s,enName=%s,fullName=%s," \
              "enFullName=%s,establishDate=%s, modifyTime=now(),processStatus=0 where id=%s"
        conn.update(
            sql,
            inv["name"], inv["website"], inv["description"], logo_id,
            inv.get("stage"), inv.get("field"), inv.get("type"),
            inv.get("wechatId"), inv.get("weibo"), inv.get("enName"),
            inv.get("fullName"), inv.get("enFullName"),
            inv.get("establishDate"),
            source_investor_id)

    conn.close()
    return source_investor_id
def save_investor_standard(source_investor, download_crawler):
    """Upsert one investor in source_investor (basic schema).

    :param source_investor: investor dict; note the logo URL lives under
        the "logo_url" key here (unlike save_investor_standard_new).
    :param download_crawler: crawler used to fetch the investor logo
    :returns: source_investor row id (new or existing)
    """
    s = source_investor
    source = s["source"]
    sourceId = s["sourceId"]
    logo_url = s["logo_url"]
    conn = db.connect_torndb()
    source_investor = conn.get(
        "select * from source_investor where source=%s and sourceId=%s",
        source, str(sourceId))
    logo_id = None
    # Fixed: "== None" -> "is None" (PEP 8 identity comparison).
    if source_investor is None or source_investor["logo"] is None \
            or source_investor["logo"] == "":
        if logo_url is not None and len(logo_url.strip()) > 0:
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                logo_url, download_crawler, source, sourceId, "investor")
    else:
        logo_id = source_investor["logo"]
    if source_investor is None:
        sql = "insert source_investor(" \
              "name,website,description,logo,stage," \
              "field,type, source,sourceId,createTime," \
              "modifyTime,processStatus)" \
              " values" \
              "(%s,%s,%s,%s,%s," \
              "%s,%s,%s,%s,now()," \
              "now(),0)"
        source_investor_id = conn.insert(sql, s["name"], s["website"],
                                         s["description"], logo_id, s["stage"],
                                         s["field"], s["type"], source,
                                         sourceId)
    else:
        source_investor_id = source_investor["id"]
        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
            field=%s,type=%s,modifyTime=now(),processStatus=0 where id=%s"
        conn.update(sql, s["name"], s["website"], s["description"], logo_id,
                    s["stage"], s["field"], s["type"], source_investor_id)
    conn.close()
    return source_investor_id
def save_investor_member_standard(source_investor_id, members, download_crawler):
    """Replace all member rows for an investor: delete existing
    source_investor_member rows for the id, then re-insert each member.

    :param source_investor_id: source_investor row id the members belong to
    :param members: iterable of member dicts ("logo", "source", "sourceId",
        "name", "position", "description")
    :param download_crawler: crawler used to fetch member photos
    """
    insert_sql = "insert source_investor_member(sourceInvestorId, investorMemberId,source,sourceId,name,logo, position, description, createTime,modifyTime) \
        values(%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"

    conn = db.connect_torndb()
    # Full refresh: drop whatever was stored before for this investor.
    conn.execute(
        "delete from source_investor_member where sourceInvestorId=%s",
        source_investor_id)
    for member in members:
        (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
            member["logo"], download_crawler, member["source"],
            member["sourceId"], "member")
        # investorMemberId is left NULL at insert time.
        conn.insert(insert_sql, source_investor_id, None, member["source"],
                    member["sourceId"], member["name"], logo_id,
                    member["position"], member["description"])
    conn.close()
def process_news(content, download_crawler):
    """Parse one WordPress-API article dict (title/tags/seo/featured_image)
    into news fields; skips articles whose title already exists in Mongo.

    :param content: article dict from a WordPress-style JSON API
    :param download_crawler: ignored — immediately replaced below
    NOTE(review): the visible body computes fields but never builds/saves a
    news document — confirm the rest of this function was not lost when the
    file was chunked.
    """
    download_crawler = download.DownloadCrawler(use_proxy=False)
    category = None
    categoryNames = []
    key = content['id']
    type = TYPE
    title = content['title']
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    # Dedupe on title before doing any expensive work.
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return
    newspost = content.get('featured_image').get('source')
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        newspost, download_crawler, SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    tags = []
    for tag in content['tags']:
        tags.append(tag['name'])
    brief = content['seo']['description']
    try:
        post_time = content['modified_gmt']
        news_time = None
        # Fixed: was "if post_time.find('T'):" — str.find returns -1 (truthy)
        # on no match and 0 (falsy) at position 0, so the truthiness test was
        # wrong. Use a membership test for ISO-8601 "date T time" values.
        if 'T' in post_time:
            post_time = post_time.replace('T', ' ')
        news_time = extract.extracttime(post_time)
        logger.info("news-time: %s", news_time)
    except Exception as e:  # fixed: Py2-only "except Exception, e" syntax
        logger.info(e)
        news_time = datetime.datetime.now()
def save_investfirm(r, SOURCE, download_crawler):
    """Upsert one investment firm into source_investor with fixed type 10020.

    :param r: tuple of (investor_key, investor_name, logo, website,
              stageStr, fieldsStr, desc)
    :param SOURCE: integer crawler-source id
    :param download_crawler: crawler used to fetch the firm logo
    """
    investor_key, investor_name, logo, website, stageStr, fieldsStr, desc = r
    conn = db.connect_torndb()
    source_investor = conn.get(
        "select * from source_investor where source=%s and sourceId=%s",
        SOURCE, str(investor_key))
    logo_id = None
    # Fixed: "== None" -> "is None" (PEP 8 identity comparison).
    if source_investor is None or source_investor["logo"] is None \
            or source_investor["logo"] == "":
        if logo is not None and logo != "":
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                logo, download_crawler, SOURCE, investor_key, "investor")
    else:
        logo_id = source_investor["logo"]
    if source_investor is None:
        # 10020 is the hard-coded investor type for firms here — presumably
        # an enum value meaning "investment firm"; TODO confirm.
        sql = "insert source_investor(name,website,description,logo,stage,field,type, \
            source,sourceId,createTime,modifyTime,processStatus) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),0)"
        source_investor_id = conn.insert(sql, investor_name, website, desc,
                                         logo_id, stageStr, fieldsStr, 10020,
                                         SOURCE, investor_key)
    else:
        source_investor_id = source_investor["id"]
        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
            field=%s,type=%s,modifyTime=now(),processStatus=0 where id=%s"
        conn.update(sql, investor_name, website, desc, logo_id, stageStr,
                    fieldsStr, 10020, source_investor_id)
    conn.close()
        # NOTE(review): orphaned fragment — this is the tail of a
        # process_news() content loop; the enclosing "def", "for c in
        # contents:" and "if c['type'] == 'text':" lines are missing from
        # this chunk of the file. Indentation below is reconstructed.
                # Text paragraph entry.
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # Image entry: keep the raw URL when no crawler is available,
                # otherwise store the downloaded image and reference its id.
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be stored: drop this content item.
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
def process_news(content, url):
    """Parse one PingWest-style article page and save it to Mongo.

    Extracts title/time/tags/category/poster from the HTML, converts the
    content into ranked text/image entries, and persists via
    parser_mongo_util.save_mongo_news. Dedupes on link and on title from
    other sources.

    :param content: raw HTML bytes (UTF-8)
    :param url: article URL
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        # "ptime" attribute is a unix timestamp (seconds).
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)
        # Skip if this link was already crawled.
        if collection_news.find_one({"link": url}) is not None:
            return
        # collection_news.delete_one({"link": url})
        # Skip if another source already has this exact title.
        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return
        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:  # NOTE(review): bare except — non-numeric postid falls back to None
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()
        if column is not None:
            tags = column.split()
        else:
            tags = []
        categoryNames = []
        # Category from column tags: 60103 = people, 60105 = big companies
        # (presumably — values mirror other parsers; confirm against the
        # category table).
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip(
                ) not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())
        postraw = d("link[rel='image_src']").attr("href")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)
        # if collection_news.find_one({"link": url}) is not None:
        #     return
        # collection_news.delete_one({"link": url})
        # if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        #     return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            # hours=16 offset here (other parsers use 8) — presumably a
            # source-specific timezone correction; confirm.
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        # Convert extracted content items into ranked text/image entries.
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"].replace("?imageView2/2/w/750/q/90",""),
                # }
                if download_crawler is None:
                    # Strip the CDN resize suffix, keep the raw URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace(
                            "?imageView2/2/w/750/q/90", ""),
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"].replace("?imageView2/2/w/750/q/90", ""),
                        download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be stored: drop this item.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Fall back to deriving brief/poster from the content entries.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one GB2312-encoded article page, classify it as funding news
    or not, and save it to Mongo.

    :param column: section name (unused in the visible body)
    :param newsurl: article URL; key derived from its last path segment
    :param content: raw HTML bytes (gb2312)
    :param newspost: poster image URL
    :param download_crawler: ignored — immediately replaced below
    """
    # if has_news_content(content):
    if 1:  # content check deliberately disabled — always process
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".htm", "")
        title = d('h1.title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title before doing any expensive work.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        brief = None
        news_time = d('.timer').text()
        # NOTE(review): strptime raises ValueError if '.timer' text is not
        # exactly "%Y-%m-%d %H:%M:%S" — no fallback here.
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')
        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            # "type": type,  # type is set after classification below
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip known boilerplate/banner assets from this source.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be stored: drop this item.
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # Classify the article: 60001 = funding news, 60010 = other
        # (presumably — values taken from the branches below). Note: this
        # assignment makes TYPE local to the function, shadowing the module
        # constant of the same name.
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)
        dnews['type'] = TYPE
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse one GBK-encoded QQ-style article page and save it to Mongo.

    :param column: section name (unused in the visible body)
    :param newsurl: article URL; numeric key from its last path segment
    :param content: raw HTML bytes (gbk)
    :param newspost: poster URL (currently ignored — see commented block)
    :param topic: unused in the visible body
    :param download_crawler: ignored — immediately replaced below
    """
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("gbk")))
        key = newsurl.split("/")[-1].replace(".htm", "")
        type = TYPE
        category = None
        categoryNames = []
        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title before doing any expensive work.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None
        # Poster extraction disabled; derived from contents further below.
        post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        # Fall back to "now" when the timestamp cannot be parsed.
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            # NOTE(review): int(key) raises ValueError on non-numeric keys.
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip known boilerplate/banner assets.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be stored: drop this item.
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(item, url, content):
    """Parse one GBK-encoded article page (key/poster supplied via `item`)
    and save it to Mongo.

    :param item: dict with at least "key" and "post" (poster URL)
    :param url: article URL
    :param content: raw HTML bytes (gbk)
    Uses module-level `download_crawler` / `collection_news` — not params.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("gbk")))
        title = d(
            'div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d(
            'div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin'
        ).text().strip()
        result = util.re_get_result(r'(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            news_time = None
        # FIX: news_time could remain None when the date regex misses,
        # which made "news_time - datetime.timedelta(...)" below raise
        # TypeError. Fall back to the crawl time like sibling parsers do.
        if news_time is None:
            news_time = datetime.datetime.now()
        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if column is not None:
            tags = column.split()
        else:
            tags = []
        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        contents = extract.extractContents(url, article)
        # Skip if this link was already crawled.
        if collection_news.find_one({"link": url}) is not None:
            return
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        # Convert extracted content items into ranked text/image entries.
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be stored: drop this item.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Guard against future-dated articles.
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time",
                        news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a UTF-8 article page and save it to mongo.

    Content items before the line containing both `brief` and `title` are
    skipped, and collection stops at a "-END-" marker. Dedupes by title.
    `column` is unused. Relies on module globals SOURCE / TYPE / logger.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        # Article id is the last path segment, e.g. ".../detail_123.html" -> "123".
        key = newsurl.split("/")[-1].strip().replace(".html", "").replace(
            'detail_', '')
        type = TYPE
        category = None
        title = d('div.left.zb-n> h1').text().strip()
        tags = []
        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d('div.left.zb-n> p.gy').text().strip()
        # No publish date on the page; use crawl time.
        news_time = datetime.datetime.now()
        article = d('div.left.zb-n').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info('already exists %s', title)
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),  # UTC+8 -> UTC
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        # `start` gates collection: skip everything up to (and including) the
        # element that contains both the brief and the title.
        start = False
        for c in contents:
            if start is False and c["data"].find(
                    brief) >= 0 and c["data"].find(title) >= 0:
                start = True
                continue
            if start is False:
                continue
            if c["data"].find("-END-") >= 0:
                break  # footer marker; ignore everything after it
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue  # image mirror failed; drop item
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
    else:
        logger.info('has no news content %s', newsurl)
        return
def process_news(content, news_key, url):
    """Parse an article page, derive category from the page title, save to mongo.

    news_key: numeric article id (string). Relies on module globals
    SOURCE / TYPE / collection_news / logger.
    """
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        # Second-to-last "-"-separated token of <title> is the site section name.
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        # data-time attribute is a unix epoch (seconds).
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon,
                                      post_date.tm_mday, post_date.tm_hour,
                                      post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time,
                    news_time, temp, category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        contents = extract.extractContents(url, article)
        # Dedupe by (source, key_int), then by title across other sources.
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
        # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),  # UTC+8 -> UTC
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue  # image mirror failed; drop item
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Fall back to extracted text when the meta description is unusable.
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, j_content, content, download_crawler):
    """Build a news document from a JSON item (j_content) plus article HTML.

    j_content: dict with 'id', 'title', 'image', 'description', 'uri', 'date'
    (epoch millis). `column` is unused. processStatus is set to 1 and
    key_int to None, unlike the other parsers in this file.
    """
    if has_news_content(content):
        # Note: deliberately shadows the download_crawler parameter.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = j_content['id']
        type = TYPE
        title = j_content['title']
        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']
        try:
            date = j_content['date']
            # 'date' is epoch milliseconds; strip last 3 digits for seconds.
            # The extra day subtracted below is unexplained — presumably a
            # timezone/listing-lag correction; confirm against the feed.
            post_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(
                post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                # Strip social-share boilerplate embedded in the text.
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace(
                        'Share to facebookShare to twitterShare to linkedin', '')
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue  # image mirror failed; drop item
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))
        # Only persist when we actually extracted something.
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        pass
def process_news(column, newsurl, content, newspost):
    """Parse a chinaventure JSON payload (`content` is a parsed dict) and save it.

    content["news"] holds the article fields; the canonical URL is rebuilt
    from the article id. `column` and `newspost` are unused. Dedupes by title.
    FIX: removed a duplicated `dnews["contents"] = dcontents` statement.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]
        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key
        type = TYPE
        category = None
        categoryNames = []
        # Map site channel/tag to our category codes.
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")
        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103
        tags = []
        if content.has_key("keywordList") is True and len(
                content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag[
                        "keyword"] is not None and tag["keyword"].strip(
                        ) != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])
        title = content["news"]["title"].replace("&quot;", "\"")
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info(
                "***************************News existed!!!***********************"
            )
            mongo.close()
            return
        postraw = "http://pic.chinaventure.com.cn/" + content["news"][
            "coverImg"]
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = content["news"]["introduction"]
        post_time = content["news"]["updateAt"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()
        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),  # UTC+8 -> UTC
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Drop hot-linked sohu CDN images that will not render.
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue  # image mirror failed; drop item
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newsposttime, download_crawler):
    """Parse an article page; tags from meta keywords; hard-coded sector 10.

    newsposttime: optional externally supplied publish time string; the page's
    own span.article-time is the fallback. Dedupes by title. `column` unused.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip()
        type = TYPE
        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")
        # post = d('div#post_thumbnail> img').attr("src")
        post = None
        brief = d("meta[name='description']").attr("content")
        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            # Fall back to the on-page date; "today" (or missing) means the
            # article was just published, so use the current time.
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),  # UTC+8 -> UTC
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue  # image mirror failed; drop item
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief
        # Design for sector: this source is always mapped to sector 10.
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process(content, citykey, crawler):
    """Parse an activity list page for one city, fetch each detail page and
    store new (future-dated) activities into collection_news.

    content: list-page HTML bytes. citykey: key into the module-level citymap.
    crawler: page fetcher used for the detail pages (the module-level
    download_crawler is still used for image mirroring).
    Returns the number of activities checked or recorded.

    BUGFIX: the original fetched the poster image before `key` was assigned
    (NameError on the first iteration, stale key afterwards) and called
    .strip() on a possibly-None attr("src"), which also made its
    `if img is not None` guard unreachable. Extraction is now reordered and
    the missing-image case is handled (`poster` defaults to None).
    """
    cnt = 0
    if has_content(content):
        DT = datetime.date.today()
        TODAY = datetime.datetime(DT.year, DT.month, DT.day)
        d = pq(html.fromstring(content.decode("utf-8")))
        lis = d('div.wrap> div> div> ul.ativities> li.item')
        for li in lis:
            c = pq(li)
            title = c('h3.title> a').text()
            link = c('h3.title> a').attr("href")
            if link.find("http") == -1:
                continue
            key = link.split("/")[-1]
            key_int = int(key)
            # Poster image: mirror it when present; "|130w" is a thumbnail
            # size suffix that must be stripped to get the full image.
            img = c('a> img').attr("src")
            poster = None
            if img is not None:
                img = img.strip().replace("|130w", "")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    img, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    poster = str(posturl)
            location = c('div.intro> div.address').text()
            sponors = c('div.intro> div.sponors> span').text().replace(
                ",", " ").replace(",", " ").split()
            # Time spans come in two layouts: 3 spans (same-day event) or
            # 5 spans (event crossing days).
            spans = c('div.intro> div.time> span')
            if len(spans) == 3:
                date = c('div.intro> div.time> span').eq(0).text()
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date + " " + times[0]
                endTime = date + " " + times[1]
            elif len(spans) == 5:
                date = c('div.intro> div.time> span').eq(0).text()
                year = date.split("-")[0]
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date + " " + times[0]
                endTime = year + "-" + times[1] + " " + c(
                    'div.intro> div.time> span').eq(4).text()
            else:
                continue
            try:
                beginDate = datetime.datetime.strptime(beginTime, "%Y-%m-%d %H:%M")
                endDate = datetime.datetime.strptime(endTime, "%Y-%m-%d %H:%M")
            except:
                beginDate = None
            if beginDate is None or beginDate < TODAY or key_int is None:
                # Only keep activities that have not started yet.
                continue
            # Retry the detail page until the crawl succeeds.
            result = crawler.crawl(link)
            while True:
                if result['get'] == 'success':
                    break
                else:
                    result = crawler.crawl(link)
            if has_content(result['content']):
                contents = extract.extractContents(link, result['content'])
                flag, domain = url_helper.get_domain(link)
                dact = {
                    "beginDate": beginDate - datetime.timedelta(hours=8),
                    "endDate": endDate - datetime.timedelta(hours=8),
                    "date": beginDate - datetime.timedelta(hours=8),
                    "title": title,
                    "link": link,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": key_int,
                    "type": TYPE,
                    "original_tags": [],
                    "processStatus": 0,
                    "companyIds": [],
                    "location": location,
                    "city": citymap[citykey],
                    "sponors": sponors,
                    "post": poster,
                    "domain": domain,
                    "categoryNames": []
                }
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        if c["data"].find("我要报名") >= 0:
                            # Sign-up widget marks the end of the description.
                            logger.info("************************over")
                            break
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                            c["data"], download_crawler, SOURCE, key, "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue
                    dcontents.append(dc)
                    rank += 1
                dact["contents"] = dcontents
                value = activity_simhash.get_simhash_value(dcontents)
                dact["simhashValue"] = value
                record = collection_news.find_one(
                    {"source": SOURCE, "key_int": key_int})
                if record is not None:
                    city = record["city"]
                    if record["beginDate"] == dact["beginDate"] and \
                            record["endDate"] == dact["endDate"] and \
                            record["title"] == dact["title"] and \
                            record["city"] == citymap[citykey] and \
                            record["location"] == dact["location"]:
                        logger.info("%s activity already existed", title)
                        cnt += 1
                        continue
                    else:
                        # Stale record: replace it, unless the city changed,
                        # which indicates bad data — then skip re-inserting.
                        collection_news.delete_one(
                            {"source": SOURCE, "key_int": key_int})
                        if city != citymap[citykey]:
                            logger.info(
                                "%s has two city : %s and %s with location %s, something is wrong",
                                title, city, citymap[citykey], location)
                            cnt += 1
                            continue
                    collection_news.insert(dact)
                    logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title,
                                beginDate, endDate, ":".join(sponors),
                                location, link, img)
                else:
                    # New key: still guard against near-duplicates by simhash.
                    if activity_simhash.check_same_act(dact) is True:
                        pass
                    else:
                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key,
                                    title, beginDate, endDate,
                                    ":".join(sponors), location, link, img)
            cnt += 1
        logger.info("************Done***************")
        logger.info("*******%s activities has been checked or recorded", cnt)
    return cnt
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a wyt-style article page, filter boilerplate lines, save to mongo.

    Skips pages that embed video. Dedupes by title. `column` is unused.
    """
    if has_news_content(content):
        logger.info('here')
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        if d.text().find('embed') >= 0:  # skip video articles
            logger.info('not article:%s' % newsurl)
            return
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            # The publish time lives in one of two sibling layouts; try both.
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find(
                'span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find(
                'p').next().text().strip()
            if post_time_1:
                post_time = post_time_1
            else:
                post_time = post_time_2
            if re.match('\d{2}-\d{2}', post_time):  # "03-19" style: no year,
                # so prepend the current year.
                post_time = str(time.localtime()[0]) + '-' + post_time
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('article.wyt-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),  # UTC+8 -> UTC
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        if contents[0]['type'] == 'img':
            del contents[0]  # leading image is a banner, not article content
        for c in contents:
            if c['type'] == 'text':
                # Drop boilerplate lines (counters, WeChat plugs, legal notes).
                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \
                        or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0:
                    continue
                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue  # image mirror failed; drop item
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        pass
        mongo.close()
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a vcbeat-style article page and save it to mongo.

    The has_news_content gate is intentionally disabled (`if 1:`). Dedupes by
    title first, then by link before saving. `column` is unused.
    """
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        # Note: deliberately shadows the download_crawler parameter.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        # type = TYPE
        category = None
        title = d('.article_title p').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return
        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        postraw = newspost
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://vcbeat.com" + post
        brief = None
        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),  # UTC+8 -> UTC
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue  # image mirror failed; drop item
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # Only insert new links; existing links are left untouched (the
        # update-in-place path below is disabled).
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})
            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse an article from a site that serves either utf-8 or gb2312 pages.

    The encoding is sniffed from the raw body ("gb2312" marker); each encoding
    variant uses different selectors for title/date/body. Dedupes by title.
    `column`, `newspost` and `topic` are unused.
    """
    if has_news_content(content):
        logger.info('here')
        # Note: deliberately shadows the download_crawler parameter.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # d = pq(html.fromstring(content.decode("gbk","ignore")))
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".shtml", "")
        type = TYPE
        post = None
        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
        if title is None or title.strip() == "":
            title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        brief = None
        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        # NOTE(review): extracttime may also return None without raising;
        # that would crash the date arithmetic below — confirm upstream.
        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # NOTE(review): every other parser in this file subtracts 8 hours
            # (UTC+8 -> UTC); 20 here is unexplained — confirm intentional.
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip site self-promotion banners.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue  # image mirror failed; drop item
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        pass
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    """Parse one article page and save it to mongo.

    `sort` is the channel name: articles from the 投融资 (funding)
    channel are stored with type 60001 instead of the default TYPE.
    Articles are deduped by title; all stored items carry sector 20.
    """
    if has_news_content(content):
        logger.info("here")
        # "ignore" keeps a single bad byte from aborting the whole parse
        # (every other parser in this file already decodes this way).
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        key = newsurl.split("/")[-1]
        news_type = TYPE  # renamed from `type` to stop shadowing the builtin
        if sort.find("投融资") >= 0:
            news_type = 60001
        category = None
        title = d('div.mod-head> h1').text().strip()
        if title is None or title == "":
            return
        tags = []
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:  # best-effort: the meta tag may be absent
            brief = None
        try:
            post_time = d('span.time> time').text()
            logger.info(post_time)
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M")
        except Exception:  # missing/odd timestamp -> fall back to crawl time
            news_time = datetime.datetime.now()
        article = d('div.mod-body> div.content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), news_type, category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            # Already crawled: dedup by title.
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": news_type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Mirror the image locally; skip it if the download fails.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to the crawl time.
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
        return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse one article page (WordPress-style layout) and save it to mongo.

    Articles whose body contains the DealStreetAsia paywall teaser are
    stored with processStatus -5 so downstream processing can skip them;
    all other articles keep the default processStatus of 1.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-2].replace(".html", "")
        news_type = TYPE  # renamed from `type` to stop shadowing the builtin
        title = d('h1.single-title').text().strip()
        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:  # best-effort: the meta tag may be absent
            brief = None
        # The page's own timestamp is not parsed here (the old selector was
        # retired); the crawl time stands in for the publish time.
        news_time = datetime.datetime.now()
        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": news_type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            # Drop promo / watermark fragments.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["data"].find(
                    "Continue reading this story with a subscription to DealStreetAsia"
            ) >= 0:
                # Paywall teaser detected: flag the whole article.
                processStatus = -5
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Mirror the image locally; skip it if the download fails.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to the crawl time.
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
        return
def process_news(newsurl, content, newspost, download_crawler):
    """Parse a single article page and store it in mongo.

    Skips pages with no recognizable body, empty titles, and titles that
    were already crawled.  The article key comes from the trailing part of
    the <article> element id; the publish date is read from the
    "YYYY年M月D日" text in the page header, falling back to the crawl time.
    """
    if not has_news_content(content):
        return
    logger.info('here.')
    page = pq(html.fromstring(content.decode("utf-8", 'ignore')))
    category = None
    categoryNames = []
    Type = TYPE
    tags = []
    brief = None
    title = page('h1').text().strip()
    if title is None or title == "":
        return
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({'title': title}) is not None:
        # Duplicate title: already stored.
        mongo.close()
        return
    key = page('article').attr('id').strip().split('-')[-1]
    try:
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
    except:
        posturl = None
    post = str(posturl) if posturl is not None else None
    try:
        post_time = page("header> div> span> time").text().strip()
        res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
        # Zero-pad month/day so the date is ISO-shaped before parsing.
        year = res.group(1)
        month = res.group(2).zfill(2)
        day = res.group(3).zfill(2)
        post_time = '{}-{}-{}'.format(year, month, day)
        news_time = extract.extracttime(post_time)
    except Exception as e:
        logger.info(e)
        news_time = datetime.datetime.now()
    if news_time is None:
        news_time = datetime.datetime.now()
    article = page('div.td-post-content').html()
    contents = extract.extractContents(newsurl, article, document=True)
    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": int(key),
        "type": Type,
        "original_tags": tags,
        "processStatus": 0,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "categoryNames": categoryNames,
    }
    dcontents = []
    for piece in contents:
        slot = len(dcontents) + 1  # rank of the next stored fragment
        if piece['type'] == 'text':
            dcontents.append({
                'rank': slot,
                'content': piece['data'],
                'image': '',
                'image_src': '',
            })
        elif download_crawler is None:
            dcontents.append({
                "rank": slot,
                "content": "",
                "image": "",
                "image_src": piece["data"],
            })
        else:
            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                piece["data"], download_crawler, SOURCE, key, "news")
            if imgurl is None:
                continue  # image could not be mirrored; drop the fragment
            dcontents.append({
                "rank": slot,
                "content": "",
                "image": str(imgurl),
                "image_src": "",
                "height": int(height),
                "width": int(width)
            })
    dnews['contents'] = dcontents
    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief
    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
    logger.info(
        json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
    if title is not None and len(contents) > 0:
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    mongo.close()
def process_news(column, newsurl, content, newspost, download_crawler):
    # Parse an article whose body lives in the page's inline JS blob
    # ("content: '...'") rather than in the HTML DOM: the blob is pulled
    # out with a regex, entity-unescaped, and fed to the extractor.
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8","ignore")))
        key = newsurl.split("/")[-1].replace("i","")
        type = TYPE
        category = None
        title = d('head> title').text().strip()
        # NOTE(review): r.strip()[:-1] drops the pattern's final character
        # ("groupId" -> "groupI") -- presumably deliberate for this site's
        # JS; confirm before changing.
        r = "content: '(.*?)',.*groupId"
        result = util.re_get_result(r.strip()[:-1], content)
        (b,) = result
        logger.info(b)
        # exit()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # Normalize separators, then keep non-empty, non-duplicate tags
            # that differ from the title.
            for tag in articletags.replace(",", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        post = None
        brief = None
        news_time = None
        try:
            # The publish time also lives in the inline JS blob.
            r1 = "time: '(.*?)'.*},.*tagInfo"
            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            pass
        if news_time is None:
            # Regex missed or time unparseable: fall back to the crawl time.
            news_time = datetime.datetime.now()
        # exit()
        # article = d('div.post> div.post-content').html()
        # contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            # Already crawled (dedup by title).
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        # Undo the HTML-entity escaping used inside the JS string literal.
        # NOTE(review): these replaces look like no-ops as written; the
        # entity names appear to have been lost in transit -- verify against
        # the original source before touching.
        bb = b.replace('<', "<").replace(">",">").replace(""","\"").replace("=","=")
        logger.info(bb)
        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Mirror the image locally; skip the fragment on failure.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        # (An older manual paragraph-splitting path that split the JS blob
        # on </p><p> by hand was removed here; extractContents replaced it.)
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to the crawl time.
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mid = None
        if title is not None and len(dcontents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE*************%s",mid)
        return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse one article page (8btc-style layout) and save it to mongo.

    Articles whose title mentions 融资 (funding) are stored as funding
    news (type 60001, category 60101).  Boilerplate fragments (donation
    address, copyright notice, site chrome images) are stripped from the
    body.  Articles are deduped by title.
    """
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        news_type = TYPE  # renamed from `type` to stop shadowing the builtin
        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            news_type = 60001
            category = 60101
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            # Already crawled: dedup by title.
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:  # best-effort: the meta tag may be absent
            brief = None
        try:
            post_time = d("span.article-time").eq(0).text().strip()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:  # was Py2-only `except Exception, e`
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            # extract.extracttime may return None.
            news_time = datetime.datetime.now()
        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": news_type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip boilerplate text and site-chrome images.
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue
            if c["data"].find("8btctest1/custom/images") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Mirror the image locally; skip it if the download fails.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to the crawl time.
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
def process_news(content, news_key, url, news_posttime):
    """Parse one TechCrunch article and save it to mongo.

    Dedup is done on (SOURCE, key_int) and, across other sources, on
    title.  The sailthru.date meta value is shifted +15h before the
    common -8h store offset is applied.  The default og:image poster is
    treated as "no poster".
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # "ignore" keeps a single bad byte from aborting the whole parse
        # (the other parsers in this file already decode this way).
        d = pq(html.fromstring(content.decode('utf-8', 'ignore')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        try:
            news_time = datetime.datetime.strptime(
                post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)
        except (TypeError, ValueError) as e:
            # Meta tag missing (attr() returned None) or malformed:
            # previously this crashed the parse; fall back to crawl time.
            logger.info(e)
            news_time = datetime.datetime.now()
        key = news_key
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            if postraw.find("techcrunch.opengraph.default.png") >= 0:
                postraw = None
        except Exception:  # best-effort: og:image may be absent
            postraw = None
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        divtags = d('div.tags> div.tag-item')
        # Keep non-empty tag labels only.  The old guard compared
        # `.strip() is not None`, which is always true since strip()
        # returns a string, so empty tags used to slip through.
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags
                if pq(divtag)('a.tag').text().strip() != ""]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time,
                    news_time, ":".join(tags), category)
        article = d('div.article-entry.text').html()
        contents = extract.extractContents(url, article)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            # Already crawled from this source.
            mongo.close()
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            # Same story already stored from another source.
            mongo.close()
            return
        mongo.close()
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Mirror the image locally; skip it if the download fails.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to the crawl time.
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        logger.info("Done")