def __init__(self):
    # NOTE(review): this bare __init__ duplicates ReNews.__init__ defined
    # later in this file; it looks like a chunking artifact — confirm
    # against the full source before keeping both.
    self.comment = NewsComment()   # per-page comment scraper
    self.mongo = MongoDB()         # MongoDB sink for article records
    # NOTE(review): hard-coded local MySQL credentials; consider externalizing.
    self.conn = MySQLdb.connect('localhost', 'root', '1995', 'newsurl', charset='utf8', use_unicode=True)
    self.cursor = self.conn.cursor()
    self.message = NewsMessage()   # article scraper (provides getCommentNumber)
class genZong(object): def __init__(self): self.comment = NewsComment() self.mongo = MongoDB() self.huan = huanCun() def run(self): client = pymongo.MongoClient('localhost', 27017) db = client['news'] collection_zhuizong = db['zhuiZong'] collection_del = db['zhuiZong'] collecthion = collection_zhuizong.find() for i in collecthion: num = self.getCommentNumber(i['url']) if num == i['num']: collection_del.delete_one({'url': i['url']}) else: all_page = num / 20 for page in xrange(1, all_page + 2): self.comment.run(i['url'], page) dict_zhui = {} dict_zhui['url'] = i['url'] dict_zhui['num'] = num dict_zhui['_id'] = i['url'] '''print json.dumps(dict_zhui, ensure_ascii=False, indent=4)''' self.huan.put_zhuizong(dict_zhui) def getCommentNumber(self, news_url): jison_object = dict() bu = re.split(r'c_|.htm', news_url)[1] comment_url = 'http://comment.home.news.cn/a/newsCommAll.do?newsId=1-' + bu flag = 1 while 1: try: json_object = json.loads( requests.get(comment_url, timeout=30).content.replace( 'var commentJsonVarStr___=', '')[:-1]) break except Exception as e: flag += 1 print e if flag > 5: return # 评论数 ping_lun_shu_liang = json_object['totalRows'] return ping_lun_shu_liang
def __init__(self):
    # NOTE(review): bare __init__ matching genZong.__init__ above; likely a
    # chunking artifact — confirm against the full source.
    self.comment = NewsComment()  # comment-page scraper
    self.mongo = MongoDB()        # MongoDB sink
    self.huan = huanCun()         # local cache helper
class NewsMessage(object):
    """Scrapes Sohu news articles and stores them in MongoDB.

    Handles two page layouts: the old sports.sohu.com layout (gb2312,
    ids like .../n12345.shtml) and the new www.sohu.com layout (utf-8,
    comment id taken from the '#mp-comment' element). For each stored
    article it also triggers per-page comment scraping.
    """

    def __init__(self):
        self.comment = NewsComment()  # per-page comment scraper
        self.mongo = MongoDB()        # MongoDB sink for article records
        # self.update = update_uel()
        # self.conn = MySQLdb.connect('localhost', 'root', '1995', 'newsurl', charset='utf8', use_unicode=True)
        # self.cursor = self.conn.cursor()

    def getNewsMessage(self):
        """Iterate candidate URLs from NewsUrl.Run(), scrape today's
        articles into MongoDB, then scrape their comment pages.

        NOTE(review): self.getCommentNumber is called below but is not
        visible in this chunk — presumably defined later in the class;
        it appears to return (read_count, comment_count) or None.
        """
        count = 0  # number of articles stored (debug counter)
        for news_url in NewsUrl.Run():
            # Probe the URL first; skip unreachable pages.
            req = urllib2.Request(news_url)
            try:
                urllib2.urlopen(req)
            except urllib2.URLError, e:
                if hasattr(e, 'code'):
                    print 'Error code: ', e.code
                elif hasattr(e, 'reason'):
                    print 'Reason: ', e.reason
                continue
            re_ = 'http://sports.sohu.com/\d*?/[n]\d*?.shtml'
            if (re.match(re_, news_url)):
                # ---------- old sports.sohu.com layout ----------
                print news_url
                html = ''
                flag = 1
                # Retry the fetch up to 10 times; give up (abort the whole
                # method) after that.
                while 1:
                    try:
                        html = requests.get(news_url, timeout=30)
                        html.encoding = 'gb2312'
                        break
                    except Exception as e:
                        flag += 1
                        print e
                        if flag > 10:
                            return
                soup = BeautifulSoup(html.text, 'html.parser')
                # Article id is the digit run before '.shtml'.
                re_ = '.*[n](\d*?).shtml'
                _id = re.match(re_, news_url).group(1)
                title = soup.find_all('title')[0].text
                # Skip pages that resolved to Sohu's 404 page.
                if (title == "404,您访问的页面已经不存在!"):
                    continue
                """Fetch the read count and comment count."""
                comment_number = self.getCommentNumber(news_url, _id)
                if comment_number:
                    yue_du_shu = comment_number[0]          # read count
                    ping_lun_shu_liang = comment_number[1]  # comment count
                else:
                    yue_du_shu = 0
                    ping_lun_shu_liang = 0
                # select_sql = """
                # select ping_lun_shu_liang from news where url = %s"""
                # if (self.cursor.execute(select_sql, news_url)):
                #     data = self.cursor.fetchone()
                #     # print data[0]
                #     if (data[0] == ping_lun_shu_liang):
                #         continue
                #     else:
                message_dict = dict()  # article record for MongoDB
                ping_dic = dict()      # comment-count record (currently unused sink)
                # publication time — only keep articles published today
                # shijian1 = tiongoe.strftime('%Y-%m-%d', time.localtime(time.time() - 2 * 24 * 60 * 60))
                shijian = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                fa_bu_shi_jian = soup.find_all(id='pubtime_baidu')[0].text
                if (not re.search(shijian, fa_bu_shi_jian)):
                    continue
                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # print fa_bu_shi_jian
                # article URL
                wen_zhang_wang_zhi = news_url
                message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi
                # article title — strip everything after the first '-'
                wen_zhang_biao = soup.title.string.encode('utf-8')
                # print wen_zhang_biao
                wen_ = '(.*?)\-.*?'
                wen_zhang_biao_ti = re.search(wen_, wen_zhang_biao).group(1)
                # print wen_zhang_biao_ti
                message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
                # comment count
                ping_lun_shu_liang = ping_lun_shu_liang
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
                # article source
                wen_zhang_lai_yuan = soup.find_all(
                    id="media_span")[0].text.encode('utf-8')
                message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
                # article body: all <p> under div#contentText, comma-joined
                li = []
                for i in soup.select("div#contentText"):
                    for wen_zhang_zheng_wen in i.select('p'):
                        li.append(wen_zhang_zheng_wen.text.encode('utf-8'))
                message_dict['wen_zhang_zheng_wen'] = ",".join(li)
                # scrape time (epoch seconds)
                do_time = time.time()
                message_dict['do_time'] = do_time
                # scraped site name
                zhan_dian = u'搜狐网'
                message_dict['zhan_dian'] = zhan_dian
                # image links — old layout stores none
                tu_pian_lian_jie = None
                message_dict['tu_pian_lian_jie'] = tu_pian_lian_jie
                # article section/category
                wen_zhang_lan_mu = u'搜狐体育' + soup.select(
                    "div#mypos")[0].text.encode('utf-8')
                # The replace may raise on the unicode/bytes mix above; fall
                # back to the raw value in that case.
                try:
                    message_dict[
                        'wen_zhang_lan_mu'] = wen_zhang_lan_mu.replace(
                            '>', '->')
                except Exception as e:
                    print e
                    message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
                # article author
                wen_zhang_zuo_zhe = soup.find_all(
                    id="author")[0].text.encode('utf-8')
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe
                # keywords
                guan_jian_ci = None
                message_dict['guan_jian_ci'] = guan_jian_ci
                # related tags
                xiang_guan_biao_qian = None
                message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian
                # read count
                yue_du_shu = yue_du_shu
                message_dict['yue_du_shu'] = yue_du_shu
                # primary key: "<id>|_|<url>"
                message_dict['_id'] = _id + '|_|' + news_url
                count += 1
                # print count
                ping_dic['url'] = news_url
                ping_dic['_id'] = _id
                ping_dic['ping_lun_shu_liang'] = ping_lun_shu_liang
                # self.update.process_item(ping_dic)
                # print json.dumps(message_dict, ensure_ascii=False, indent=4)
                self.mongo.put_content(message_dict)
                flag1 = 0  # NOTE(review): never incremented in this branch
                if ping_lun_shu_liang > 0:
                    # 10 comments per page; scrape every page.
                    all_page = int(math.ceil(ping_lun_shu_liang / 10.0))
                    for page in xrange(1, all_page + 1):
                        try:
                            self.comment.run(news_url, _id, page)
                        except Exception as e:
                            # On failure: retry the page once, then move on.
                            print e
                            self.comment.run(news_url, _id, page)
                            continue
            else:
                # ---------- new www.sohu.com layout ----------
                print news_url
                html1 = ''
                flag = 1
                while 1:
                    try:
                        html1 = requests.get(news_url, timeout=30)
                        html1.encoding = 'utf-8'
                        break
                    except Exception as e:
                        flag += 1
                        print e
                        if flag > 10:
                            return
                tree = etree.HTML(html1.text)  # NOTE(review): unused below
                soup = BeautifulSoup(html1.text, 'html.parser')
                # print soup.text
                re_ = "http://www.sohu.com/a/(\d*?)\_"
                title = soup.find_all('title')[0].text
                if (title == "404,您访问的页面已经不存在!"):
                    continue
                # print soup.select("#mp-comment")
                # Comment id comes from the 'sid' attribute of #mp-comment;
                # sid == 0 means "use mp_<article-id-from-url>" instead.
                if (soup.select("#mp-comment") != []):
                    _id = soup.select("#mp-comment")[0]['sid'].encode("utf-8")
                    # print _id
                    if (int(_id) == 0):
                        _id = 'mp_' + re.search(re_, news_url).group(1)
                else:
                    continue
                """Fetch the read count and comment count."""
                comment_number = self.getCommentNumber(news_url, _id)
                if comment_number:
                    yue_du_shu = comment_number[0]
                    ping_lun_shu_liang = comment_number[1]
                else:
                    yue_du_shu = 0
                    ping_lun_shu_liang = 0
                # select_sql = """ select ping_lun_shu_liang from news where url = %s"""
                # if(self.cursor.execute(select_sql, news_url)):
                #     data = self.cursor.fetchone()
                #     #print data[0]
                #     if (data[0] == ping_lun_shu_liang):
                #         continue
                #     else:
                message_dict = dict()
                ping_dic = dict()
                # publication time — only keep today's articles; the layout
                # exposes it under one of two selectors.
                shijian = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                try:
                    fa_bu_shi_jian = soup.select('span#news-time')[0].text
                except:
                    fa_bu_shi_jian = soup.select('span.time')[0].text
                if (not re.search(shijian, fa_bu_shi_jian)):
                    continue
                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # article URL
                wen_zhang_wang_zhi = news_url
                message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi
                # article title — strip everything after the first '_'
                wen_zhang_biao = soup.title.string.encode('utf-8')
                wen_ = '(.*?)\_.*?'
                try:
                    wen_zhang_biao_ti = re.search(wen_, wen_zhang_biao).group(1)
                    message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
                except:
                    message_dict['wen_zhang_biao_ti'] = "无"
                # comment count
                ping_lun_shu_liang = ping_lun_shu_liang
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
                # article source
                try:
                    wen_zhang_lai_yuan = soup.select(
                        "#user-info h4 a")[0].text.encode('utf-8')
                    message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
                except:
                    message_dict['wen_zhang_lai_yuan'] = "空"
                # article body: all <p> under article.article, comma-joined
                li = []
                for i in soup.select("article.article"):
                    for wen_zhang_zheng_wen in i.select('p'):
                        li.append(wen_zhang_zheng_wen.text.encode('utf-8'))
                message_dict['wen_zhang_zheng_wen'] = ','.join(li)
                # scrape time
                do_time = time.time()
                message_dict['do_time'] = do_time
                # scraped site name
                zhan_dian = u'搜狐网'
                message_dict['zhan_dian'] = zhan_dian
                # image links: absolute-ize protocol-relative src values
                if (not soup.select('.article img')):
                    tu_pian_lian_jie = None
                    message_dict['tu_pian_lian_jie'] = tu_pian_lian_jie
                else:
                    tu_pian = soup.select('.article img')
                    tu = []
                    for tu_pian_lian_jie in tu_pian:
                        if (not re.search('http', tu_pian_lian_jie['src'])):
                            tu.append("http:" + tu_pian_lian_jie['src'])
                        else:
                            tu.append(tu_pian_lian_jie['src'])
                    message_dict['tu_pian_lian_jie'] = " ".join(tu)
                # article section/category
                try:
                    wen_zhang_lan_mu = soup.select(
                        ".location.area")[0].text.encode('utf-8')
                except:
                    wen_zhang_lan_mu = ""
                try:
                    message_dict[
                        'wen_zhang_lan_mu'] = wen_zhang_lan_mu.replace(
                            '>', '->')
                except Exception as e:
                    print e
                    message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
                # article author
                wen_zhang_zuo_zhe = None
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe
                # keywords
                guan_jian_ci = None
                message_dict['guan_jian_ci'] = guan_jian_ci
                # related tags
                xiang_guan_biao_qian = None
                message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian
                # read count
                yue_du_shu = yue_du_shu
                message_dict['yue_du_shu'] = yue_du_shu
                # primary key: "<id>|_|<url>"
                message_dict['_id'] = _id + '|_|' + news_url
                count += 1
                # print count
                ping_dic['url'] = news_url
                ping_dic['_id'] = _id
                ping_dic['ping_lun_shu_liang'] = ping_lun_shu_liang
                # self.update.process_item(ping_dic)
                # print json.dumps(message_dict, ensure_ascii=False, indent=4)
                self.mongo.put_content(message_dict)
                flag2 = 0  # NOTE(review): never incremented in this branch
                if ping_lun_shu_liang > 0:
                    # 10 comments per page; scrape every page.
                    all_page = int(math.ceil(ping_lun_shu_liang / 10.0))
                    for page in xrange(1, all_page + 1):
                        try:
                            self.comment.run(news_url, _id, page)
                        except Exception as e:
                            # On failure: retry the page once, then move on.
                            print e
                            self.comment.run(news_url, _id, page)
                            continue
class ReNews(object): def __init__(self): self.comment = NewsComment() self.mongo = MongoDB() self.conn = MySQLdb.connect('localhost', 'root', '1995', 'newsurl', charset='utf8', use_unicode=True) self.cursor = self.conn.cursor() self.message = NewsMessage() def process(self): insert_sql = """ select * from news1""" delete_sql = """delete from news1 where url = %s""" update_sql = """update news1 set ping_lun_shu_liang = %s where url = %s""" self.cursor.execute(insert_sql) data = self.cursor.fetchall() for ds in data: comment_number = self.message.getCommentNumber(ds[0], ds[1]) ping_lun_shu_liang = comment_number[1] if (ping_lun_shu_liang == ds[2]): self.cursor.execute(delete_sql, ds[0]) self.conn.commit() else: self.cursor.execute(update_sql, (ping_lun_shu_liang, ds[0])) self.conn.commit() url = ds[0] req = urllib2.Request(url) try: urllib2.urlopen(req) except urllib2.URLError, e: if hasattr(e, "reason"): print e.reason continue print url re_ = 'http://sports.sohu.com/\d*?/[n]\d*?.shtml' if (re.match(re_, url)): html = '' flag = 1 while 1: try: html = requests.get(url, timeout=30) html.encoding = 'gb2312' break except Exception as e: flag += 1 print e if flag > 10: return soup = BeautifulSoup(html.text, 'html.parser') re_ = '.*[n](\d*?).shtml' _id = re.match(re_, url).group(1) # print _id """这一段代码是用来获取阅读数和评论数的""" comment_number = self.getCommentNumber(url, _id) if comment_number: yue_du_shu = comment_number[0] ping_lun_shu_liang = comment_number[1] else: yue_du_shu = 0 ping_lun_shu_liang = 0 message_dict = dict() ping_dic = dict() # 抓取时间 do_time = time.time() message_dict['do_time'] = do_time #self.update.process_item(ping_dic) print json.dumps(message_dict, ensure_ascii=False, indent=4) # self.mongo.put_content(message_dict) flag1 = 0 if ping_lun_shu_liang > 0: all_page = int(math.ceil(ping_lun_shu_liang / 10.0)) for page in xrange(1, all_page + 1): try: self.comment.run(url, _id, page) except Exception as e: print e flag1 += 1 if (flag1 > 2): return False else: 
return self.comment.run(url, _id, page) else: html1 = '' flag = 1 while 1: try: html1 = requests.get(url, timeout=30) html1.encoding = 'utf-8' break except Exception as e: flag += 1 print e if flag > 10: return tree = etree.HTML(html1.text) soup = BeautifulSoup(html1.text, 'html.parser') re_ = "http://www.sohu.com/a/(\d*?)\_" _id = soup.select("#mp-comment")[0]['sid'].encode("utf-8") if (int(_id) == 0): _id = 'mp_' + re.search(re_, url).group(1) print _id """这一段代码是用来获取阅读数和评论数的""" comment_number = self.getCommentNumber(url, _id) if comment_number: yue_du_shu = comment_number[0] ping_lun_shu_liang = comment_number[1] else: yue_du_shu = 0 ping_lun_shu_liang = 0 message_dict = dict() ping_dic = dict() # 抓取时间 do_time = time.time() message_dict['do_time'] = do_time print json.dumps(message_dict, ensure_ascii=False, indent=4) # self.mongo.put_content(message_dict) flag2 = 0 if ping_lun_shu_liang > 0: all_page = int(math.ceil(ping_lun_shu_liang / 10.0)) for page in xrange(1, all_page + 1): try: self.comment.run(url, _id, page) except Exception as e: print e flag2 += 1 if (flag2 > 2): continue else: return self.comment.run(url, _id, page)
def __init__(self):
    # NOTE(review): bare __init__ matching the Xinhua NewsMessage.__init__
    # below; likely a chunking artifact — confirm against the full source.
    self.comment = NewsComment()  # comment-page scraper
    self.mongo = MongoDB()        # MongoDB sink
    self.huan = huanCun()         # local cache helper
    '''self.genzong = genZong()'''
class NewsMessage(object):
    """Scrapes Xinhua (xinhuanet.com) news articles into MongoDB.

    Field extraction relies on long chains of alternative XPath selectors
    because Xinhua pages come in several layouts; failed articles (no
    publication time found) are appended to ERROR.text instead of stored.
    """

    def __init__(self):
        self.comment = NewsComment()  # comment-page scraper
        self.mongo = MongoDB()        # MongoDB sink for article records
        self.huan = huanCun()         # local cache helper
        '''self.genzong = genZong()'''

    def getNewsMessage(self):
        """Scrape every URL yielded by NewsUrl.Run(): build the article
        record, store it, then scrape its comment pages."""
        '''self.genzong.run()'''
        for news_url in NewsUrl.Run():
            # Spoof a browser UA; Host derived from the URL itself.
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0",
                "Host": re.split(r"/", news_url)[2],
            }
            html = ''
            flag = 1
            # Retry up to 10 times; abort the whole method after that.
            while 1:
                try:
                    html = requests.get(news_url, headers=headers,
                                        timeout=30).content
                    break
                except Exception as e:
                    flag += 1
                    print "RREQUESTERROR", e
                    print "URL:" + news_url
                    if flag > 10:
                        return
            tree = etree.HTML(html)
            '''Fetch the read count and comment count.'''
            comment_number = self.getCommentNumber(news_url)
            if comment_number:
                yue_du_shu = comment_number[0]          # read count (always None here)
                ping_lun_shu_liang = comment_number[1]  # comment count
            else:
                yue_du_shu = 0
                ping_lun_shu_liang = 0
            message_dict = dict()
            # article URL
            wen_zhang_wang_zhi = news_url
            message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi
            # article title — try the selectors of every known layout
            wen_zhang_biao_ti = pathOneNode(
                tree,
                ".//*[@class='h-title']/text()|.//*[@class='btt']/h1/text()|.//*[@class='tit']/h1/text()|.//*[@class='sm01']/text()|.//*[@id='title']/text()"
            )
            message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
            # publication time — same multi-layout selector chain
            fa_bu_shi_jian = pathOneNode(
                tree,
                ".//*[@class='h-time']/text()|.//*[@class='time']/text()|.//*[@class='gnxx']/div[2]/text()|.//*[@class='tm']/text()|.//*[@class='sm02']/text()|.//*[@id='pubtime']/text()"
            )
            message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
            # comment count
            ping_lun_shu_liang = ping_lun_shu_liang
            message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
            # article source — strip the '来源:' prefix and whitespace
            try:
                wen_zhang_lai_yuan = tree.xpath(
                    ".//*[@class='ly']/a/text()|.//*[@class='gnxx']/div[1]/text()|.//*[@class='sus']/a/text()|.//*[@class='sm02']/a/text()|.//*[@id='source']/text()"
                )[-1].replace(u'来源:', '').replace('\r\n', '').replace(' ', '')
            except:
                wen_zhang_lai_yuan = pathAllNode(tree, ".//*[@id='source']//text()")
            message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
            # article body — paragraph texts concatenated, whitespace stripped
            try:
                wen_zhang_zheng_wen = tree.xpath(
                    ".//*[@id='p-detail']//p/text()|.//*[@class='content']//p/text()|.//*[@id='content']//p/text()|.//*[@id='content']//p/text()"
                )
            except:
                wen_zhang_zheng_wen = pathAllNode(tree, ".//*[@id='xhw']")
            zheng_wen = ''
            for i in wen_zhang_zheng_wen:
                zheng_wen = zheng_wen + i.replace(u' ', '').replace(
                    '\r\n', '').replace(' ', '')
            message_dict['wen_zhang_zheng_wen'] = zheng_wen
            # scrape time
            do_time = time.time()
            message_dict['do_time'] = do_time
            # scraped site name
            zhan_dian = u'新华网'
            message_dict['zhan_dian'] = zhan_dian
            # image links: page images are relative, so rebuild the absolute
            # URL from the article URL's date-path prefix
            photo_URL_qian = re.findall(
                r'http://[a-z|A-Z]+.xinhuanet.com/[a-z|A-Z]+/\d+-\d+/\d+/|http://ent.news.cn/\d+-\d+/\d+/|http://www.sc.xinhuanet.com/[a-z|A-Z]+/\d+-\d+/\d+/',
                news_url)[0]
            tu_pian_lian = ''
            try:
                tu_pian_lian_jie = tree.xpath(
                    ".//*[@align='center']/img/@src|.//*[@align='center']/span/img/@src"
                )
                if tu_pian_lian_jie:
                    for i in tu_pian_lian_jie:
                        photo_URL = photo_URL_qian + i
                        tu_pian_lian = tu_pian_lian + ' ' + photo_URL
                else:
                    pass
            except:
                print "photo Error:" + news_url
            message_dict['tu_pian_lian_jie'] = tu_pian_lian
            # article section/category, derived from the URL path segment
            if (re.split('/', news_url)[3] == 'politics'
                    or re.split('/', news_url)[3] == 'politics'):
                wen_zhang_lan_mu = re.split('/', news_url)[3]
            elif (re.split('/', news_url)[3] == 'c'):
                wen_zhang_lan_mu = 'sport'
            elif (re.split('/', news_url)[3] == 'content'):
                wen_zhang_lan_mu = 'bendi'
            else:
                wen_zhang_lan_mu = 'yule'
            message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
            # article author/editor — strip boilerplate like 【纠错】/责任编辑
            try:
                try:
                    con = tree.xpath(".//*[@class='tiyi1']/../text()")[-1]
                    wen_zhang_zuo_zhe = ''
                    for i in con:
                        wen_zhang_zuo_zhe += i
                except:
                    wen_zhang_zuo_zhe = pathAllNode(
                        tree,
                        ".//*[@class='p-jc']|.//*[@class='bjn']|.//*[@class='bj']|.//*[@class='editor']"
                    )
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe.replace(
                    u'【纠错】', '').replace(u'责任编辑', '').replace(u'体育—', '').replace(
                        '\r\n', '').replace(u':', '').replace(':', '').replace(
                            '[', '').replace(']', '')
            except:
                message_dict['wen_zhang_zuo_zhe'] = None
            # keywords from the <meta name="keywords"> tag
            try:
                guan_jian_ci = tree.xpath(
                    './/*[@name="keywords"]/@content')[0].replace('\r\n', '')
            except:
                guan_jian_ci = None
            message_dict['guan_jian_ci'] = guan_jian_ci
            # related tags
            xiang_guan_biao_qian = None
            message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian
            # read count
            yue_du_shu = yue_du_shu
            message_dict['yue_du_shu'] = yue_du_shu
            # primary key
            message_dict['_id'] = news_url
            if (message_dict['fa_bu_shi_jian']) == None:
                # No publication time found: log the URL for inspection
                # instead of storing a broken record.
                try:
                    with open("ERROR.text", "a") as file:
                        file.write(news_url + "\n")
                finally:
                    pass
            else:
                #print json.dumps(message_dict, ensure_ascii=False, indent=4)
                self.mongo.put_content(message_dict)
                if ping_lun_shu_liang > 0:
                    # 20 comments per page; +2 covers the trailing partial page.
                    all_page = ping_lun_shu_liang / 20
                    for page in xrange(1, all_page + 2):
                        self.comment.run(news_url, page)
                '''#tracking
                dict_zhui = {}
                dict_zhui['url'] = news_url
                dict_zhui['num'] = ping_lun_shu_liang
                dict_zhui['_id'] = news_url
                self.huan.put_zhuizong(dict_zhui)'''

    def getCommentNumber(self, news_url):
        """Return (read_count, comment_count) for a Xinhua article URL;
        read_count is always None (not exposed by the endpoint).
        Returns None when the endpoint fails more than 5 times.
        """
        jison_object = dict()  # NOTE(review): unused (typo of json_object?)
        # Article id sits between 'c_' and '.htm' in the URL.
        bu = re.split(r'c_|.htm', news_url)[1]
        comment_url = 'http://comment.home.news.cn/a/newsCommAll.do?newsId=1-' + bu
        flag = 1
        while 1:
            try:
                # Response is JSONP-like: strip the JS prefix and trailing ';'.
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                flag += 1
                print e
                if flag > 5:
                    return
        # read count
        yue_du_shu = None
        # comment count
        ping_lun_shu_liang = json_object['totalRows']
        return yue_du_shu, ping_lun_shu_liang
def __init__(self):
    # NOTE(review): bare __init__ matching the ifeng NewsMessage.__init__
    # below; likely a chunking artifact — confirm against the full source.
    self.comment = NewsComment()       # comment-page scraper
    self.mongo = MongoDB()             # MongoDB sink
    self.f = open('test.txt', "r+")    # persistent URL-tracking file
    self.i = 0                         # processed-article counter
    self.url_list =[]                  # records scraped in this run
class NewsMessage(object): def __init__(self): self.comment = NewsComment() self.mongo = MongoDB() self.f = open('test.txt', "r+") self.i = 0 self.url_list =[] def getNewsTotleUrl(self): for news_url in NewsUrl.Run(): self.getNewsMessage(news_url) if self.f.read(): with open('test.txt', 'r+') as f: a = json.load(f) url_list_before = a['url'] for url_before in url_list_before: url_json = json.loads(url_before)['wen_zhang_wang_zhi'] pin_lun_shu = self.getPinglun(url_json) if pin_lun_shu != json.loads(url_before)['ping_lun_shu_liang']: self.getNewsMessage(url_json) else: url_list_before.remove(url_before) self.file_close(url_list_before) def getNewsMessage(self, news_url): self.i += 1 print self.i print news_url html = '' flag = 1 while 1: try: html = requests.get(news_url, timeout=30).content break except Exception as e: flag += 1 print e if flag > 10: return tree = etree.HTML(html) """这一段代码是用来获取阅读数和评论数的""" comment_number = self.getCommentNumber(news_url) if comment_number: yue_du_shu = comment_number[0] ping_lun_shu_liang = comment_number[1] else: yue_du_shu = 0 ping_lun_shu_liang = 0 message_dict = dict() message_url = dict() # 文章网址 wen_zhang_wang_zhi = news_url message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi message_url['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi # 文章标题 wen_zhang_biao_ti = pathOneNode(tree, '//title/text()') if wen_zhang_biao_ti != None: wen_zhang_biao_ti = wen_zhang_biao_ti.replace('_', '').replace(u"娱乐频道", "").replace(u"凤凰网","").replace(u'凤凰体育','').replace(u'凤凰财经','') message_dict['wen_zhang_biao_ti'] =wen_zhang_biao_ti # 发布时间 fa_bu_shi_jian = pathOneNode(tree, '//span[@itemprop="datePublished"]/text()') if fa_bu_shi_jian == None: fa_bu_shi_jian = pathOneNode(tree, '//*[@id="titL"]/p/span/text()') message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian # 评论数量 ping_lun_shu_liang = ping_lun_shu_liang message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang message_url['ping_lun_shu_liang'] = ping_lun_shu_liang # 文章来源 wen_zhang_lai_yuan = 
pathOneNode(tree, '//span[@itemprop="publisher"]/span/a/text()') if wen_zhang_lai_yuan == None: wen_zhang_lai_yuan = pathOneNode(tree, '//*[@id="artical_sth"]/p/span[3]/span/text()') if wen_zhang_lai_yuan == None: wen_zhang_lai_yuan = u'凤凰网' message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan # 文章正文 wen_zhang_zheng_wen = pathAllNode(tree, '//div[@id="main_content"]') if wen_zhang_zheng_wen == None: try: re_ = "G_listdata=..\n{1,}.*({title:\'[\S\s]+?])" re__ = "{title:\'([\S\s]+?)\'," text_first = re.findall(re_, html) text_conten = re.findall(re__, text_first[0]) wen_zhang_zheng_wen = "".join(text_conten) except Exception as e: try: wen_zhang_zheng_wen = pathAllNode(tree, '//*[@id="slidedesc2"]') except Exception as e: wen_zhang_zheng_wen = None message_dict['wen_zhang_zheng_wen'] = wen_zhang_zheng_wen # 抓取时间 do_time = time.time() message_dict['do_time'] = do_time # 抓取网站 zhan_dian = u'凤凰网' message_dict['zhan_dian'] = zhan_dian # 图片链接 tu_pian_lian_jie = pathGetImg(tree, '//*[@id="main_content"]//img[@alt]/@src') if tu_pian_lian_jie: message_dict['tu_pian_lian_jie'] = " ".join(tu_pian_lian_jie) else: message_dict['tu_pian_lian_jie'] = None # 文章栏目 wen_zhang_lan_mu = pathAllNode(tree, '//div[@class="theCurrent cDGray js_crumb"]') if wen_zhang_lan_mu == None: wen_zhang_lan_mu = pathAllNode(tree, '//div[@class="speNav js_crumb"]') if wen_zhang_lan_mu == None: wen_zhang_lan_mu = pathAllNode(tree, '//div[@class="cmtNav js_crumb"]') try: message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu.replace('>', '->') except Exception as e: message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu # 文章作者 wen_zhang_zuo_zhe = None message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe # 关键词 guan_jian_ci = None message_dict['guan_jian_ci'] = guan_jian_ci # 相关标签 xiang_guan_biao_qian = None message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian # 阅读数量 yue_du_shu = yue_du_shu message_dict['yue_du_shu'] = yue_du_shu # 主键 message_dict['_id'] = news_url # #时间 # d1 = 
datetime.datetime.now().date() # message_url['time'] = d1 # print json.dumps(message_dict, ensure_ascii=False, indent=4) if wen_zhang_zheng_wen != None and wen_zhang_biao_ti != None: self.mongo.put_content(message_dict) self.url_list.append(json.dumps(message_url, sort_keys=True, indent=4)) print message_dict if ping_lun_shu_liang > 0: all_page = ping_lun_shu_liang / 20 for page in xrange(1, all_page + 1): self.comment.run(news_url, page) def getPinglun(self, news_url): """这一段代码是用来获取和评论数的""" comment_number = self.getCommentNumber(news_url) if comment_number: ping_lun_shu_liang = comment_number[1] else: ping_lun_shu_liang = 0 return ping_lun_shu_liang def file_close(self, url_list_before): url_dirc=dict() end_url= self.url_list+url_list_before url_dirc['url'] = end_url self.f.truncate(0) self.f.seek(0, 0) self.f.write(json.dumps(url_dirc)) self.f.close() def getCommentNumber(self, news_url): json_object = dict() comment_url = 'http://comment.ifeng.com/get.php?doc_url=%s&format=js&job=1' % news_url flag = 1 while 1: try: json_object = json.loads(requests.get(comment_url, timeout=30).content.replace('var commentJsonVarStr___=', '')[:-1]) break except Exception as e: flag += 1 print e if flag > 5: return # 阅读数 yue_du_shu = json_object['join_count'] # 评论数 ping_lun_shu_liang = json_object['count'] return yue_du_shu, ping_lun_shu_liang