def __init__(self):
    """Wire up collaborators for a NetEase (网易新闻) article consumer:
    logger, browser, proxy, the news/comment queues and the article store.

    NOTE(review): this is an orphaned __init__ — the enclosing class header
    is not visible in this chunk.
    """
    # Instantiate factory objects.
    self.field_factory = FieldFactory(u'网易新闻')
    self.queue_factory = QueueFactory()
    self.browser_factory = BrowserFactory()
    self.proxy_factory = ProxyFactory()
    # NOTE(review): db_factory is a QueueFactory — presumably create()
    # dispatches on db_type; confirm against the factory implementation.
    self.db_factory = QueueFactory()
    # Instantiate concrete objects.
    self.log = Logging('./Log/log_zhengwen').get_logging()
    self.browser = self.browser_factory.create(config.browser_type)
    self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
    self.queue_news = self.queue_factory.create(config.queue_type, private_config.queue_news, config.queue_host, config.queue_port)
    self.queue_pinglun = self.queue_factory.create( config.queue_type, private_config.queue_pinglun, config.queue_host, config.queue_port)
    self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen, config.db_host, config.db_port)
class Producer(object):
    """Producer for zhiweidata.com: walks the category listing pages for a
    fixed set of keywords and pushes each event's detail-page URL
    ('http://ef.zhiweidata.com/Baike?id=...') onto the shared queue."""

    def __init__(self):
        # Instantiate factory objects.
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Category keywords: society, internet, gov affairs, international,
        # sports, finance, rumor, enterprise.
        self.key_words = [u'社会', u'互联网', u'政务', u'国际', u'体育', u'财经', u'谣言', u'企业']
        # Instantiate concrete objects.
        self.log = Logging('./Log/zhi_wei_producer').get_logging()
        self.queue = self.queue_factory.create(config.queue_type, private_config.queue_table, config.queue_host, config.queue_port)
        self.browser_2 = self.browser_factory.create(config.browser_type)

    def main(self, totalPage):
        """Crawl pages 1..totalPage-1 of every keyword category and enqueue
        the detail links found on them."""
        cookie = 'nocookie'
        for words in self.key_words:
            for page in range(1, totalPage):
                url = ('http://ef.zhiweidata.com/CatItem?first='
                       + urllib.quote(words.encode('utf-8'))
                       + '&second=%E5%85%A8%E9%83%A8&page=' + str(page) + '&word=null')
                print(url)
                # BUGFIX: Get_page previously let json.loads raise outside
                # this loop's try, so one bad response aborted the whole
                # crawl; Get_page now returns None on failure.
                html = self.Get_page(url, cookie_j=cookie)
                if html:
                    try:
                        self.Parse_data(html)
                    except Exception as e:
                        self.log.info('parse data wrong!%s' % e)
        self.log.info('put the url into queue succesfully! ')

    def Parse_data(self, data):
        """Enqueue the detail-page URL of every result in one listing payload."""
        resultList = data.get('resultList')
        if not resultList:
            # Missing or empty listing: nothing to enqueue.
            return
        for result in resultList:
            id_s = result.get('id')
            page_href = 'http://ef.zhiweidata.com/Baike?id=' + id_s
            self.queue.put(page_href)

    def Get_page(self, url, cookie_j):
        """Fetch *url* and decode its JSON payload.

        Returns the decoded object, or None when the fetch fails or the body
        is not valid JSON (robustness fix: json.loads was unguarded).
        """
        headers = self.Get_header(refer=url, cookie_j=cookie_j)
        try:
            html = self.browser_2.visit(url=url, headers=headers, timeout=60, retry=5)
            return json.loads(html)
        except Exception as e:
            self.log.info('get page wrong!%s' % e)
            return None

    def Get_header(self, refer, cookie_j):
        """Return the fixed request headers for ef.zhiweidata.com.

        NOTE(review): *refer* and *cookie_j* are currently unused; the
        signature is kept for caller compatibility.
        """
        headers = {
            'Host': 'ef.zhiweidata.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
            'Accept': '*/*',
            'Accept-Language': 'zh-Hans;q=1',
            'Accept-Encoding': "gzip, deflate",
            'Connection': "keep-alive"}
        return headers
def __init__(self):
    """Wire up collaborators for the column-link (栏目) producer: logger,
    browser and proxy.

    NOTE(review): orphaned __init__ — the enclosing class header is not
    visible in this chunk; self.queue holds the factory itself here.
    """
    # Instantiate factory objects.
    self.queue = QueueFactory()
    self.browser_factory = BrowserFactory()
    self.proxy_factory = ProxyFactory()
    # Instantiate concrete objects.
    self.log = Logging('./Log/lanmuProducer').get_logging()
    self.browser = self.browser_factory.create(config.browser_type)
    self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
def __init__(self):
    """Wire up collaborators for the zhiweidata producer: logger, the shared
    URL queue and a browser.

    NOTE(review): orphaned __init__ — duplicate of Producer.__init__ above;
    the enclosing class header is not visible in this chunk.
    """
    # Instantiate factory objects.
    self.queue_factory = QueueFactory()
    self.browser_factory = BrowserFactory()
    self.proxy_factory = ProxyFactory()
    # Category keywords: society, internet, gov affairs, international,
    # sports, finance, rumor, enterprise.
    self.key_words=[u'社会',u'互联网',u'政务',u'国际',u'体育',u'财经',u'谣言',u'企业']
    # Instantiate concrete objects.
    self.log = Logging('./Log/zhi_wei_producer').get_logging()
    self.queue = self.queue_factory.create(config.queue_type, private_config.queue_table, config.queue_host, config.queue_port)
    self.browser_2 = self.browser_factory.create(config.browser_type)
def __init__(self):
    """Wire up collaborators for the Sina Sichuan (新浪四川) comment
    consumer: logger, browser, the comment-link queue and the comment store.

    NOTE(review): orphaned __init__ — the enclosing class header is not
    visible in this chunk.
    """
    # Instantiate factory objects.
    self.queue_redis= QueueFactory()
    self.field_factory = FieldFactory(u'新浪四川')
    self.browser_factory = BrowserFactory()
    # NOTE(review): db_factory is a QueueFactory — presumably create()
    # dispatches on db_type; confirm against the factory implementation.
    self.db_factory = QueueFactory()
    # Instantiate concrete objects.
    self.log = Logging('./Log/log_pl').get_logging()
    self.browser = self.browser_factory.create(config.browser_type)
    self.queue_comment = self.queue_redis.create(config.queue_type, private_config.queue_comment, config.queue_host, config.queue_port)
    self.db = self.db_factory.create(config.db_type, config.db_table_news_pinglun, config.db_host, config.db_port)
class Lanmu_Producer():
    """Collects sub-column (栏目) links from m.people.cn section index pages
    and writes them to lanmu_hrefs.txt, one absolute URL per line."""

    def __init__(self):
        # Instantiate factory objects.
        self.queue = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/lanmuProducer').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)

    def main(self):
        """Visit each hard-coded mobile section index ([politics,
        international, tech, military, society, opinion, finance, education])
        and write every sub-column link it contains to lanmu_hrefs.txt."""
        lan_mu_list = ['22', '23', '28', '30', '24', '25', '26', '645']
        http_h = 'http://m.people.cn'
        # BUGFIX: use a context manager — 'file' used to shadow the builtin
        # and the handle leaked if anything raised outside the inner try.
        with open('lanmu_hrefs.txt', 'w') as out:
            for lanmu in lan_mu_list:
                href = 'http://m.people.cn/%s/index.html' % (lanmu)
                try:
                    html = self.browser.visit(href)
                    tree = etree.HTML(html)
                    xpath_next = tree.xpath('.//ul[@class="clear"]/li')
                    if len(xpath_next) > 0:
                        for li in xpath_next:
                            lan_mu_link = self.textxpath(li, './/@href')
                            # BUGFIX: an <li> without an href made textxpath
                            # return None and the 'in' test raised, dropping
                            # the rest of this page's links; skip it instead.
                            if lan_mu_link is None:
                                continue
                            if 'http://m.people.cn' not in lan_mu_link:
                                lan_mu_link = http_h + lan_mu_link
                            out.write(lan_mu_link + '\n')
                    else:
                        self.log.info(u'栏目链接解析失败!')
                except Exception as e:
                    self.log.info('%s page wrong!%s' % (lanmu, e))
        self.log.info(u'栏目链接灌入完毕!')

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        # FIX: avoid map(...)[pos] under a bare except (map objects are not
        # subscriptable on Python 3, which silently yielded None).
        stripped = [t.strip() for t in texts if t.strip()]
        try:
            return stripped[pos]
        except IndexError:
            return None
def __init__(self):
    """Wire up collaborators for the zhiweidata event-detail consumer:
    logger, the URL queue, the event store, a browser and a proxy.

    NOTE(review): orphaned __init__ — the enclosing class header is not
    visible in this chunk.
    """
    # Instantiate factory objects.
    self.field_factory = FieldFactory(u'zhi_wei_shi_jian')
    self.queue_factory = QueueFactory()
    self.browser_factory = BrowserFactory()
    self.proxy_factory = ProxyFactory()
    # Instantiate concrete objects.
    self.log = Logging('./Log/zhiwei_message').get_logging()
    self.queue = self.queue_factory.create(config.queue_type, private_config.queue_table, config.queue_host, config.queue_port)
    # NOTE(review): the db is built via the queue factory here (other blocks
    # use a separate db_factory instance); presumably create() dispatches on
    # db_type — confirm.
    self.db=self.queue_factory.create(config.db_type, config.db_table_zhi_wei, config.db_host, config.db_port)
    self.browser_2 = self.browser_factory.create(config.browser_type)
    self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
def __init__(self):
    """Wire up collaborators for a NetEase link producer: logger, browser,
    proxy and the news / gov-news / comment queues.

    NOTE(review): orphaned __init__ — the enclosing class header is not
    visible in this chunk.
    """
    # Instantiate factory objects.
    self.queue_factory = QueueFactory()
    self.browser_factory = BrowserFactory()
    self.proxy_factory = ProxyFactory()
    # Instantiate concrete objects.
    self.log = Logging('./Log/log_zhengwen').get_logging()
    self.browser = self.browser_factory.create(config.browser_type)
    self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
    self.queue_news = self.queue_factory.create(config.queue_type, private_config.queue_news, config.queue_host, config.queue_port)
    self.queue_zhengwu = self.queue_factory.create(config.queue_type, private_config.queue_zhengwu_zhenwgen, config.queue_host, config.queue_port)
    self.queue_pinglun = self.queue_factory.create(config.queue_type, private_config.queue_pinglun, config.queue_host, config.queue_port)
def __init__(self):
    """Wire up collaborators for the people.cn (人民网) comment consumer:
    logger, browser, proxy, the comment-link queue and the comment store.

    NOTE(review): orphaned __init__ — the enclosing class header is not
    visible in this chunk.
    """
    # Instantiate factory objects.
    self.queue= QueueFactory()
    self.field_factory = FieldFactory(u'人民网')
    self.browser_factory = BrowserFactory()
    self.proxy_factory = ProxyFactory()
    self.db_factory = QueueFactory()
    # Instantiate concrete objects.
    self.log = Logging('./Log/ZW_Pinglun').get_logging()
    self.browser = self.browser_factory.create(config.browser_type)
    self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
    self.pinglun_queue = self.queue.create(config.queue_type, private_config.queue_table_pinglun, config.queue_host, config.queue_port)
    self.db = self.db_factory.create(config.db_type, config.db_table_news_pinglun, config.db_host, config.db_port)
class Pinglun_Consumer():
    """people.cn (人民网) comment consumer: pops
    '<changyan_url>@@@@@@<article_url>' pairs from the comment queue and
    stores every comment fetched from the Changyan (畅言) API."""

    def __init__(self):
        # Instantiate factory objects.
        self.queue= QueueFactory()
        self.field_factory = FieldFactory(u'人民网')
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/ZW_Pinglun').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
        self.pinglun_queue = self.queue.create(config.queue_type, private_config.queue_table_pinglun, config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_pinglun, config.db_host, config.db_port)

    def main(self):
        """Take one queue entry and crawl its comment thread; sleep when the
        queue is empty.

        NOTE(review): the log message says 1h but the sleep is 2h (60*60*2)
        — confirm which is intended.
        """
        href = self.pinglun_queue.get()
        if href:
            # Entry format: '<comment page url>@@@@@@<article url>'.
            self.news_link = href.split('@@@@@@')[-1]
            pinglun_link = href.split('@@@@@@')[0]
            self.ParseData(pinglun_link)
        else:
            # Queue is empty: back off.
            self.log.info(u'评论链接队列为空,休息1h!')
            time.sleep(60*60*2)

    def ParseData(self,pinglun_link):
        """Resolve the Changyan topic for *pinglun_link*, page through all
        comments (30 per page) and store each one."""
        # The topic source id is the '?id=' query parameter of the page URL.
        id= pinglun_link.split('?id=')[1]
        pinglun_link= urllib.quote(pinglun_link)
        link_p= 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrhbddTW&topic_url=%s&page_size=30&topic_source_id=%s'%(pinglun_link,id)
        try:
            html =self.browser.visit(link_p)
            data = json.loads(html)
            topic_id =data['topic_id']
            total_page_no =data['total_page_no']
            for page in xrange(1,total_page_no+1):
                link_p='http://changyan.sohu.com/api/2/topic/comments?&client_id=cyrhbddTW&page_size=30&topic_id=%s&page_no=%s'%(topic_id,str(page))
                html =self.browser.visit(link_p)
                data = json.loads(html)
                comments = data['comments']
                for comment in comments:
                    field = self.field_factory.create('ping_lun')
                    field.set('news_url',self.news_link)        # article URL
                    field.set('ping_lun_nei_rong',comment['content'])       # body
                    field.set('ping_lun_shi_jian',comment['create_time'])   # created
                    field.set('hui_fu_shu',comment['reply_count'])          # replies
                    field.set('dian_zan_shu',comment['support_count'])      # upvotes
                    field.set('ping_lun_id',comment['comment_id'])
                    field.set('yong_hu_ming',comment['passport']['nickname'])   # user name
                    field.set('yong_hu_deng_ji',comment['userScore']['level'])  # user level
                    field.set('yong_hu_sheng_fen',comment['ip_location'])       # user region
                    field.set('id', field.ping_lun_id)
                    data = field.make()
                    if data:
                        self.db.put(data)
                        self.log.info('save Pinglun success!')
        except Exception as e:
            self.log.info('%s page wrong!%s'%(pinglun_link,e))

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            return map(lambda x: x.strip(), filter(lambda x: x.strip(), texts))[pos]
        except:
            return None

    def datatransform(self, data):
        """Convert 'YYYY-MM-DD HH:MM:SS' to a POSIX timestamp (local time)."""
        timeStamp=time.mktime(time.strptime(data,'%Y-%m-%d %H:%M:%S'))
        return timeStamp
class Zhengwen_Producer():
    """Reads column links from ./lanmu_hrefs.txt and enqueues the article
    (正文) links found on the first pages of each m.people.cn column."""

    def __init__(self):
        # Instantiate factory objects.
        self.queue = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/ZW_producer').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
        # NOTE: self.queue is first the factory, then rebound to the queue it
        # creates (kept as-is for interface compatibility).
        self.queue = self.queue.create(config.queue_type, private_config.queue_table, config.queue_host, config.queue_port)

    def main(self, totalpage):
        """Enqueue article links for pages 1..totalpage-1 of every column
        listed in ./lanmu_hrefs.txt (each column holds ~20 pages of news)."""
        # BUGFIX: the file handle was opened and never closed.
        with open('./lanmu_hrefs.txt', 'r') as fh:
            hrefs = fh.readlines()
        for href in hrefs:
            self.GetPage(href, totalpage)
        self.log.info(u'文章链接灌入完毕!')

    def GetPage(self, href_1, totalpage):
        """Walk pages 1..totalpage-1 of one column and enqueue every article
        link as an absolute URL."""
        if href_1:
            http_h = 'http://m.people.cn'
            for page in xrange(1, totalpage):
                # Page N of a column is '<base><N>.html'.
                href = href_1.split('.html')[0] + str(page) + '.html'
                try:
                    html = self.browser.visit(href)
                    tree = etree.HTML(html)
                    xpath_next = tree.xpath('.//ul[@class="news_list news_list_c"]/li')
                    if len(xpath_next) > 0:
                        for li in xpath_next:
                            news_link = self.textxpath(li, './/@href')
                            # BUGFIX: skip items without an href instead of
                            # raising on the 'in' test and losing the rest of
                            # the page.
                            if news_link is None:
                                continue
                            if 'http://m.people.cn' not in news_link:
                                news_link = http_h + news_link
                            self.queue.put(news_link)
                    else:
                        self.log.info(u'栏目链接解析失败!')
                except Exception as e:
                    self.log.info('%s page wrong!%s' % (href, e))
        else:
            self.log.info(u'栏目连接为空!')

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        # FIX: avoid map(...)[pos] under a bare except (py3-fragile).
        stripped = [t.strip() for t in texts if t.strip()]
        try:
            return stripped[pos]
        except IndexError:
            return None
class Zhengwen_consumer():
    """NetEase (网易) article consumer: pops article URLs from the gov-news
    queue, parses each page, stores the article, and forwards the matching
    comment-page URL to the comment queue."""

    def __init__(self):
        # Instantiate factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # NOTE(review): db_factory is a QueueFactory — presumably create()
        # dispatches on db_type; confirm against the factory implementation.
        self.db_factory = QueueFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
        self.queue_zhengwu = self.queue_factory.create( config.queue_type, private_config.queue_zhengwu_zhenwgen, config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create( config.queue_type, private_config.queue_pinglun, config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen, config.db_host, config.db_port)

    def main(self):
        # Drain the gov-news queue; stop when it is empty.
        while (True):
            url = self.queue_zhengwu.get()
            if url:
                self.ParsePage(url)
            else:
                self.log.info('new queue is empty!')
                break
            time.sleep(random.randint(10, 20))  # throttle between pages

    def ParsePage(self, url):
        """Fetch one article page (GBK-encoded), extract its fields and store
        the record; the comment count is fetched as a side quest."""
        try:
            html = self.browser.visit(url, encoding='gbk')
            if html:
                # NOTE(review): field type 'si_chuan_news' reused for NetEase
                # articles — presumably a shared schema; confirm.
                field = self.field_factory.create('si_chuan_news')
                tree = etree.HTML(html)
                # Breadcrumb column names.
                lanmu = tree.xpath(
                    './/span[@class="ep-crumb JS_NTES_LOG_FE"]/a/text()')
                # Title.
                biaoti = self.textxpath(tree, './/head/title/text()')
                # Keywords meta tag.
                guanjianci = self.textxpath(
                    tree, './/head/meta[@name="keywords"]/@content')
                # Publication time (first 'YYYY-M-D HH:MM:SS' in the byline).
                shijian = self.textxpath(
                    tree, './/div[@class="ep-time-soure cDGray"]/text()')
                timestamp = 0
                if shijian:
                    shijian = re.findall(
                        '\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}:\d{2}', shijian)[0]
                    timestamp = self.datatransform(shijian)
                # Source outlet.
                laiyuan = self.textxpath(
                    tree, './/div[@class="ep-time-soure cDGray"]/a/text()')
                # Body text.
                wen_zhang_zheng_wen = xpathutil.get_Node_text(
                    tree, './/div[@id="endText"]/p')
                # Image links.
                tu_pian_lian_jie = tree.xpath(
                    './/div[@id="endText"]/p/img/@src')
                # Comment count (also enqueues the comment page).
                ping_lun_shu_liang = self.GetPinglun(url)
                field.set('wen_zhang_wang_zhi', url)
                field.set('wen_zhang_lan_mu', ' '.join(lanmu))
                field.set('wen_zhang_biao_ti', biaoti)
                field.set('guan_jian_ci', guanjianci)
                field.set('fa_bu_shi_jian', timestamp)  # timestamp format
                field.set('wen_zhang_lai_yuan', laiyuan)
                field.set('wen_zhang_zheng_wen', wen_zhang_zheng_wen)
                field.set('tu_pian_lian_jie', tu_pian_lian_jie)
                field.set('ping_lun_shu_liang', ping_lun_shu_liang)
                field.set('id', url)
                data = field.make()
                if data:
                    self.db.put(data)
                    self.log.info('save data sucess!')
            else:
                self.log.info('Parsing page wrong!')
        except Exception as e:
            self.log.info(e)
            print url  # dump the URL that failed to parse (py2 print)
            time.sleep(10)

    def GetPinglun(self, url):
        """Return the comment count for *url* via the NetEase comment SDK and,
        when non-zero, push '<comment_url>@@@@@@<article_url>' onto the
        comment queue."""
        pinglunshu = 0
        try:
            # The article uid is the last path segment before '.html'.
            uid = url.split('.html')[0].split('/')[-1]
            comment_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s' % uid
            html = self.browser.visit(comment_url)
            data = json.loads(html)
            if data["tcount"] != 0:
                pinglunshu = data["tcount"]
                uid = data["docId"]
                comment_url = 'http://comment.news.163.com/news_gov_bbs/%s.html' % uid
                self.queue_pinglun.put(comment_url + '@@@@@@' + url)
        except Exception as e:
            self.log.info('get pinglunshu wrong!%s' % e)
        return pinglunshu

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            return map(lambda x: x.strip(), filter(lambda x: x.strip(),
                                                   texts))[pos]
        except:
            return None

    def datatransform(self, data):
        """Convert 'YYYY-MM-DD HH:MM:SS' to a POSIX timestamp (local time)."""
        timeStamp = time.mktime(time.strptime(data, '%Y-%m-%d %H:%M:%S'))
        return timeStamp
class GetComment(object):
    """Sina Sichuan (新浪四川) comment consumer: pops
    '<comment_page>@@@@@@<article_url>' pairs from the comment queue, pages
    through the cmnt.sina.cn JSON API and stores every comment."""

    def __init__(self):
        # Instantiate factory objects.
        self.queue_redis= QueueFactory()
        self.field_factory = FieldFactory(u'新浪四川')
        self.browser_factory = BrowserFactory()
        self.db_factory = QueueFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/log_pl').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.queue_comment = self.queue_redis.create(config.queue_type, private_config.queue_comment, config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_pinglun, config.db_host, config.db_port)

    def main(self):
        # Drain the comment queue; sleep 1h whenever it comes up empty.
        while(True):
            link = self.queue_comment.get()
            if link:
                # Entry format: '<comment page url>@@@@@@<article url>'.
                pl_href = link.split('@@@@@@')[0]
                self.zw_href = link.split('@@@@@@')[-1]
                try:
                    html=self.browser.visit(pl_href,encoding='utf-8')
                    if html:
                        self.getContent(html)
                except Exception as e:
                    self.log.info(e)
            else:
                self.log.info('queue is empty!')
                time.sleep(60*60)

    def getContent(self,html):
        """Extract the news id from the comment page and page through the
        JSON comment feed until an empty page or an error ends the loop.

        NOTE(review): if the page has no cmnt_item, newsid is None and the
        .replace below raises — currently swallowed by the except/break.
        """
        tree =etree.HTML(html)
        newsid = self.textxpath(tree,'.//div[@class="cmnt_list"]/div[@class="cmnt_item"]/@data-newsid')
        page_num=1
        while(True):
            try:
                json_link = 'http://cmnt.sina.cn/aj/v2/index?product=comos&group=0&index=%s&page=%s'%(newsid.replace('comos-',''),str(page_num))
                page_content = self.browser.visit(json_link)
                datas=json.loads(page_content)
                data = datas['data']
                if data['data']:
                    data=data['data']
                    self.save_comment(data)
                    page_num+=1
                else:
                    # No more comments on this page: done.
                    break
            except Exception as e:
                self.log.info(e)
                break

    def save_comment(self,datas):
        """Store one page of comments (list of items with a 'main' record)."""
        for item in datas:
            field = self.field_factory.create('ping_lun')
            # Comment body.
            ping_lun_nei_rong = item['main']['content']
            # Comment time (possibly relative, e.g. 'N小时前').
            ping_lun_shi_jian = item['main']['time']
            timestamp = self.datatransform(ping_lun_shi_jian)
            # Upvote count.
            dian_zan_shu = item['main']['agree']
            # Comment id.
            ping_lun_id=item['main']['mid']
            # User nickname.
            yong_hu_ming = item['main']['nick']
            # User region/source.
            yong_hu_sheng_fen = item['main']['source']
            field.set('news_url',self.zw_href)
            field.set('ping_lun_nei_rong',ping_lun_nei_rong)
            field.set('ping_lun_shi_jian',int(timestamp))
            field.set('dian_zan_shu',dian_zan_shu)
            field.set('ping_lun_id',ping_lun_id)
            field.set('yong_hu_ming',yong_hu_ming)
            field.set('yong_hu_sheng_fen',yong_hu_sheng_fen)
            field.set('id',ping_lun_id)
            data = field.make()
            if data:
                self.db.put(data)
                self.log.info('save pinglun success!')

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            return map(lambda x: x.strip(), filter(lambda x: x.strip(), texts))[pos]
        except:
            return None

    def datatransform(self, data):
        """Convert a (possibly relative) Chinese timestamp to a POSIX
        timestamp: 'N月/天/小时/分钟' ago (a month counted as 30 days), else
        an absolute 'YYYY-MM-DD HH:MM' string."""
        if u'月' in data:
            sec = int(re.findall('\d+',data)[0]) *30*24*60*60
            dt = datetime.datetime.now() - datetime.timedelta(seconds=sec)
        elif u'天' in data:
            sec = int(re.findall('\d+',data)[0]) *24*60*60
            dt = datetime.datetime.now() - datetime.timedelta(seconds=sec)
        elif u'小时' in data:
            sec = int(re.findall('\d+',data)[0]) *60*60
            dt = datetime.datetime.now() - datetime.timedelta(seconds=sec)
        elif u'分钟' in data:
            sec = int(re.findall('\d+',data)[0]) *60
            dt = datetime.datetime.now() - datetime.timedelta(seconds=sec)
        else:
            dt = datetime.datetime.strptime(data, '%Y-%m-%d %H:%M')
        return time.mktime(dt.timetuple())
class Zhengwen_Consumer():
    """people.cn (人民网) article consumer: pops article URLs from the news
    queue, parses each page and stores the article; pages with comments are
    forwarded to the comment queue. When the queue empties it re-runs the
    producer after a 4h sleep."""

    def __init__(self):
        # Instantiate factory objects.
        self.queue= QueueFactory()
        self.field_factory = FieldFactory(u'人民网')
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()
        self.producer = Zhengwen_Producer()  # used to refill the queue
        # Instantiate concrete objects.
        self.log = Logging('./Log/ZW_consumer').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
        self.news_queue = self.queue.create(config.queue_type, private_config.queue_table, config.queue_host, config.queue_port)
        self.pinglun_queue = self.queue.create(config.queue_type, private_config.queue_table_pinglun, config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen, config.db_host, config.db_port)

    def main(self):
        # Take one article URL from the queue and crawl it; when the queue is
        # empty, sleep 4h then re-run the producer (10 pages per column).
        news_link = self.news_queue.get()
        if news_link:
            try:
                self.ParseData(news_link)
            except Exception as e:
                self.log.info('%s page wrong!%s'%(news_link,e))
        else:
            self.log.info(u'文章链接队列为空,休息4h!')
            time.sleep(60*60*4)
            self.producer.main(10)

    def ParseData(self,news_link):
        """Fetch one article page, extract its fields, store the record and
        enqueue the comment page when the article has comments."""
        html = self.browser.visit(news_link)
        tree = etree.HTML(html)
        field = self.field_factory.create('si_chuan_news')
        # Keywords meta tag (whitespace-separated list).
        guan_jian_ci = self.textxpath(tree,'.//meta[@name="keywords"]/@content')
        if guan_jian_ci is not None:
            field.set('guan_jian_ci',guan_jian_ci.split())
        # Source outlet and publication date from meta tags.
        lai_yuan = self.textxpath(tree,'.//meta[@name="source"]/@content')
        shi_jian = self.textxpath(tree,'.//meta[@name="publishdate"]/@content')
        time_stamp=self.datatransform(shi_jian)
        field.set('fa_bu_shi_jian', int(time_stamp))
        field.set('wen_zhang_lai_yuan', lai_yuan)
        # Column name from the page header breadcrumb.
        field.set('wen_zhang_lan_mu', self.textxpath(tree, './/header/em//a/text()'))
        # Image links inside the article body.
        tu_pian = tree.xpath('.//div[@class="wb_content"]/div[@id="p_content1"]//img/@src')
        field.set('tu_pian_lian_jie',tu_pian)
        # Comment count via the Changyan API.
        ping_lun_shu = self.getPingLun(news_link)
        field.set('ping_lun_shu_liang',ping_lun_shu)
        # Body text.
        text_all = xpathutil.get_all_text(tree, './/*[@id="p_content1"]')
        field.set('wen_zhang_zheng_wen',text_all)
        field.set('wen_zhang_biao_ti', self.textxpath(tree, './/div[@class="wb_content"]/h1/text()'))
        field.set('wen_zhang_wang_zhi', news_link)
        field.set('id', field.wen_zhang_wang_zhi)
        data = field.make()
        if data:
            self.db.put(data)
            self.log.info('save news success!')
        # Forward the comment page when the article has comments.
        if ping_lun_shu != 0:
            pinglun_href = self.textxpath(tree,'.//p[@class="all_pinglun"]/a/@href')
            self.pinglun_queue.put(pinglun_href+'@@@@@@'+news_link)

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            return map(lambda x: x.strip(), filter(lambda x: x.strip(), texts))[pos]
        except:
            return None

    def getPingLun(self,topic_url):
        """Return the Changyan comment count ('cmt_sum') for *topic_url*,
        or 0 when it cannot be fetched."""
        pinglunshu =0
        topic_url =urllib.quote(topic_url)
        link_p= 'http://changyan.sohu.com/api/2/topic/load?client_id=cyrhbddTW&topic_url=%s&page_size=3'%(topic_url)
        try:
            html =self.browser.visit(link_p)
            if 'cmt_sum' in html:
                data = json.loads(html)
                pinglunshu=data['cmt_sum']
        except Exception as e:
            self.log.info('get pinglunshu wrong!%s'%e)
        return pinglunshu

    def datatransform(self, data):
        """Convert a 'YYYY-MM-DD' date string to a POSIX timestamp
        (local midnight)."""
        timeStamp=time.mktime(time.strptime(data,'%Y-%m-%d'))
        return timeStamp
class Zhengwen_producer(object):
    """NetEase link producer: reads column feeds from the 'lanmu_href' file
    and enqueues article links (plus matching comment-page links) for
    downstream consumers; gov.163.com columns get their own queue."""

    def __init__(self):
        # Instantiate factory objects.
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
        self.queue_news = self.queue_factory.create(config.queue_type, private_config.queue_news, config.queue_host, config.queue_port)
        self.queue_zhengwu = self.queue_factory.create(config.queue_type, private_config.queue_zhengwu_zhenwgen, config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type, private_config.queue_pinglun, config.queue_host, config.queue_port)

    def main(self):
        """Dispatch every column link to the gov or news harvester."""
        lanmu_list = self.get_lanmu()
        try:
            for url in lanmu_list:
                if 'gov' in url:
                    self.getGovHref(url)
                else:
                    self.getNewsHref(url)
            self.log.info('getting the zhengwen href successful!')
        except Exception as e:
            self.log.info(e)
            time.sleep(10)

    def getNewsHref(self, url):
        """Page through one 'data_callback(...)' JS feed, enqueueing each
        article link and its comment-page companion, until a page is empty."""
        url_l = url
        page = 2
        try:
            while (True):
                html = self.browser.visit(url_l, encoding='gbk')
                if html:
                    # The feed is a JSONP-ish JS blob; strip whitespace, turn
                    # 'data_callback([...])' into '{"data":[...]}' and drop
                    # trailing commas so json.loads accepts it.
                    html = html.replace(' ', '').replace('\n', '')
                    html = str(html).replace('data_callback([', '{"data":[').replace('])', ']}')
                    html = re.sub(r",\s*?]", "]", html)
                    datas = json.loads(html, encoding='utf-8')
                    for data in datas['data']:
                        docurl = data['docurl']          # article link
                        commenturl = data['commenturl']  # comment page link
                        if commenturl:
                            self.queue_pinglun.put(commenturl + '@@@@@@' + docurl)
                        else:
                            # Derive the comment page from the article uid.
                            uid = docurl.split('.html')[0].split('/')[-1]
                            commenturl = 'http://comment.news.163.com/news3_bbs/%s.html' % uid
                            self.queue_pinglun.put(commenturl + '@@@@@@' + docurl)
                        self.queue_news.put(docurl)
                else:
                    break
                # Next page feed: '<base>_NN.js'.
                url_l = url.split('.js?')[0] + '_%s.js' % str(page).zfill(2)
                page += 1
        except Exception as e:
            self.log.info(e)

    def getGovHref(self, url):
        """Walk the paginated gov.163.com column (17 pages for 'zwzx',
        otherwise 5) and enqueue every article link onto the gov queue."""
        url_l = url
        try:
            if 'zwzx' in url:
                totalpage = 17
            else:
                totalpage = 5
            for page in xrange(2, totalpage):
                html = self.browser.visit(url_l, encoding='gbk')
                tree = etree.HTML(html)
                hrefs = tree.xpath('.//div[@class="cnt"]/ul/li/a/@href')
                if hrefs:
                    for href in hrefs:
                        self.queue_zhengwu.put(href)
                # Next page: '<base>_NN/'.
                url_l = url[0:-1] + '_%s/' % str(page).zfill(2)
        except Exception as e:
            self.log.info(e)

    def get_lanmu(self):
        """Read column links (one per line) from the 'lanmu_href' file."""
        lanmu_hrefs = []
        # BUGFIX: close the file deterministically ('file' used to shadow the
        # builtin and the handle was never closed).
        with open('lanmu_href', 'r') as fh:
            for line in fh:
                lanmu_hrefs.append(line.split('\n')[0])
        return lanmu_hrefs

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        # FIX: avoid map(...)[pos] under a bare except (py3-fragile).
        stripped = [t.strip() for t in texts if t.strip()]
        try:
            return stripped[pos]
        except IndexError:
            return None
class Zhengwen_consumer():
    """NetEase (网易) article consumer (duplicate of the formatted copy
    earlier in this file): pops article URLs from the gov-news queue, parses
    and stores each article, and forwards comment pages to the comment
    queue."""

    def __init__(self):
        # Instantiate factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # NOTE(review): db_factory is a QueueFactory — presumably create()
        # dispatches on db_type; confirm against the factory implementation.
        self.db_factory = QueueFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
        self.queue_zhengwu = self.queue_factory.create(config.queue_type, private_config.queue_zhengwu_zhenwgen, config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type, private_config.queue_pinglun, config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen, config.db_host, config.db_port)

    def main(self):
        # Drain the gov-news queue; stop when it is empty.
        while(True):
            url = self.queue_zhengwu.get()
            if url:
                self.ParsePage(url)
            else:
                self.log.info('new queue is empty!')
                break
            time.sleep(random.randint(10,20))  # throttle between pages

    def ParsePage(self,url):
        """Fetch one article page (GBK-encoded), extract its fields and store
        the record; the comment count is fetched as a side quest."""
        try:
            html = self.browser.visit(url,encoding='gbk')
            if html:
                field =self.field_factory.create('si_chuan_news')
                tree=etree.HTML(html)
                # Breadcrumb column names.
                lanmu = tree.xpath('.//span[@class="ep-crumb JS_NTES_LOG_FE"]/a/text()')
                # Title.
                biaoti = self.textxpath(tree,'.//head/title/text()')
                # Keywords meta tag.
                guanjianci = self.textxpath(tree,'.//head/meta[@name="keywords"]/@content')
                # Publication time (first 'YYYY-M-D HH:MM:SS' in the byline).
                shijian = self.textxpath(tree,'.//div[@class="ep-time-soure cDGray"]/text()')
                timestamp=0
                if shijian:
                    shijian = re.findall('\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}:\d{2}',shijian)[0]
                    timestamp = self.datatransform(shijian)
                # Source outlet.
                laiyuan = self.textxpath(tree,'.//div[@class="ep-time-soure cDGray"]/a/text()')
                # Body text.
                wen_zhang_zheng_wen = xpathutil.get_Node_text(tree,'.//div[@id="endText"]/p')
                # Image links.
                tu_pian_lian_jie = tree.xpath('.//div[@id="endText"]/p/img/@src')
                # Comment count (also enqueues the comment page).
                ping_lun_shu_liang = self.GetPinglun(url)
                field.set('wen_zhang_wang_zhi',url)
                field.set('wen_zhang_lan_mu',' '.join(lanmu))
                field.set('wen_zhang_biao_ti',biaoti)
                field.set('guan_jian_ci',guanjianci)
                field.set('fa_bu_shi_jian',timestamp)  # timestamp format
                field.set('wen_zhang_lai_yuan',laiyuan)
                field.set('wen_zhang_zheng_wen',wen_zhang_zheng_wen)
                field.set('tu_pian_lian_jie',tu_pian_lian_jie)
                field.set('ping_lun_shu_liang',ping_lun_shu_liang)
                field.set('id',url)
                data =field.make()
                if data:
                    self.db.put(data)
                    self.log.info('save data sucess!')
            else:
                self.log.info('Parsing page wrong!')
        except Exception as e:
            self.log.info(e)
            print url  # dump the URL that failed to parse (py2 print)
            time.sleep(10)

    def GetPinglun(self,url):
        """Return the comment count for *url* via the NetEase comment SDK and,
        when non-zero, push '<comment_url>@@@@@@<article_url>' onto the
        comment queue."""
        pinglunshu=0
        try:
            # The article uid is the last path segment before '.html'.
            uid = url.split('.html')[0].split('/')[-1]
            comment_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s'%uid
            html=self.browser.visit(comment_url)
            data =json.loads(html)
            if data["tcount"]!=0:
                pinglunshu = data["tcount"]
                uid =data["docId"]
                comment_url = 'http://comment.news.163.com/news_gov_bbs/%s.html'%uid
                self.queue_pinglun.put(comment_url+'@@@@@@'+url)
        except Exception as e:
            self.log.info('get pinglunshu wrong!%s'%e)
        return pinglunshu

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped string matched by *path*
        under *tree*, or None when there is no such match."""
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            return map(lambda x: x.strip(), filter(lambda x: x.strip(), texts))[pos]
        except:
            return None

    def datatransform(self, data):
        """Convert 'YYYY-MM-DD HH:MM:SS' to a POSIX timestamp (local time)."""
        timeStamp=time.mktime(time.strptime(data,'%Y-%m-%d %H:%M:%S'))
        return timeStamp
class Zhengwen_consumer():
    """NetEase comment consumer: pops '<comment_url>@@@@@@<article_url>'
    pairs from the comment queue, pages through the comment (跟帖) API and
    stores every comment with its quoted ancestor chain."""

    def __init__(self):
        # Instantiate factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()
        # Instantiate concrete objects.
        self.log = Logging('./Log/log_pinglun').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area, config.proxy_host, config.proxy_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type, private_config.queue_pinglun, config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen, config.db_host, config.db_port)

    def main(self):
        # Drain the comment queue; each entry is comment_url@@@@@@article_url.
        while(True):
            url = self.queue_pinglun.get()
            if url:
                comment_url,self.docurl=url.split('@@@@@@')
                try:
                    if comment_url:
                        self.GetPage(comment_url)
                except Exception as e:
                    self.log.info(e)
            else:
                self.log.info('new queue is empty!')
                break
            time.sleep(random.randint(10,20))  # throttle between threads

    def GetPage(self,comment_url):
        """Page through the newList endpoint 30 comments at a time until a
        page comes back empty."""
        # The thread uid is the last path segment before '.html'.
        uid = comment_url.split('.html')[0].split('/')[-1]
        offset = 0
        limit = 30
        while(True):
            js_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867' \
                     'c3d71db5856/threads/%s/comments/newList?offset=%s&limit=%s'%(uid,str(offset),str(limit))
            html = self.browser.visit(js_url)
            datas = json.loads(html)
            newListSize =datas["newListSize"]
            if newListSize!=0:
                self.ParseData(datas)
                limit = 30
                offset += limit
                time.sleep(random.randint(5,10))
            else:
                break

    def ParseData(self,datas):
        """Store one page of comments. Each commentIds entry is a reply chain
        'id1,id2,...,idN': the last id is the comment being stored, the
        earlier ids are its quoted ancestors (concatenated into the body)."""
        commentIds = datas['commentIds']
        comments = datas['comments']
        for ids in commentIds:
            idList=ids.split(',')
            id_n = idList[-1]  # last id is the current post
            field =self.field_factory.create('ping_lun')
            # Article the comment belongs to.
            field.set('news_url',self.docurl)
            # Creation time; NOTE(review): the local name shadows the time
            # module inside this method.
            time=comments[id_n]['createTime']
            field.set('ping_lun_shi_jian',self.datatransform(time))
            # Reply count (not provided by this API: stored as 0).
            field.set('hui_fu_shu',0)
            # Upvote count.
            field.set('dian_zan_shu',comments[id_n]['vote'])
            # Comment id.
            field.set('ping_lun_id',id_n)
            # User nickname.
            field.set('yong_hu_ming',comments[id_n]['user']['nickname'])
            # User region.
            field.set('yong_hu_sheng_fen',comments[id_n]['user']['location'])
            # Concatenate the whole quoted chain into the stored body.
            content_all =u''
            for id in idList:
                if id in comments.keys():
                    content=comments[id]['content']
                    content_all+=content
            field.set('ping_lun_nei_rong',content_all)
            field.set('id',id_n)
            data =field.make()
            if data:
                self.db.put(data)
                self.log.info('save data sucess!')

    def datatransform(self, data):
        """Convert 'YYYY-MM-DD HH:MM:SS' to a POSIX timestamp (local time)."""
        timeStamp=time.mktime(time.strptime(data,'%Y-%m-%d %H:%M:%S'))
        return timeStamp
class Zhengwen_consumer():
    # NOTE(review): this class is a byte-for-byte duplicate of the
    # Zhengwen_consumer defined earlier in this file; being the later
    # definition, this is the one in effect after import.
    """Pulls NetEase comment-page URLs off the queue, walks the paged
    comment API per article, and persists each parsed comment record."""

    def __init__(self):
        # Factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()
        # Concrete collaborators built from the factories.
        self.log = Logging('./Log/log_pinglun').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type,
                                                       private_config.queue_pinglun,
                                                       config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen,
                                         config.db_host, config.db_port)

    def main(self):
        """Pop 'comment_url@@@@@@docurl' items until the queue runs dry."""
        while True:
            item = self.queue_pinglun.get()
            if not item:
                self.log.info('new queue is empty!')
                break
            # Example item: http://comment.news.163.com/news_gov_bbs/BTHNT83600234IG8.html
            comment_url, self.docurl = item.split('@@@@@@')
            try:
                if comment_url:
                    self.GetPage(comment_url)
            except Exception as e:
                self.log.info(e)
            time.sleep(random.randint(10, 20))

    def GetPage(self, comment_url):
        """Walk the paged comment API for a single article."""
        thread_id = comment_url.split('.html')[0].split('/')[-1]
        offset, limit = 0, 30
        while True:
            api = ('http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867'
                   'c3d71db5856/threads/%s/comments/newList?offset=%s&limit=%s'
                   % (thread_id, str(offset), str(limit)))
            payload = json.loads(self.browser.visit(api))
            if payload["newListSize"] == 0:
                break
            self.ParseData(payload)
            limit = 30
            offset += limit
            time.sleep(random.randint(5, 10))

    def ParseData(self, datas):
        """Turn each comment id-chain into one field record and persist it."""
        comments = datas['comments']
        for ids in datas['commentIds']:
            chain = ids.split(',')
            current = chain[-1]  # last id in the chain is the post itself
            record = self.field_factory.create('ping_lun')
            record.set('news_url', self.docurl)
            created = comments[current]['createTime']
            record.set('ping_lun_shi_jian', self.datatransform(created))
            record.set('hui_fu_shu', 0)
            record.set('dian_zan_shu', comments[current]['vote'])
            record.set('ping_lun_id', current)
            record.set('yong_hu_ming', comments[current]['user']['nickname'])
            record.set('yong_hu_sheng_fen', comments[current]['user']['location'])
            # Join the quoted-reply chain's texts into a single string.
            pieces = [comments[cid]['content'] for cid in chain if cid in comments.keys()]
            record.set('ping_lun_nei_rong', u''.join(pieces))
            record.set('id', current)
            data = record.make()
            if data:
                self.db.put(data)
                self.log.info('save data sucess!')

    def datatransform(self, data):
        """Parse '2016-08-08 10:01:56'-style text into a local-time Unix
        timestamp."""
        return time.mktime(time.strptime(data, '%Y-%m-%d %H:%M:%S'))
class Consumer(object):
    """Consume zhiweidata event-detail URLs from the queue, fetch each
    event's JSON via http://ef.zhiweidata.com/Baike and store the parsed
    record in the DB."""

    def __init__(self):
        # Factory objects.
        self.field_factory = FieldFactory(u'zhi_wei_shi_jian')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Concrete collaborators.
        self.log = Logging('./Log/zhiwei_message').get_logging()
        self.queue = self.queue_factory.create(config.queue_type, private_config.queue_table,
                                               config.queue_host, config.queue_port)
        # NOTE(review): the DB handle is created via the queue factory (there
        # is no separate DB factory here) — confirm this is intended.
        self.db = self.queue_factory.create(config.db_type, config.db_table_zhi_wei,
                                            config.db_host, config.db_port)
        self.browser_2 = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)

    def main(self):
        """Drain the URL queue; sleep 60s between items."""
        while True:
            href = self.queue.get()
            if href:
                cookie = 'nocookie'
                try:
                    html = self.Get_page(href, cookie_j=cookie)
                    if html:
                        self.Parse_data(html, href)
                except Exception as e:
                    # FIX: include the exception in the log message (it was
                    # silently dropped), matching the Producer's error style.
                    self.log.info('parse data wrong!%s' % e)
            else:
                self.log.info('queue is empty!')
                break
            time.sleep(60)

    def Parse_data(self, data, href):
        """Map one event's JSON payload onto a 'zhi_wei_shi_jian' record and
        store it."""
        field = self.field_factory.create('zhi_wei_shi_jian')
        field.set('biao_ti', data.get('base').get('name'))            # title
        field.set('ye_mian_lian_jie', href)                           # page URL
        field.set('shi_jian_biao_qian', data.get('base').get('tag'))  # event tags
        field.set('shi_jian_qu_shi', data.get('base').get('trend'))   # event trend
        # Participating media / weibo big-V lists come as '、'-separated text.
        # FIX: split on a unicode separator — on Python 2, splitting a unicode
        # value on the byte-string '、' raises UnicodeDecodeError (the try
        # blocks below were always hitting the except path).
        try:
            field.set('can_yu_mei_ti', data.get('base').get('media').split(u'、'))
        except Exception as e:
            self.log.info('get can_yu_mei_ti wrong!')
        try:
            field.set('can_yu_wei_bo_da_V', data.get('base').get('weibo').split(u'、'))
        except Exception as e:
            self.log.info('get can_yu_wei_bo_da_V wrong!')
        field.set('ying_xiang_li_zhi_shu', data.get('base').get('index'))  # influence index
        field.set('tong_lei_shi_jian', data.get('ration'))                 # similar events
        # Related events: [time labels, time values] from the thread section.
        message_2 = []
        message_2.append(data.get('thread').get('time'))
        message_2.append(data.get('thread').get('timevalue'))
        field.set('guan_lian_shi_jian', message_2)
        field.set('chuan_bo_tu_xing', data.get('thread').get('line'))    # spread graph
        field.set('guan_jian_shi_jian', data.get('thread').get('info'))  # key events
        field.set('rawdata', data)
        field.set('id', href)
        save_data = field.make()
        if save_data:
            self.db.put(save_data)
            self.log.info('save data sucess!%s' % time.ctime())
        else:
            self.log.error(u'数据生成失败!')

    def Get_page(self, url, cookie_j):
        """Fetch a URL through the browser and decode the JSON body."""
        headers = self.Get_header(refer=url, cookie_j=cookie_j)
        html = self.browser_2.visit(url=url, headers=headers, timeout=60, retry=5)
        html = json.loads(html)
        return html

    def Get_header(self, refer, cookie_j):
        """Static request headers for ef.zhiweidata.com.

        `refer` and `cookie_j` are accepted but currently unused.
        """
        headers = {
            'Host': 'ef.zhiweidata.com',
            'User-Agent': 'Explore/4.9.4 (iPhone; iOS 7.1.2; Scale/2.00)',
            'Accept': '*/*',
            'Accept-Language': 'zh-Hans;q=1',
            'Accept-Encoding': "gzip, deflate",
            'Connection': "keep-alive"
        }
        return headers
class Zhengwen_producer(object):
    """Walk NetEase column listing pages (read from the 'lanmu_href' file)
    and push article / comment / gov-news URLs onto the respective queues."""

    def __init__(self):
        # Factory objects.
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Concrete collaborators.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.queue_news = self.queue_factory.create(config.queue_type, private_config.queue_news,
                                                    config.queue_host, config.queue_port)
        self.queue_zhengwu = self.queue_factory.create(config.queue_type,
                                                       private_config.queue_zhengwu_zhenwgen,
                                                       config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type,
                                                       private_config.queue_pinglun,
                                                       config.queue_host, config.queue_port)

    def main(self):
        """Dispatch each column URL to the gov or news href collector."""
        lanmu_list = self.get_lanmu()
        try:
            for url in lanmu_list:
                if 'gov' in url:
                    self.getGovHref(url)
                else:
                    self.getNewsHref(url)
            self.log.info('getting the zhengwen href successful!')
        except Exception as e:
            self.log.info(e)
        time.sleep(10)

    def getNewsHref(self, url):
        """Collect article + comment URLs from a news column's JSONP feed,
        following the `_02.js`, `_03.js`, ... pagination until a fetch fails."""
        url_l = url
        page = 2
        try:
            while True:
                html = self.browser.visit(url_l, encoding='gbk')
                if not html:
                    break
                # The feed is JSONP: strip whitespace, unwrap
                # data_callback([...]) and drop trailing commas so it parses.
                html = html.replace(' ', '').replace('\n', '')
                html = str(html).replace('data_callback([', '{"data":[').replace('])', ']}')
                html = re.sub(r",\s*?]", "]", html)
                datas = json.loads(html, encoding='utf-8')
                for data in datas['data']:
                    docurl = data['docurl']          # article URL
                    commenturl = data['commenturl']  # comment-page URL
                    if not commenturl:
                        # Derive the comment URL from the article id when absent.
                        uid = docurl.split('.html')[0].split('/')[-1]
                        commenturl = 'http://comment.news.163.com/news3_bbs/%s.html' % uid
                    self.queue_pinglun.put(commenturl + '@@@@@@' + docurl)
                    self.queue_news.put(docurl)
                url_l = url.split('.js?')[0] + '_%s.js' % str(page).zfill(2)
                page += 1
        except Exception as e:
            self.log.info(e)

    def getGovHref(self, url):
        """Collect gov-news article URLs from paginated list pages, e.g.
        http://gov.163.com/special/zwzx_n/ → .../zwzx_n_02/ ..."""
        url_l = url
        try:
            # Column-specific page counts.
            totalpage = 17 if 'zwzx' in url else 5
            # NOTE(review): xrange(2, totalpage) fetches pages 1..totalpage-2,
            # so the last numbered page is never visited — confirm intended.
            for page in xrange(2, totalpage):
                html = self.browser.visit(url_l, encoding='gbk')
                tree = etree.HTML(html)
                hrefs = tree.xpath('.//div[@class="cnt"]/ul/li/a/@href')
                if hrefs:
                    for href in hrefs:
                        self.queue_zhengwu.put(href)
                url_l = url[0:-1] + '_%s/' % str(page).zfill(2)
        except Exception as e:
            self.log.info(e)

    def get_lanmu(self):
        """Read column URLs, one per line, from the 'lanmu_href' file."""
        # FIX: close the file (it was left open) and stop shadowing the
        # builtin name `file`.
        with open('lanmu_href', 'r') as fp:
            return [line.split('\n')[0] for line in fp.readlines()]

    def textxpath(self, tree, path, pos=0):
        """Return the pos-th non-empty, stripped text matched by `path` on
        `tree`, or None when nothing matches or `pos` is out of range."""
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # FIX: list comprehension instead of map()/filter() so the
            # subscript works on Python 3 as well as Python 2 (a py3 map
            # object is not indexable and always fell into the except).
            return [x.strip() for x in texts if x.strip()][pos]
        except Exception:
            return None