def parse_news(self, response):
    # populate the rest of the article
    article = response.meta['article']
    article['url'] = response.url

    # news contents
    contents = ' '.join(
        response.xpath(
            '//div[@class="TRS_Editor"]//p/text()').extract()).strip()

    # news agency: collapse whitespace, then drop the leading
    # three-character label (presumably "来源：")
    agency_ = response.xpath(
        '//div[@class="laiyuan"]//span[@id="articleSource"]/text()'
    ).extract()[0].replace(' ', '').replace('\n', '')
    agency = agency_[3:]

    # get keywords and tagged_text
    rake = ChRake()
    keywords_list = rake.run(contents)
    keywords = '\n'.join(keywords_list)
    tagged_text = rake.get_tagged_text()

    # populate agency, contents, keywords
    article['agency'] = agency
    article['contents'] = contents
    article['keywords'] = keywords
    article['tagged_text'] = tagged_text

    # map the URL path segment to a news category
    if 'cysc' in response.url:
        article['category'] = '产业市场'
    elif 'sjjj' in response.url:
        article['category'] = '国际经济'
    elif 'district' in response.url:
        article['category'] = '地方经济'
    elif 'gnsz' in response.url:
        article['category'] = '国内时政'
    elif 'shgj' in response.url:
        article['category'] = '社会'
    elif 'qqss' in response.url:
        article['category'] = '全球时事'
    elif 'finance' in response.url:
        article['category'] = '经济'
    elif 'specials' in response.url:
        article['category'] = '独家专稿'
    else:
        article['category'] = '其他'

    yield article
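# The ChRake calls above (run() for the keyword list, get_tagged_text() for
# the tagged text) are repeated verbatim in every parse_news below. A minimal
# refactoring sketch, assuming ChRake behaves exactly as it is used in this
# file; the helper name _extract_keywords is hypothetical, not part of the
# original codebase:
def _extract_keywords(self, text):
    rake = ChRake()
    keywords_list = rake.run(text)
    # one keyword per line, matching the storage format used above
    return '\n'.join(keywords_list), rake.get_tagged_text()
# usage: keywords, tagged_text = self._extract_keywords(contents)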
def parse_news(self, response):
    article = response.meta['article']
    news_date = time.strftime('%Y-%m-%d %H:%M:%S',
                              self.parse_date(article['date']))

    # article body, with a fallback XPath for the alternate page layout
    news_content = response.xpath(
        './/div[@id="center"]/div[@id="article"]/div[@class="article"]//text()'
    ).extract()
    news_content = ' '.join(news_content).strip()
    if len(news_content) < 5:
        news_content = response.xpath(
            './/div[@class="main pagewidth"]/div[@id="content"]/p/text()'
        ).extract()
        news_content = ' '.join(news_content).strip()

    # source agency, again with a fallback for the alternate layout
    news_author = response.xpath(
        './/div[@id="center"]/div[@id="article"]//em[@id="source"]//text()'
    ).extract()
    if len(news_author) > 0:
        news_author = news_author[0]
    else:
        news_author = response.xpath(
            './/div[@class="main_tit"]/div[@class="info"]/span[@id="source"]//text()'
        ).extract()[0]

    # get keywords and tagged_text
    rake = ChRake()
    keywords_list = rake.run(news_content)
    keywords = '\n'.join(keywords_list)
    tagged_text = rake.get_tagged_text()

    # populate
    article['contents'] = news_content
    article['date'] = news_date
    article['agency'] = news_author
    article['keywords'] = keywords
    article['tagged_text'] = tagged_text
    yield article

    # request the comment JSON for this article
    comment_url = ('http://comment.home.news.cn/a/newsComm.do'
                   '?_ksTS=1444922731622_49&callback=jsonp50&newsId='
                   + article['aid'])
    print comment_url
    req = scrapy.Request(comment_url,
                         callback=self.parse_comment,
                         dont_filter=True)
    req.meta['newsId'] = article['aid']
    yield req
    # comment_check_url = 'http://m.news.naver.com/api/comment/count.json'
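# parse_news above assumes a parse_date helper that converts the date string
# scraped from the list page into a time.struct_time for time.strftime. A
# minimal sketch under that assumption; the candidate formats are guesses,
# not confirmed against the actual list-page markup:
def parse_date(self, date_str):
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', '%Y-%m-%d'):
        try:
            return time.strptime(date_str.strip(), fmt)
        except ValueError:
            continue
    # fall back to the crawl time when no known format matches
    return time.localtime()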
def parse_next_page(self, response):
    try:
        article = response.meta['article']
        content = response.meta['contents']
        content_1 = response.xpath('//*[@id="text"]/p/text()').extract()
        content_1_1 = ''.join(content_1)

        # merge this page's content with the previous pages' content
        content_2 = content + content_1_1

        # get keywords and tagged_text
        rake = ChRake()
        keywords_list = rake.run(content_2)
        keywords = '\n'.join(keywords_list)
        tagged_text = rake.get_tagged_text()

        # populate
        article['contents'] = content_2
        article['keywords'] = keywords
        article['tagged_text'] = tagged_text

        # the current page number is the single digit before ".html" in
        # the URL; the pager anchor at index page+1 is either the next
        # page link or the trailing "下一页" ("next page") label
        this_page = response.url
        count = this_page[-6:-5]
        count_1 = int(count) + 1
        str_1 = '//*[@id="pages"]/a[' + str(count_1) + ']/text()'
        str_2 = '//*[@id="pages"]/a[' + str(count_1) + ']/@href'
        count_2 = response.xpath(str_1).extract()

        # if that anchor is the "下一页" label rather than a page number,
        # this is the last content page, so emit the finished article
        if u'\u4e0b\u4e00\u9875' in count_2:
            yield article
        else:
            next_url = response.xpath(str_2).extract()
            next_url_1 = str(next_url[0])
            req = scrapy.Request(next_url_1, callback=self.parse_next_page)
            req.meta['article'] = article
            req.meta['contents'] = content_2
            req.meta['keywords'] = keywords
            req.meta['tagged_text'] = tagged_text
            yield req
    except Exception:
        print 'parse_next_page ERROR: ' + response.url
        traceback.print_exc(file=sys.stdout)
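# The slice this_page[-6:-5] above assumes a single-digit page number right
# before ".html" and breaks from page 10 onward. A more robust sketch using
# a regex, assuming the paginated URLs end in "<n>.html" (inferred from the
# slice, not verified) and that `import re` sits at the top of the module:
def _page_number(self, url):
    # return the trailing page number, or 1 when the URL has none
    m = re.search(r'(\d+)\.html$', url)
    return int(m.group(1)) if m else 1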
def parse_news(self, response):
    try:
        # get the rest of the article
        article = response.meta['article']
        agency_list = response.xpath(
            '//div[@class="clearfix w1000_320 text_title"]'
            '//div[@class="box01"]//div[@class="fl"]//a/text()').extract()
        article['agency'] = ''.join(agency_list).strip()
        content_1 = response.xpath('//*[@id="rwb_zw"]/p/text()').extract()

        # get the category of the news from the URL; the pairs are checked
        # in order, because specific hosts such as "it.people" must win
        # over the generic "www.people"
        category_url = response.url
        category_map = [
            ('world', '国际'), ('politics', '时政'), ('finance', '财经'),
            ('money', '金融'), ('energy', '能源'), ('legal', '法治'),
            ('society', '社会'), ('hm', '港澳'), ('pic', '图片'),
            ('tw', '台湾'), ('sports', '体育'), ('military', '军事'),
            ('health', '健康'), ('theory', '理论'), ('opinion', '观点'),
            ('media', '传媒'), ('ent', '娱乐'), ('it.people', 'IT'),
            ('env', '环保'), ('tc', '通信'), ('homea', '家电'),
            ('house', '房产'), ('ccnews', '央企'), ('scitech', '科技'),
            ('culture', '文化'), ('yuqing', '舆情'), ('lady', '时尚'),
            ('game', '游戏'), ('comic', '动漫'), ('npc.people', '人大新闻'),
            ('usa.people', '美国'), ('shipin', '食品'),
            ('edu.people', '教育'), ('gongyi', '公益'), ('jiaju', '家居'),
            ('qipai', '棋牌'), ('www.people', '人民微博'),
        ]
        article['category'] = '其他'
        for key, name in category_map:
            if key in category_url:
                article['category'] = name
                break

        # get keywords and tagged_text
        content = ''.join(content_1).replace(' ', '')
        rake = ChRake()
        keywords_list = rake.run(content)
        keywords = '\n'.join(keywords_list)
        tagged_text = rake.get_tagged_text()

        article['contents'] = content
        article['keywords'] = keywords
        article['tagged_text'] = tagged_text

        # follow the pager when the article spans several pages; the final
        # article is then emitted by parse_next_page instead
        if response.xpath('//*[@id="rwb_zw"]/center/table/tbody/tr/td/a/text()'):
            next_url_0 = response.xpath(
                '//*[@id="rwb_zw"]/div[2]/a[2]/@href').extract()
            pos = category_url.find("/n/")
            next_url = category_url[:pos] + str(next_url_0[0])
            req = scrapy.Request(next_url,
                                 callback=self.parse_next_page,
                                 dont_filter=self.dont_filter)
            req.meta['article'] = article
            req.meta['contents'] = ''.join(content_1).strip()
            yield req
        else:
            yield article

        # get the JSON url of the comments
        comment_json_url = ('http://bbs1.people.com.cn/api/news.do'
                            '?action=lastNewsComments&newsId='
                            + article['aid'])
        req = scrapy.Request(comment_json_url,
                             callback=self.parse_comment,
                             dont_filter=self.dont_filter)
        yield req
    except Exception:
        print 'parse_news ERROR, URL: ' + article['url']
        traceback.print_exc(file=sys.stdout)
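# parse_news above hands the comment JSON to a parse_comment callback that is
# not shown in this section. A minimal sketch of such a callback, assuming the
# endpoint returns plain JSON and that `import json` sits at the top of the
# module; the 'replayList' key is purely hypothetical, not taken from the
# actual API:
def parse_comment(self, response):
    try:
        data = json.loads(response.body)
    except ValueError:
        print 'parse_comment: response is not valid JSON: ' + response.url
        return
    # iterate whatever comment list the API returns (hypothetical key)
    for comment in data.get('replayList', []):
        yield comment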
def parse_news(self, response):
    time.sleep(1)
    try:
        # populate the rest of the article; the article id is the URL
        # basename without the ".html" suffix
        article = response.meta['article']
        aid = str(article['url'])[article['url'].rfind('/') + 1:-5]
        title = response.xpath(
            '//div[@class="post_content_main"]//h1/text()').extract()
        agency = response.xpath(
            '//div[@class="post_time_source"]//a[@id="ne_article_source"]/text()'
        ).extract()
        content = response.xpath(
            '//div[@class="post_text"]//p/text()').extract()
        contents = ''.join(content)

        # get keywords and tagged_text
        rake = ChRake()
        keywords_list = rake.run(contents)
        keywords = '\n'.join(keywords_list)
        tagged_text = rake.get_tagged_text()

        # populate
        article['title'] = title[0]
        article['agency'] = agency[0]
        article['aid'] = aid
        article['contents'] = contents
        article['keywords'] = keywords
        article['tagged_text'] = tagged_text
        yield article

        # start comment parsing: request the first page of the JSONP
        # comment API for this article
        news_url = article['url']
        print '=====================' + news_url
        comment_url = (
            'http://comment.news.163.com/api/v1/products/'
            'a2869674571f77b5a0867c3d71db5856/threads/' + aid +
            '/comments/newList?offset=0&limit=30&showLevelThreshold=72'
            '&headLimit=1&tailLimit=2&callback=getData&ibc=newspc')
        req = scrapy.Request(comment_url,
                             callback=self.parse_comment,
                             dont_filter=self.dont_filter)
        req.meta['aid'] = aid
        yield req
    except Exception:
        print 'parse_news ERROR, URL: ' + article['url']
        traceback.print_exc(file=sys.stdout)
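# The request above passes callback=getData, so the 163 comment API responds
# with JSONP ("getData({...})") rather than bare JSON. A minimal unwrapping
# sketch for the parse_comment callback, assuming exactly that wrapper and
# `import json` at the top of the module; no structure inside the payload is
# assumed:
def parse_comment(self, response):
    body = response.body.strip()
    # strip the "getData(" prefix and trailing ")" to recover the JSON
    if body.startswith('getData(') and body.endswith(')'):
        body = body[len('getData('):-1]
    try:
        data = json.loads(body)
    except ValueError:
        print 'parse_comment: could not decode ' + response.url
        return
    # the payload schema is left unspecified here; attach the article id
    # from request meta before storing or yielding items
    print 'got comment payload for article ' + response.meta['aid']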