def parse_news(self, response):
    """Load a news item from ``response`` and request its comments.

    Fills a ``NewsLoader`` with the page url, title, source link, source
    name, datetime, the fixed channel ``u'qq'`` and the comment id captured
    from the page body by the ``cmt_id`` pattern. Yields the loaded item,
    then yields a ``scrapy.Request`` (``commentid=0``, ``reqnum=10``) whose
    callback is ``self.parse_comment``.

    The comment request is skipped when no comment id was extracted, so a
    URL built from an empty value is never requested.
    """
    source_link = 'div.main .hd .tit-bar .ll .color-a-1 a[href]'

    ld = NewsLoader(item=NewsItem(), response=response)
    ld.add_value('url', response.url)
    ld.add_css('title', 'div.main .hd>h1::text')
    ld.add_css('srcurl', source_link + '::attr(href)')
    ld.add_css('src', source_link + '::text')
    ld.add_css('datetime', 'div.main .hd .tit-bar .article-time::text')
    ld.add_value('channel', u'qq')
    # Raw string: \s, \d and \w are regex classes, not string escapes.
    # NOTE(review): newer Scrapy releases expose the decoded body as
    # ``response.text`` - confirm the project's Scrapy version before
    # switching away from ``body_as_unicode()``.
    ld.add_value('comment_id', response.body_as_unicode(),
                 re=r'cmt_id\s?=\s?([\d\w]+)')
    yield ld.load_item()

    comment_id = ld.get_output_value('comment_id')
    if not comment_id:
        # Without an id the comment URL would be meaningless.
        return

    cmturl = '/%s/comment?commentid=0&reqnum=10' % comment_id
    yield scrapy.Request(self.cmturl + cmturl, self.parse_comment)
def parse_news(self, response):
    """Load a news item from ``response`` and request its comments.

    Fills a ``NewsLoader`` with the page url, title, source link, source
    name, datetime, the fixed channel ``u'qq'`` and the comment id captured
    from the page body by the ``cmt_id`` pattern. Yields the loaded item,
    then yields a ``scrapy.Request`` (``commentid=0``, ``reqnum=10``) whose
    callback is ``self.parse_comment``.

    The comment request is skipped when no comment id was extracted, so a
    URL built from an empty value is never requested.
    """
    source_link = 'div.main .hd .tit-bar .ll .color-a-1 a[href]'

    ld = NewsLoader(item=NewsItem(), response=response)
    ld.add_value('url', response.url)
    ld.add_css('title', 'div.main .hd>h1::text')
    ld.add_css('srcurl', source_link + '::attr(href)')
    ld.add_css('src', source_link + '::text')
    ld.add_css('datetime', 'div.main .hd .tit-bar .article-time::text')
    ld.add_value('channel', u'qq')
    # Raw string: \s, \d and \w are regex classes, not string escapes.
    # NOTE(review): newer Scrapy releases expose the decoded body as
    # ``response.text`` - confirm the project's Scrapy version before
    # switching away from ``body_as_unicode()``.
    ld.add_value('comment_id', response.body_as_unicode(),
                 re=r'cmt_id\s?=\s?([\d\w]+)')
    yield ld.load_item()

    comment_id = ld.get_output_value('comment_id')
    if not comment_id:
        # Without an id the comment URL would be meaningless.
        return

    cmturl = '/%s/comment?commentid=0&reqnum=10' % comment_id
    yield scrapy.Request(self.cmturl + cmturl, self.parse_comment)
def parse_news(self, response):
    """Load a news item from ``response`` and request its comments.

    Fills a ``NewsLoader`` with the page url, title, the fixed channel
    ``'sina'`` and the stripped text nodes of ``.time-source`` as datetime.
    The ``<meta name="comment">`` content is expected to look like
    ``<channel>:<comment_id>``; only when both parts are present does the
    callback yield the loaded item followed by a ``scrapy.Request`` for
    page 1 (page size 20) of the comments, handled by
    ``self.parse_comment``. Otherwise nothing is yielded.
    """
    ld = NewsLoader(NewsItem(), response)
    ld.add_value('url', response.url)
    ld.add_css('title', '.page-header h1::text')
    ld.add_value('channel', 'sina')

    # Named ``time_texts`` so the stdlib ``datetime`` module is not shadowed.
    time_texts = response.css('.time-source').xpath('text()').extract()
    # Calling ``.strip()`` on the value works for Python 2 unicode and
    # Python 3 str alike, unlike the Python 2-only ``unicode.strip``.
    ld.add_value('datetime', time_texts,
                 MapCompose(lambda text: text.strip()))

    comment_meta = ld.get_xpath('//meta[@name="comment"]/@content',
                                TakeFirst())
    if not comment_meta:
        return

    parts = comment_meta.split(':')
    if len(parts) != 2:
        # Unexpected format: bail out before ``channel`` could be read
        # without ever having been assigned.
        return

    channel, comment_id = parts
    if not (comment_id and channel):
        return

    yield ld.load_item()

    page = 1
    page_size = 20
    cmurl = '&channel=%s&newsid=%s&page=%s&page_size=%s' % (
        channel, comment_id, page, page_size)
    yield scrapy.Request(self.cmturl + cmurl, self.parse_comment)