Exemple #1
0
 def parse_news(self, response):
     """Extract a news item from an article page and schedule its comments.

     Yields the loaded ``NewsItem`` first. If a ``cmt_id`` assignment is
     found in the page body, also yields a ``scrapy.Request`` for the first
     batch of comments, handled by ``self.parse_comment``.

     Args:
         response: the article page response; its URL, CSS-selectable
             markup and decoded body are read.

     Yields:
         The loaded item, then (only when a comment id was extracted) the
         comment-list request built from ``self.cmturl``.
     """
     loader = NewsLoader(item=NewsItem(), response=response)
     loader.add_value('url', response.url)
     loader.add_css('title', 'div.main .hd>h1::text')
     loader.add_css('srcurl', 'div.main .hd .tit-bar .ll .color-a-1 a[href]::attr(href)')
     loader.add_css('src', 'div.main .hd .tit-bar .ll .color-a-1 a[href]::text')
     loader.add_css('datetime', 'div.main .hd .tit-bar .article-time::text')
     loader.add_value('channel', u'qq')
     # Raw string: \s, \d and \w are regex escapes, not string escapes.
     loader.add_value('comment_id', response.body_as_unicode(),
                      re=r'cmt_id\s?=\s?([\d\w]+)')
     yield loader.load_item()

     comment_id = loader.get_output_value('comment_id')
     if not comment_id:
         # No cmt_id in the page: do not request a bogus '/None/comment' URL.
         return
     # NOTE(review): assumes NewsLoader's output processor yields a single
     # string for 'comment_id' (e.g. TakeFirst) - confirm against NewsLoader.
     comment_path = '/%s/comment?commentid=0&reqnum=10' % comment_id
     yield scrapy.Request(self.cmturl + comment_path, self.parse_comment)
Exemple #2
0
 def parse_news(self, response):
     """Load a ``NewsItem`` from an article page, then request its comments.

     The item is always yielded. The follow-up comment request is yielded
     only when a ``cmt_id`` value could be extracted from the page body.

     Args:
         response: the article page response; its URL, CSS-selectable
             markup and decoded body are read.

     Yields:
         The loaded item, followed (when a comment id exists) by a
         ``scrapy.Request`` whose callback is ``self.parse_comment``.
     """
     ld = NewsLoader(item=NewsItem(), response=response)
     ld.add_value('url', response.url)
     ld.add_css('title', 'div.main .hd>h1::text')
     ld.add_css('srcurl',
                'div.main .hd .tit-bar .ll .color-a-1 a[href]::attr(href)')
     ld.add_css('src', 'div.main .hd .tit-bar .ll .color-a-1 a[href]::text')
     ld.add_css('datetime', 'div.main .hd .tit-bar .article-time::text')
     ld.add_value('channel', u'qq')
     # The pattern must be a raw string so that \s, \d and \w reach the
     # regex engine instead of being treated as (invalid) string escapes.
     ld.add_value('comment_id',
                  response.body_as_unicode(),
                  re=r'cmt_id\s?=\s?([\d\w]+)')
     yield ld.load_item()

     comment_id = ld.get_output_value('comment_id')
     if comment_id:
         # Only follow up when an id was found; otherwise the URL would be
         # built from a missing value ('/None/comment?...').
         # NOTE(review): assumes the loader returns a single string here
         # (e.g. a TakeFirst output processor) - confirm against NewsLoader.
         cmturl = '/%s/comment?commentid=0&reqnum=10' % comment_id
         yield scrapy.Request(self.cmturl + cmturl, self.parse_comment)
Exemple #3
0
 def parse_news(self,response):
     """Extract a news item and schedule the first page of its comments.

     The page's ``<meta name="comment">`` content is expected to look like
     ``"<channel>:<newsid>"``. The item and the comment request are yielded
     only when that value is present and splits into exactly two non-empty
     parts; otherwise nothing is yielded.

     Args:
         response: the article page response.

     Yields:
         The loaded ``NewsItem``, then a ``scrapy.Request`` for the comment
         list (callback ``self.parse_comment``).
     """
     # Keyword arguments: the second positional parameter of Scrapy's
     # ItemLoader is ``selector``, not ``response``.
     loader = NewsLoader(item=NewsItem(), response=response)
     loader.add_value('url', response.url)
     loader.add_css('title', '.page-header h1::text')
     loader.add_value('channel', 'sina')
     timestamps = response.css('.time-source').xpath('text()').extract()
     # Strip via the value's own method so this works for both Python 2
     # ``unicode`` and Python 3 ``str`` values.
     loader.add_value('datetime', timestamps,
                      MapCompose(lambda text: text.strip()))

     # Pre-bind both names so a missing or malformed meta value is skipped
     # instead of raising UnboundLocalError on ``channel``.
     channel = comment_id = None
     comment_meta = loader.get_xpath('//meta[@name="comment"]/@content',
                                     TakeFirst())
     if comment_meta:
         parts = comment_meta.split(':')
         if len(parts) == 2:
             channel, comment_id = parts
     if not (comment_id and channel):
         return

     yield loader.load_item()
     page = 1
     page_size = 20
     cmurl = '&channel=%s&newsid=%s&page=%s&page_size=%s' % (
         channel, comment_id, page, page_size)
     yield scrapy.Request(self.cmturl + cmurl, self.parse_comment)