Example #1
 def parse(self, response):
     # Log the crawl date and the page being parsed.
     logger.info('[%s] %s', datetime.date.today(), response.url)
     hxs = scrapy.Selector(response)
     # One <a> per category in the site's navigation menu.
     sites = hxs.xpath('//div[@class="menu-wrapper"]/ul/li[@class="m-i "]/a[@class="i-link"]')
     for s in sites:
         tag = s.xpath('em/text()').extract_first()
         url = s.xpath('@href').extract_first()
         # response.urljoin() replaces the long-deprecated urljoin_rfc/get_base_url pair.
         request = Request(url=response.urljoin(url), callback=self.tagParse, meta={'tag': tag})
         yield request
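These snippets are method bodies lifted from Scrapy spider classes, so the surrounding imports never appear. A minimal sketch of the module-level setup they seem to assume (Python 3 spellings; under Python 2, urlparse and parse_qs live in the urlparse module):

 import datetime
 import json
 import logging
 from urllib.parse import urlparse, parse_qs

 import scrapy
 from scrapy import Request

 logger = logging.getLogger(__name__)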
Example #2
 def videoParse(self, response):
     logger.info('[%s] %s', datetime.date.today(), response.url)
     hxs = scrapy.Selector(response)
     # The embedded player URL is exposed through an itemprop meta tag.
     video = hxs.xpath('//meta[@itemprop="embedURL"]/@content').extract_first()
     item = BilibiliItem()
     # Everything scraped by the earlier callbacks arrives via request meta.
     item['title'] = response.meta['title']
     item['desc'] = response.meta['desc']
     item['tag'] = response.meta['tag']
     item['picurl'] = response.meta['image']
     item['videourl'] = video
     yield item
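BilibiliItem is defined elsewhere in the project. Judging purely from the fields assigned above, its definition would be roughly the following sketch (not the project's actual code):

 class BilibiliItem(scrapy.Item):
     title = scrapy.Field()
     desc = scrapy.Field()
     tag = scrapy.Field()
     picurl = scrapy.Field()
     videourl = scrapy.Field()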
Example #3
 def tagParse(self, response):
     logger.info('[%s] %s', datetime.date.today(), response.url)
     hxs = scrapy.Selector(response)
     # One <li> per video on the tag's listing page.
     sites = hxs.xpath('//div[@class="b-body"]/ul[@class="vidbox v-list sub"]/li')
     tag = response.meta['tag']
     for s in sites:
         title = s.xpath('div/a/@title').extract_first()
         desc = s.xpath('div/@txt').extract_first()
         image = s.xpath('div/a/div/img/@src').extract_first()
         url = s.xpath('div/a/@href').extract_first()
         # Forward the fields scraped so far to videoParse through meta.
         request = Request(url=response.urljoin(url), callback=self.videoParse,
                           meta={'tag': tag, 'title': title, 'desc': desc, 'image': image})
         yield request
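On Scrapy 1.7 and later, cb_kwargs is a cleaner channel than meta for this kind of hand-off: the values arrive in the callback as ordinary keyword arguments instead of riding on the request's meta dict. A hypothetical sketch of the same chain, not code from the source project:

 # Assumes parse() issued its Request with cb_kwargs={'tag': tag}.
 def tagParse(self, response, tag):
     for s in response.xpath('//div[@class="b-body"]/ul[@class="vidbox v-list sub"]/li'):
         yield Request(
             response.urljoin(s.xpath('div/a/@href').extract_first()),
             callback=self.videoParse,
             cb_kwargs={
                 'tag': tag,
                 'title': s.xpath('div/a/@title').extract_first(),
                 'desc': s.xpath('div/@txt').extract_first(),
                 'image': s.xpath('div/a/div/img/@src').extract_first(),
             },
         )

 def videoParse(self, response, tag, title, desc, image):
     ...  # build the item exactly as in Example #2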
Example #4
 def parse(self, response):
     logger.info('[%s] %s', datetime.date.today(), response.url)
     # The feed endpoint answers with JSON; response.text decodes the body.
     jsoncon = json.loads(response.text)
     if jsoncon['message'] == 'success':
         conlist = jsoncon['data']
         # parse_qs copes with an empty query string; hand-splitting on '&' and '='
         # (as the original did) raises IndexError when the URL has no query.
         tag = parse_qs(urlparse(response.url).query).get('category', ['__all__'])[0]
         for con in conlist:
             if con['middle_mode']:
                 item = WwwToutiaoComItem()
                 # The original called .encode('utf-8') on title and abstract, a
                 # Python 2 habit that would store bytes rather than text on Python 3.
                 item['title'] = con['title']
                 item['tag'] = tag
                 item['desc'] = con['abstract']
                 item['image'] = con['middle_image']
                 item['url'] = con['url']
                 item['createtime'] = con['create_time']
                 yield item
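As with BilibiliItem, WwwToutiaoComItem comes from the project's items module. Inferred from the assignments above, a sketch of it:

 class WwwToutiaoComItem(scrapy.Item):
     title = scrapy.Field()
     tag = scrapy.Field()
     desc = scrapy.Field()
     image = scrapy.Field()
     url = scrapy.Field()
     createtime = scrapy.Field()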
Example #5
 def parse(self, response):
     logger.info('[%s] %s', datetime.date.today(), response.url)
     # HtmlXPathSelector and .select() are the old spellings of
     # scrapy.Selector and .xpath().
     hxs = scrapy.Selector(response)
     # Carousel containers on the front page, each mapped to a channel label.
     sections = [
         {'id': 'SI_Scroll_2_Cont', 'cl': 'photograph_gallery'},  # photo features
         {'id': 'SI_Scroll_3_Cont', 'cl': 'gossip'},              # celebrity gossip
         {'id': 'SI_Scroll_4_Cont', 'cl': 'style'},               # outfits and styling
         {'id': 'SI_Scroll_5_Cont', 'cl': 'body'},                # fitness and slimming
         {'id': 'SI_Scroll_6_Cont', 'cl': 'beauty'},              # makeup and hair
     ]
     for d in sections:
         sites = hxs.xpath('//div[@id="%s"]/div/div/a/@href' % d['id']).extract()
         for site in sites:
             request = Request(site, callback=self.deepParse, meta={'cl': d['cl']})
             yield request
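deepParse itself is not shown in the source. Whatever it extracts, it would pick the channel label back up the same way Examples #2 and #3 read their meta entries; a hypothetical stub:

 def deepParse(self, response):
     cl = response.meta['cl']  # channel label attached in parse()
     logger.info('[%s] %s (%s)', datetime.date.today(), response.url, cl)
     # ...extract article fields and yield an item here...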