Esempio n. 1
0
 def parse_item(self, response):
     """Build the gallery item and schedule one request per image page.

     The last page number is read from the pagination bar. One ``Request``
     per page (``<url>/1`` .. ``<url>/<last page>``) is yielded with
     ``self.img_url`` as callback, followed by the gallery item itself.

     :param response: response of the gallery landing page
     :return: generator yielding the page requests, then the item
     """
     item = MzituScrapyItem()
     max_num = response.xpath(
         "descendant::div[@class='main']/div[@class='content']"
         "/div[@class='pagenavi']/a[last()-1]/span/text()"
     ).extract_first(default="N/A")
     item['name'] = response.xpath(
         "./*//div[@class='main']/div[1]/h2/text()"
     ).extract_first(default='N/A')

     try:
         last_page = int(max_num)
     except ValueError:
         # The "N/A" fallback (pagination bar not found) or any other
         # non-numeric text must not abort the whole callback.
         self.logger.warning(
             "No page count found on %s (got %r)", response.url, max_num)
         last_page = 0

     # range() excludes its stop value, so add 1 to also request the
     # last page.
     for num in range(1, last_page + 1):
         page_url = response.url + '/' + str(num)
         yield Request(page_url, callback=self.img_url)

     # NOTE(review): self.img_urls is spider-level state, presumably filled
     # by the img_url callbacks. Those run after the requests above are
     # downloaded, so the list may be incomplete (and shared between
     # galleries) when this item is yielded -- confirm against the pipeline.
     item['image_urls'] = self.img_urls
     yield item
Esempio n. 2
0
 def parse_item(self, response):
     """Build the gallery item and schedule one request per image page.

     Stores the page URL and title on the item, reads the last page number
     from the pagination bar and yields one ``Request`` per page
     (``<url>/1`` .. ``<url>/<last page>``) with ``self.get_image_url`` as
     callback, followed by the gallery item itself.

     :param response: response of the gallery landing page
     :return: generator yielding the page requests, then the item
     """
     item = MzituScrapyItem()
     item['url'] = response.url
     # extract_first() with a default avoids the IndexError that
     # ``.extract()[0]`` raises when the node is missing.
     item['name'] = response.xpath(
         '//h2[@class="main-title"]/text()'
     ).extract_first(default='N/A')
     max_num = response.xpath(
         '//div[@class="pagenavi"]/a[last()-1]/span/text()'
     ).extract_first(default='N/A')

     try:
         last_page = int(max_num)
     except ValueError:
         # Pagination bar missing or not numeric: schedule no page requests
         # instead of aborting the callback.
         self.logger.warning(
             "No page count found on %s (got %r)", response.url, max_num)
         last_page = 0

     # range() excludes its stop value, so add 1 to also request the
     # last page.
     for i in range(1, last_page + 1):
         page_url = response.url + "/" + str(i)
         yield Request(page_url, callback=self.get_image_url)

     # NOTE(review): self.img_urls is spider-level state, presumably filled
     # by the get_image_url callbacks, which run after this generator has
     # yielded the requests -- the list may be incomplete here; confirm.
     item['image_urls'] = self.img_urls
     yield item
Esempio n. 3
0
 def img_url(self, response):
     """Yield an item holding every image URL found on one image page.

     The item's ``name`` and ``url`` are copied from ``response.meta``; a
     missing key raises ``KeyError``. ``image_urls`` is the list of all
     ``src`` attributes of ``img`` elements below the ``main-image`` div.

     :param response: response of a single image page
     :return: generator yielding one ``MzituScrapyItem``
     """
     src_selector = response.xpath(
         "descendant::div[@class='main-image']/descendant::img/@src"
     )
     found_urls = src_selector.extract()

     item = MzituScrapyItem()
     request_meta = response.meta
     item['name'] = request_meta['name']
     item['url'] = request_meta['url']
     item['image_urls'] = found_urls
     yield item
Esempio n. 4
0
 def parse_item(self, response):
     """Build the gallery item and schedule one request per image page.

     Reads the gallery title and the last page number from the landing
     page, then yields one ``Request`` per page (``<url>/1`` ..
     ``<url>/<last page>``) with ``self.img_url`` as callback, followed by
     the gallery item itself.

     :param response: response returned by the downloader for the gallery
         landing page
     :return: generator yielding the page requests, then the item
     """
     item = MzituScrapyItem()
     # max_num is the position of the last image of the gallery.
     max_num = response.xpath(
         "descendant::div[@class='main']/div[@class='content']"
         "/div[@class='pagenavi']/a[last()-1]/span/text()"
     ).extract_first(default="N/A")
     item['name'] = response.xpath(
         "./*//div[@class='main']/div[1]/h2/text()"
     ).extract_first(default="N/A")
     item['url'] = response.url

     try:
         last_page = int(max_num)
     except ValueError:
         # The "N/A" fallback (pagination bar not found) or any other
         # non-numeric text must not abort the whole callback.
         self.logger.warning(
             "No page count found on %s (got %r)", response.url, max_num)
         last_page = 0

     # img_url reads response.meta['name'] and response.meta['url'], so
     # both values have to travel with every request.
     page_meta = {'name': item['name'], 'url': item['url']}
     # range() excludes its stop value, so add 1 to also request the
     # last image page.
     for num in range(1, last_page + 1):
         # page_url is the address of the page showing one image.
         page_url = response.url + '/' + str(num)
         yield Request(page_url, callback=self.img_url, meta=page_meta)

     # NOTE(review): img_url no longer appends to self.img_urls (that code
     # is commented out there), so this list is probably stale or empty --
     # confirm whether this aggregate item is still needed.
     item['image_urls'] = self.img_urls
     yield item
 def img_url(self, response):
     """Yield an item holding the first image URL found on one image page.

     The item's ``name`` and ``url`` are copied from ``response.meta``; a
     missing key raises ``KeyError``. ``image_urls`` receives the first
     ``src`` attribute of an ``img`` element below the ``main-image`` div,
     or ``None`` when there is no match.

     :param response: response of a single image page
     :return: generator yielding one ``MzituScrapyItem``
     """
     item = MzituScrapyItem()
     for field in ('name', 'url'):
         item[field] = response.meta[field]

     # NOTE(review): despite the plural field name this stores a single
     # string (or None), not a list -- confirm the consuming pipeline
     # expects that.
     src_selector = response.xpath(
         "descendant::div[@class='main-image']/descendant::img/@src"
     )
     item['image_urls'] = src_selector.extract_first()
     yield item
Esempio n. 6
0
    def parse_item(self, response):
        """Build the theme item and schedule one request per image page.

        Stores the theme title and URL on the item, reads the last page
        number from the pagination bar and yields one ``Request`` per page
        (``<url>/1`` .. ``<url>/<last page>``) with
        ``self.get_all_img_urls_of_page`` as callback, followed by the
        theme item itself.

        :param response: response of the theme landing page
        :return: generator yielding the page requests, then the
            ``MzituScrapyItem``
        """
        print(response.url)
        mzitu_scrapy_item = MzituScrapyItem()
        mzitu_scrapy_item['img_theme_name'] = response.xpath(
            '/html/body/div[2]/div[1]/h2/text()').extract_first(default="N/A")
        mzitu_scrapy_item['img_theme_url'] = response.url

        max_page_num = response.xpath(
            "descendant::div[@class='main']"
            "/div[@class='content']/div[@class='pagenavi']"
            "/a[last()-1]/span/text()").extract_first(default="N/A")
        try:
            last_page = int(max_page_num)
        except ValueError:
            # The "N/A" fallback (pagination bar not found) or any other
            # non-numeric text must not abort the whole callback.
            self.logger.warning(
                "No page count found on %s (got %r)",
                response.url, max_page_num)
            last_page = 0

        # range() excludes its stop value, so add 1 to also request the
        # last page of the theme.
        for num in range(1, last_page + 1):
            # img_page_url is the address of the page showing one image.
            img_page_url = response.url + '/' + str(num)
            yield Request(img_page_url, callback=self.get_all_img_urls_of_page)

        # NOTE(review): self.all_img_urls_of_theme is spider-level state,
        # presumably filled by get_all_img_urls_of_page. Those callbacks run
        # after the requests above are downloaded, so the list may be
        # incomplete (and shared between themes) here -- confirm.
        mzitu_scrapy_item['all_img_urls_of_theme'] = self.all_img_urls_of_theme
        yield mzitu_scrapy_item