def parse_item(self, response):
    """Parse a gallery index page: read the page count, request every
    per-image page, and emit one item for the gallery.

    NOTE(review): this file defines ``parse_item`` several times; at
    class level the last definition wins — the duplicates should be
    reconciled.

    :param response: the gallery index page.
    """
    item = MzituScrapyItem()
    # The last-but-one pagenavi anchor carries the highest page number.
    max_num = response.xpath(
        "descendant::div[@class='main']/div[@class='content']"
        "/div[@class='pagenavi']/a[last()-1]/span/text()"
    ).extract_first(default="N/A")
    item['name'] = response.xpath(
        "./*//div[@class='main']/div[1]/h2/text()"
    ).extract_first(default='N/A')
    # BUG FIX: the default "N/A" is not numeric, so int(max_num) raised
    # ValueError whenever the pagenavi selector matched nothing.
    if not max_num.isdigit():
        return
    # BUG FIX: range's upper bound is exclusive — the original
    # range(1, max_num) silently skipped the last page.
    for num in range(1, int(max_num) + 1):
        # page_url is the page hosting one image of the gallery.
        page_url = response.url + '/' + str(num)
        yield Request(page_url, callback=self.img_url)
    # NOTE(review): self.img_urls is spider-level shared state filled by
    # the img_url callback; because Scrapy requests are asynchronous the
    # list is almost certainly incomplete (and mixed across galleries)
    # here — consider passing the item through Request.meta instead.
    item['image_urls'] = self.img_urls
    yield item
def parse_item(self, response):
    """Parse a gallery index page: record its title/url, request every
    per-image page, and emit the gallery item.

    :param response: the gallery index page.
    """
    item = MzituScrapyItem()
    item['url'] = response.url
    # BUG FIX: extract()[0] raised IndexError when the title node was
    # absent; extract_first with a default is safe.
    item['name'] = response.xpath(
        '//h2[@class="main-title"]/text()'
    ).extract_first(default='N/A')
    max_num = response.xpath(
        '//div[@class="pagenavi"]/a[last()-1]/span/text()'
    ).extract_first()
    # BUG FIX: guard the int() conversion instead of crashing with
    # IndexError/ValueError when the pagination block is missing.
    if max_num is None or not max_num.isdigit():
        return
    # BUG FIX: +1 keeps the final page — range's upper bound is
    # exclusive, so the original loop dropped page max_num.
    for i in range(1, int(max_num) + 1):
        page_url = response.url + "/" + str(i)
        yield Request(page_url, callback=self.get_image_url)
    # NOTE(review): self.img_urls is shared across all galleries and is
    # filled asynchronously by get_image_url; reading it here is racy —
    # pass the item via Request.meta instead.
    item['image_urls'] = self.img_urls
    yield item
def img_url(self, response):
    """Emit one item for a single picture page.

    The gallery name and URL travel in ``response.meta`` (set by the
    caller that scheduled this request); the image sources are scraped
    from the page itself.

    :param response: one picture page of a gallery.
    """
    meta = response.meta
    item = MzituScrapyItem()
    item['name'] = meta['name']
    item['url'] = meta['url']
    item['image_urls'] = response.xpath(
        "descendant::div[@class='main-image']/descendant::img/@src"
    ).extract()
    yield item
def parse_item(self, response):
    """Parse a gallery index page and emit its item after scheduling a
    request for every per-image page.

    :param response: the downloader's response for the gallery index.
    """
    item = MzituScrapyItem()
    # max_num is the number shown in the last-but-one pagenavi anchor,
    # i.e. the position of the gallery's final page.
    max_num = response.xpath(
        "descendant::div[@class='main']/div[@class='content']"
        "/div[@class='pagenavi']/a[last()-1]/span/text()"
    ).extract_first(default="N/A")
    item['name'] = response.xpath(
        "./*//div[@class='main']/div[1]/h2/text()"
    ).extract_first(default="N/A")
    item['url'] = response.url
    # BUG FIX: int("N/A") raised ValueError whenever the pagenavi
    # selector matched nothing — bail out instead of crashing.
    if not max_num.isdigit():
        return
    # BUG FIX: range's upper bound is exclusive; +1 so the last page is
    # actually requested.
    for num in range(1, int(max_num) + 1):
        # page_url is the page hosting one image of the gallery.
        page_url = response.url + '/' + str(num)
        yield Request(page_url, callback=self.img_url)
    # NOTE(review): self.img_urls is spider-wide shared state written by
    # the asynchronous img_url callback; it is read here before those
    # callbacks have run — pass the item through Request.meta instead.
    item['image_urls'] = self.img_urls
    yield item
def img_url(self, response):
    """Emit one item per picture page, carrying all image URLs found on
    that page plus the gallery name/url forwarded via ``response.meta``.

    :param response: one picture page of a gallery.
    """
    item = MzituScrapyItem()
    item['name'] = response.meta['name']
    item['url'] = response.meta['url']
    # BUG FIX: extract_first() returned a single str (or None), but the
    # images pipeline expects image_urls to be a *list* of URLs — a bare
    # string would be iterated character by character. extract() returns
    # the full list (possibly empty), matching the sibling img_url
    # variant in this file.
    item['image_urls'] = response.xpath(
        "descendant::div[@class='main-image']/descendant::img/@src"
    ).extract()
    yield item
def parse_item(self, response):
    """Collect every page of the current theme and emit its item.

    :param response: the theme's index page.
    :return: yields Requests for each page, then one MzituScrapyItem.
    """
    # Use the spider logger instead of a bare print so output respects
    # Scrapy's LOG_LEVEL configuration.
    self.logger.info(response.url)
    mzitu_scrapy_item = MzituScrapyItem()
    mzitu_scrapy_item['img_theme_name'] = response.xpath(
        '/html/body/div[2]/div[1]/h2/text()').extract_first(default="N/A")
    mzitu_scrapy_item['img_theme_url'] = response.url
    max_page_num = response.xpath(
        "descendant::div[@class='main']"
        "/div[@class='content']/div[@class='pagenavi']"
        "/a[last()-1]/span/text()").extract_first(default="N/A")
    # BUG FIX: int("N/A") raised ValueError when the pagination block
    # was not found — skip the theme instead of crashing.
    if not max_page_num.isdigit():
        return
    # BUG FIX: +1 so the final page is included (range's upper bound is
    # exclusive; the original loop dropped page max_page_num).
    for num in range(1, int(max_page_num) + 1):
        # img_page_url is the page hosting one image of the theme.
        img_page_url = response.url + '/' + str(num)
        yield Request(img_page_url, callback=self.get_all_img_urls_of_page)
    # NOTE(review): self.all_img_urls_of_theme is spider-wide shared
    # state filled asynchronously by get_all_img_urls_of_page; reading it
    # here is racy and mixes themes — pass the item via Request.meta.
    mzitu_scrapy_item['all_img_urls_of_theme'] = self.all_img_urls_of_theme
    yield mzitu_scrapy_item