def parse_item(self, response):
    item = MeiziItem()
    try:
        # First image on the detail page, plus the category name from the breadcrumb.
        item['img_urls'] = response.xpath(
            "//div[@class='main-image']/p/a/img/@src")[0].extract()
        item['img_name'] = response.xpath(
            "//div[@class='currentpath']/a[2]/text()")[0].extract()
        yield item
    except IndexError:
        # Page did not match the expected layout; skip it rather than crash.
        pass
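# Every snippet in this collection populates a MeiziItem, but its definition is
# not included. A minimal sketch, assuming the field names actually used across
# the snippets (each callback only fills the fields it references; this
# superset is an inference, not the original definition):
import scrapy

class MeiziItem(scrapy.Item):
    img_urls = scrapy.Field()
    img_name = scrapy.Field()
    image_url = scrapy.Field()
    image_urls = scrapy.Field()   # conventional field name read by ImagesPipeline
    images_urls = scrapy.Field()
    image = scrapy.Field()
    imageurl = scrapy.Field()
    title = scrapy.Field()
    titles = scrapy.Field()
    name = scrapy.Field()
    tags = scrapy.Field()
    url = scrapy.Field()
    link = scrapy.Field()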
import scrapy

def parse(self, response):
    image_url = response.xpath('//*[@id="content"]/a/img/@src').extract()
    if not image_url:
        return  # page did not match the expected layout
    item = MeiziItem()
    item['image_url'] = image_url[0]
    yield item
    # Pages 3..59 share the same structure; queue them with the same callback.
    # Scrapy's default dupefilter drops the repeats this re-queues on every page.
    for i in range(3, 60):
        new_url = self.base_url + str(i)
        yield scrapy.Request(new_url, callback=self.parse)
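# The callback above assumes a spider whose pages live at base_url + page
# number. A minimal sketch of those class attributes; the URL values are
# placeholders, not part of the original snippet:
import scrapy

class MeiziSpider(scrapy.Spider):
    name = 'meizi'
    base_url = 'https://example.com/page/'  # hypothetical page-URL prefix
    start_urls = [base_url + '2']           # hypothetical first page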
from scrapy.loader import ItemLoader
from itemloaders.processors import Identity  # scrapy.loader.processors in older Scrapy

def parse_item(self, response):
    item_loader = ItemLoader(item=MeiziItem(), response=response)
    # Post title
    item_loader.add_xpath('title', '//h2/a/text()')
    # Image links; Identity() keeps the full list of URLs
    item_loader.add_xpath('image', "//div[@id='picture']/p/img/@src", Identity())
    # Post link: response.url is a plain value, not an XPath expression,
    # so it must go through add_value(), not add_xpath()
    item_loader.add_value('link', response.url)
    return item_loader.load_item()
def meizi_link(self, response):
    # Album title plus every image in the post body; yield one item per image.
    title = response.xpath(
        '//div[@class="metaRight"]/h2/a/text()').extract()
    images = response.xpath(
        '//div[@class="postContent"]/div[@id="picture"]/p/img/@src'
    ).extract()
    for image in images:
        item = MeiziItem()
        item['titles'] = title
        item['image'] = image
        item['url'] = response.url
        yield item
import re

def ImageItem(self, response):
    image_item = MeiziItem()
    # When the page lacks a postContent container, the images sit directly
    # under div#picture; otherwise select within div.postContent.
    if "postContent" not in response.text:
        image_item["imageurl"] = response.css(
            "div#picture p img::attr(src)").extract()
    else:
        image_item["imageurl"] = response.css(
            "div.postContent p img::attr(src)").extract()
    # Pull the numeric post id out of the URL. The original stored the match
    # object itself in the item, which was almost certainly unintended.
    match_fav = re.match(r'.*?(\d+).*', response.url)
    image_item["url"] = match_fav.group(1) if match_fav else response.url
    return image_item
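# Quick sanity check of the id extraction above; the URL is a made-up example:
import re

assert re.match(r'.*?(\d+).*', 'https://example.com/a/5885').group(1) == '5885'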
from scrapy import Request

def parse_item(self, response):
    item = MeiziItem()
    # max_num is the page number of the last image in the set, read from the
    # second-to-last pagination link:
    # /html/body/div[2]/div[1]/div[4]/a[5]/span
    max_num = response.xpath(
        "descendant::div[@class='main']/div[@class='content']"
        "/div[@class='pagenavi']/a[last()-1]/span/text()"
    ).extract_first(default="0")
    item['name'] = response.xpath(
        "./*//div[@class='main']/div[1]/h2/text()").extract_first(default="N/A")
    item['url'] = response.url
    for num in range(1, int(max_num)):
        # page_url is the page holding one image of the set
        page_url = response.url + '/' + str(num)
        yield Request(page_url, callback=self.img_url)
    # Caveat kept from the original design: self.img_urls is spider-level
    # state filled in asynchronously by the img_url callback, so the list may
    # be incomplete (or mixed between sets) when this item is yielded.
    item['images_urls'] = self.img_urls
    yield item
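# The snippet above hands each image page to self.img_url and later reads
# self.img_urls, neither of which appears in this collection. A minimal sketch
# under that assumption (self.img_urls would need to be initialised, e.g. in
# __init__; the selector is borrowed from the first snippet):

def img_url(self, response):
    # hypothetical callback: collect the image source from a single page
    src = response.xpath(
        "//div[@class='main-image']/p/a/img/@src").extract_first()
    if src:
        self.img_urls.append(src)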
from scrapy.selector import Selector

def parse_image(self, response):
    """
    Copy the scraped data into the item.
    :param response:
    :return:
    """
    item = MeiziItem()
    sel = Selector(response)
    # Image tags, taken from the page's keywords meta element
    tags = sel.xpath(
        "//meta[@name='keywords']/@content").extract_first(default="").strip()
    # css equivalent: sel.css("meta[name='keywords']::attr(content)")
    # Image links
    image_urls = sel.xpath("//div[@id='picture']/p/img/@src").extract()
    # css equivalent: sel.css("div#picture p img::attr(src)")
    # Store tags and image_urls in the item
    item['tags'] = tags
    item['image_urls'] = image_urls
    yield item
from scrapy import Request

def parse_per_meiziji(self, response):
    # URL of the actual image on this page
    real_mzi_url = response.css(
        '.main .content .main-image p a img::attr(src)').extract()
    item = MeiziItem()
    item['image_urls'] = real_mzi_url
    item['name'] = response.css(
        '.main .content .currentpath .main-title::text').extract_first()
    yield item
    # The label of the last pagination link decides what happens next: if it
    # reads "下一页" ("next page") we are still inside the same set, so keep
    # following it; if it points to the next set, stop here instead of
    # crawling on.
    next_label = ''.join(
        response.css('.main .content .pagenavi a span::text').extract()
        [-1:])[:3]
    if next_label == '下一页':
        url_next_pic = ''.join(
            response.css(
                '.main .content .pagenavi a::attr(href)').extract()[-1:])
        yield Request(url_next_pic, self.parse_per_meiziji)
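# Several snippets above fill an image_urls-style field, but none shows how
# the files actually get downloaded. A minimal settings.py sketch, assuming
# Scrapy's built-in ImagesPipeline and the conventional image_urls/images
# field pair; the store path is a placeholder:

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './images'          # hypothetical download directory
IMAGES_URLS_FIELD = 'image_urls'   # field the pipeline reads URLs from
IMAGES_RESULT_FIELD = 'images'     # field the pipeline writes results to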