def parse(self, response):
    # Collect every image src inside the photo-item containers.
    hxs = Selector(response)
    imgs = hxs.xpath('//div[@class="photo-item"]//img/@src').extract()
    item = ImagesItem()
    item['image_urls'] = imgs
    return item

def parse(self, response):
    items = []  # collect items so the output serializes as a JSON list
    sel = Selector(response)
    sites = sel.xpath('//div[contains(@data-type,"photo")]')
    site1 = sites.xpath('div[starts-with(@class,"note-content")]/figure')
    site2 = site1.xpath('figcaption')
    site3 = sel.xpath('//div[contains(@data-type,"node")]')
    index1 = 0  # running index into the caption (desc) list
    index2 = 0  # running index into the album (node) list
    for index in range(len(sites)):
        item = ImagesItem()
        item['path'] = sel.xpath('//html').re(r'"src":"(.*?)"')[index]
        item['photo'] = sel.xpath('//html').re(r'"src":"http://(.*?)"')[index]
        item['photo_id'] = sel.xpath('//div[contains(@data-type,"photo")]/@id').extract()[index]
        item['datetaken'] = sel.xpath(
            '//div[contains(@data-type,"photo")]/div[@class="note-footer"]/time/@datetime'
        ).extract()[index]
        # Only photos that carry a <figcaption> have a description.
        if site1[index].xpath('figcaption'):
            item['desc'] = site2.xpath('p/text()').extract()[index1]
            index1 += 1
        # Stay on the current album node while the photo's data-group still
        # matches it; otherwise advance to the next album node.
        if sites[index].re(r'data-group="(.*?)"') == site3[index2].re(r'data-group="(.*?)"'):
            item['album_id'] = site3[index2].xpath('@id').extract()
        else:
            index2 += 1
            item['album_id'] = site3[index2].xpath('@id').extract()
        items.append(item)
    return items

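# A minimal sketch of the ImagesItem these parse methods populate. The field
# set is inferred from the snippets in this section; it is an assumption, not
# a definitive items.py.
import scrapy

class ImagesItem(scrapy.Item):
    image_urls = scrapy.Field()  # consumed by Scrapy's ImagesPipeline by default
    image_url = scrapy.Field()   # single-URL variant used by one spider below
    keyword = scrapy.Field()     # search keyword carried through response.meta
    path = scrapy.Field()
    photo = scrapy.Field()
    photo_id = scrapy.Field()
    datetaken = scrapy.Field()
    desc = scrapy.Field()
    album_id = scrapy.Field()
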
def parse(self, response):
    item = ImagesItem()
    # List of image links on this page (requires a module-level `import re`).
    img_urls_list = re.findall(r'"imgurl":"(.*?)"', response.text)
    item['image_urls'] = img_urls_list
    item['keyword'] = response.meta['keyword']
    yield item

def parse(self, response):
    item = ImagesItem()
    # All protocol-relative image links on this page; the dot before "jpg"
    # is escaped so it matches literally.
    img_urls = re.findall(r'"url800":"(//.*?\.jpg)"', response.text)
    image_urls = ["https:" + url for url in img_urls]
    item['image_urls'] = image_urls
    item['keyword'] = response.meta['keyword']
    yield item

def parse(self, response):
    data = json.loads(response.body)['data']
    for each in data:
        item = ImagesItem()
        item['image_url'] = each['vertical_src']
        yield item
    # Page through the API by bumping the offset query parameter, 20 at a time.
    if self.offset < 200:
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

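# The offset-paging parse above assumes the spider defines self.url and
# self.offset. A hedged sketch of that scaffolding; only the attribute names
# come from the snippet, the spider name and endpoint are hypothetical.
import scrapy

class OffsetImagesSpider(scrapy.Spider):
    name = 'offset_images'  # hypothetical
    offset = 0
    url = 'https://example.com/api/photos?offset='  # hypothetical endpoint
    start_urls = [url + str(offset)]  # first page; parse() above handles the rest
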
def parse_images(self, response):
    item = ImagesItem()
    url_list = response.xpath(".//li[@class='list']/a/img/@data-original").extract()
    item["image_urls"] = url_list
    yield item
    # Next page
    next_url = response.xpath(".//a[@class='downPage']/@href").extract()
    if next_url:
        next_url = response.urljoin(next_url[0])
        yield Request(next_url)

def parse(self, response):
    keyword = response.meta['keyword']
    item = ImagesItem()
    img_lists = re.findall(r'"ou":"(https?://.*?)","ow":', response.text)
    # The URLs arrive with JSON unicode escapes; decode '=' and '&' back.
    for index, img_url in enumerate(img_lists):
        if '\\u003d' in img_url:
            img_lists[index] = img_lists[index].replace('\\u003d', '=')
        if '\\u0026' in img_url:
            img_lists[index] = img_lists[index].replace('\\u0026', '&')
    item['image_urls'] = img_lists
    item['keyword'] = keyword
    yield item

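# An alternative to the manual \u003d / \u0026 replacements above: decode all
# JSON-style unicode escapes in one pass. A sketch, not part of the original
# spider; safe here because the matched URLs are ASCII.
def unescape_url(url):
    return url.encode('ascii').decode('unicode_escape')
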
def parse_item(self, response):
    cookies = response.meta['cookies']
    keyword = response.meta['keyword']
    pages = self.settings['PAGE']
    self.num += 1
    item = ImagesItem()
    pic_urls = response.xpath('//div[@class="c"]/a/img/@src').extract()
    # Swap the thumbnail size segment for the full-size variant.
    item['image_urls'] = [url.replace("wap180", "large") for url in pic_urls]
    item['keyword'] = keyword
    yield item
    if self.num < pages:
        next_page_url = response.xpath('//div[@id="pagelist"]/form/div/a/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(next_page_url, callback=self.parse_item,
                                 cookies=cookies,
                                 meta={'cookies': cookies, 'keyword': keyword})

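# parse_item above expects cookies and the keyword in response.meta, a page
# counter self.num, and a PAGE limit in settings. A minimal sketch of a
# start_requests that seeds those values; the cookie contents, search URL,
# and self.keywords list are hypothetical.
def start_requests(self):
    self.num = 0
    cookies = {'SUB': '...'}  # hypothetical session cookie
    for keyword in self.keywords:  # hypothetical list of search terms
        url = 'https://example.com/search/?keyword=' + keyword  # hypothetical
        yield scrapy.Request(url, callback=self.parse_item,
                             cookies=cookies,
                             meta={'cookies': cookies, 'keyword': keyword})
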
def parse(self, response):
    result_json = json.loads(response.body)
    results = result_json['data']
    imgs = []
    for result in results:
        if result:
            imgs.append(result['thumbURL'])
        else:
            print("error: cannot find thumbURL")
    item = ImagesItem()
    item['image_urls'] = imgs
    return item

def parse(self, response):
    item = ImagesItem()
    # List of image links on this page; the dot before "jpe?g" is escaped
    # so it matches literally.
    img_list = re.findall(r'"thumbURL":"(https?://.*?\.jpe?g)"', response.text)
    item['image_urls'] = img_list
    item['keyword'] = response.meta['keyword']
    yield item

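# Every spider here fills image_urls, the default input field of Scrapy's
# built-in ImagesPipeline. A minimal settings.py sketch to actually download
# the files; the store path is an assumption. The spider that yields
# image_url (singular) would need IMAGES_URLS_FIELD = 'image_url' and that
# field changed to hold a list.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './images'  # hypothetical download directory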