Example #1
0
 def parse(self, response):
     """Collect every photo image URL on the page into one ImagesItem."""
     print(response)  # debug trace of the incoming response
     selector = Selector(response)
     src_list = selector.xpath('//div[@class="photo-item"]//img/@src').extract()
     result = ImagesItem()
     result['image_urls'] = src_list
     return result
Example #2
0
 def parse(self, response):
     """Build one ImagesItem per photo <div> on the page.

     For each photo the item carries the image path/url, the photo div's DOM
     id, the capture timestamp, an optional caption, and the id of the album
     ("node") div the photo belongs to.  Returns the list of items.
     """
     items = []  # to generate json-style output
     sel = Selector(response)
     sites = sel.xpath('//div[contains(@data-type,"photo")]')
     site1 = sites.xpath('div[starts-with(@class,"note-content")]/figure')
     site2 = site1.xpath('figcaption')
     site3 = sel.xpath('//div[contains(@data-type,"node")]')
     index1 = 0  # running index into the captions list (to clean the desc)
     index2 = 0  # running index into the album/node divs (to add album_id)
     for index in range(len(sites)):
         item = ImagesItem()
         # NOTE(review): these re() calls scan the whole document and rely on
         # the match order lining up with the photo-div order — fragile;
         # confirm against the page markup.
         item['path'] = sel.xpath('//html').re(r'"src":"(.*?)"')[index]
         item['photo'] = sel.xpath('//html').re(
             r'"src":"http://(.*?)"')[index]
         item['photo_id'] = sel.xpath(
             '//div[contains(@data-type,"photo")]/@id').extract()[index]
         item['datetaken'] = sel.xpath(
             '//div[contains(@data-type,"photo")]/div[@class="note-footer"]/time/@datetime'
         ).extract()[index]
         # Generate the desc item only for photos that have a caption.
         # (Idiom fix: was `if bool(...) == True:` — truthiness suffices.)
         if site1[index].xpath('figcaption').extract():
             item['desc'] = site2.xpath('p/text()').extract()[index1]
             index1 += 1
         # Generate the album_id item: stay on the current album div while
         # the photo's data-group still matches it, else advance to the next.
         # NOTE(review): the left side captures only the first 3 chars of the
         # group while the right captures the whole value — verify intended.
         if sites[index].re(r'data-group="(...).*?"') == site3[index2].re(
                 r'data-group="(.*?)"'):
             item['album_id'] = site3[index2].xpath('@id').extract()
         else:
             index2 += 1
             # NOTE(review): no bounds check — IndexError if index2 runs past
             # the last node div; confirm against the page structure.
             item['album_id'] = site3[index2].xpath('@id').extract()
         items.append(item)
     return items
Example #3
0
 def parse(self, response):
     """Yield an item holding this page's image URLs and the search keyword."""
     # List of image links found on this page.
     found = re.findall(r'"imgurl":"(.*?)"', response.text)
     result = ImagesItem()
     result['image_urls'] = found
     result['keyword'] = response.meta['keyword']
     yield result
Example #4
0
 def parse(self, response):
     """Yield an item with this page's https-prefixed image URLs and keyword."""
     item = ImagesItem()
     # All image links on this page.  Bug fix: the dot before "jpg" is now
     # escaped (\.jpg) and the pattern is a raw string — the original
     # unescaped '.' matched any character (e.g. "photoxjpg").
     img_urls = re.findall(r'"url800":"(//.*?\.jpg)"',
                           response.text)
     # The page embeds protocol-relative URLs; make them absolute.
     image_urls = ["https:" + url for url in img_urls]
     item['image_urls'] = image_urls
     item['keyword'] = response.meta['keyword']
     yield item
Example #5
0
    def parse(self, response):
        """Yield one item per entry in the JSON payload, then page onward.

        Pagination stops once ``self.offset`` reaches 200.
        """
        data = json.loads(response.body)['data']
        for each in data:
            item = ImagesItem()
            item['image_url'] = each['vertical_src']
            yield item

        # Bug fix: the follow-up request used to be issued unconditionally,
        # so once the offset cap was reached the spider kept re-requesting
        # the same URL forever.  Only paginate while under the cap.
        if self.offset < 200:
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #6
0
    def parse_images(self, response):
        """Yield the lazy-loaded image URLs on this listing page, then
        follow the "next page" link when one exists."""
        img_item = ImagesItem()
        img_item["image_urls"] = response.xpath(
            ".//li[@class='list']/a/img/@data-original").extract()
        yield img_item

        # Next page
        links = response.xpath(".//a[@class='downPage']/@href").extract()
        if links:
            yield Request(response.urljoin(links[0]))
Example #7
0
    def parse(self, response):
        """Yield the decoded image URLs scraped from the raw page text."""
        search_term = response.meta['keyword']
        raw_urls = re.findall('"ou":"(http.?://.*?)","ow":', response.text)
        # The page embeds '=' and '&' as \u003d / \u0026 escapes; decode
        # them (replace() is a no-op when the escape is absent).
        decoded = [u.replace('\\u003d', '=').replace('\\u0026', '&')
                   for u in raw_urls]

        result = ImagesItem()
        result['image_urls'] = decoded
        result['keyword'] = search_term
        yield result
Example #8
0
    def parse_item(self, response):
        """Yield the large-size image URLs on this page, then request the
        next page until the configured page limit is reached."""
        cookies = response.meta['cookies']
        keyword = response.meta['keyword']
        page_limit = self.settings['PAGE']
        self.num += 1

        item = ImagesItem()
        thumbs = response.xpath('//div[@class="c"]/a/img/@src').extract()
        # Swap the thumbnail size segment for the large one.
        item['image_urls'] = [u.replace("wap180", "large") for u in thumbs]
        item['keyword'] = keyword
        yield item

        if self.num < page_limit:
            href = response.xpath('//div[@id="pagelist"]/form/div/a/@href').extract_first()
            if href:
                yield scrapy.Request(
                    href,
                    callback=self.parse_item,
                    cookies=cookies,
                    meta={'cookies': cookies, "keyword": keyword},
                )
Example #9
0
    def parse(self, response):
        """Return an item collecting every thumbnail URL in the JSON body."""
        payload = json.loads(response.body)
        thumb_urls = []
        for entry in payload['data']:
            if not entry:
                print("error!!!can not find imgurl!!!")
                continue
            thumb_urls.append(entry['thumbURL'])

        item = ImagesItem()
        item['image_urls'] = thumb_urls
        return item
Example #10
0
 def parse(self, response):
     """Yield an item with this page's thumbnail URLs and its search keyword."""
     item = ImagesItem()
     # Image links on this page.  Bug fix: the dot before the extension is
     # now escaped (\.jpe?g) so it matches a literal '.' — the original
     # bare '.' matched any character.
     img_list = re.findall(r'"thumbURL":"(https?://.*?\.jpe?g)"', response.text)
     item['image_urls'] = img_list
     item['keyword'] = response.meta['keyword']
     yield item