Ejemplo n.º 1
0
    def parse(self, response):
        """Yield image items for the page, then follow its pagination link.

        For every ``<ul class="list-group">`` one ``DoutuItem`` is yielded
        whose ``image_urls`` holds the ``data-original`` URLs of the images
        below it, resolved against the response URL.  Each ``href`` matched by
        the pagination XPath is then requested with this same callback.
        """
        for group in response.xpath("//ul[@class='list-group']"):
            sources = group.xpath(".//a/img/@data-original").extract()
            yield DoutuItem(
                image_urls=[response.urljoin(src) for src in sources]
            )

        # Fetch the next page.  ``extract()`` returns a list (possibly empty),
        # so every href is joined individually; passing the whole list to
        # ``urljoin`` fails, and an ``is not None`` test on it is always true.
        hrefs = response.xpath(
            '//*[@id="pic-detail"]/div/div[2]/div[3]/ul/li[15]/a/@href'
        ).extract()
        if hrefs:
            for count, href in enumerate(hrefs, start=1):
                # urljoin handles both relative and absolute hrefs and keeps
                # the scheme/host of the page that was actually crawled.
                yield scrapy.Request(response.urljoin(href), callback=self.parse)
                print(count)
                print("*" * 20)
        else:
            # No pagination link matched on this page.
            print("---" * 15)
Ejemplo n.º 2
0
    def parse_link(self, response):
        """Yield one ``DoutuItem`` per ``<p>`` inside ``div#post_content``.

        ``title`` is taken from ``response.meta['info']`` (``None`` when the
        key is missing).  ``pic_url`` is the first ``img/@src`` found below
        the paragraph, or ``None`` when the paragraph contains no image.
        """
        info = response.meta.get('info')
        for paragraph in response.xpath("//div[@id='post_content']/p"):
            src = paragraph.xpath(".//img/@src").get()
            print(src)

            yield DoutuItem(pic_url=src, title=info)
Ejemplo n.º 3
0
 def parse_img(self, response):
     """Yield one ``DoutuItem`` per ``div.artile_des`` under ``div.pic-content``.

     Each item's ``image_urls`` is the list of every ``img/@src`` found
     below that div; the item is printed before it is yielded.
     """
     description_xpath = "//div[@class='pic-content']/div[@class='artile_des']"
     for container in response.xpath(description_xpath):
         entry = DoutuItem()
         sources = container.xpath(".//img/@src").extract()
         entry["image_urls"] = sources
         print(entry)
         yield entry
Ejemplo n.º 4
0
 def parse_item(self, response):
     """Build a ``DoutuItem`` from the picture page and return it.

     ``image_url`` receives every ``img/@src`` under ``div.pic-content`` and
     ``image_name`` every link text under ``div.pic-title`` (both as lists).

     Returns:
         The populated item.  Previously the item was only printed and then
         dropped, so the callback produced no output for the pipelines.
     """
     item = DoutuItem()
     # Fields consumed by the image pipeline.
     item['image_url'] = response.xpath(
         ".//div[@class='pic-content']//img/@src").extract()
     item['image_name'] = response.xpath(
         ".//div[@class='pic-title']//a/text()").extract()
     print(item)
     return item
Ejemplo n.º 5
0
    def parse_img(self, response):
        """Yield a ``DoutuItem`` for the first image inside ``div.swiper-slide``.

        ``image_urls`` is set to the first matching ``img/@src`` string.  When
        the page contains no such image nothing is yielded, instead of
        failing with ``IndexError`` on an empty result list.
        """
        first_src = response.xpath(
            '//div[@class="swiper-slide"]//img/@src').extract_first()
        if first_src is None:
            # No slide image on this page: nothing to emit.
            return

        doutu = DoutuItem()
        # NOTE(review): this stores a single string, not a list -- confirm
        # that whatever consumes ``image_urls`` expects that shape.
        doutu['image_urls'] = first_src

        yield doutu
Ejemplo n.º 6
0
    def parse(self, response):  # response: the source fetched for the requested URL
        """Yield one ``DoutuItem`` per picture link in the ``#pic-detail`` list.

        ``img_url`` is the link's ``img/@data-original`` and ``name`` the text
        of its ``<p>``; either is ``None`` when the node is missing.
        """
        anchors = response.xpath('//*[@id="pic-detail"]/div/div[1]/div[1]/ul/li/div/div/a')
        for anchor in anchors:
            entry = DoutuItem()  # instantiate the item container
            original = anchor.xpath('./img/@data-original').extract_first()
            entry['img_url'] = original
            caption = anchor.xpath('./p/text()').extract_first()
            entry['name'] = caption

            yield entry
Ejemplo n.º 7
0
 def parse(self, response):
     """Yield an item per picture link, then schedule the listing pages.

     For every ``<a>`` under ``div.page-content.text-center`` a ``DoutuItem``
     is yielded with ``name`` (``img/@alt``) and ``image_urls``
     (``img/@data-original``); either is ``None`` when the attribute is
     missing.  Afterwards requests for listing pages ``2 .. MAX_PAGES - 1``
     are yielded with this same callback.
     """
     for anchor in response.xpath('//div[@class="page-content text-center"]//a'):
         item = DoutuItem()
         item['name'] = anchor.xpath("./img/@alt").extract_first()
         item['image_urls'] = anchor.xpath(
             "./img/@data-original").extract_first()
         yield item

     # getint() converts string values (e.g. ``-s MAX_PAGES=5``) and returns
     # 0 when the setting is absent, so range() never receives None or str.
     max_pages = self.settings.getint('MAX_PAGES')
     # NOTE(review): range() stops at MAX_PAGES - 1 -- confirm whether the
     # last page is meant to be included.  These requests are re-issued by
     # every parsed page; presumably the duplicate filter drops the repeats.
     for page in range(2, max_pages):
         url = "https://www.doutula.com/photo/list/?page=%d" % page
         yield scrapy.Request(url, callback=self.parse)
Ejemplo n.º 8
0
    def parse(self, response):
        """Yield image items for the page, then follow its pagination link.

        For every ``<ul class="list-group">`` one ``DoutuItem`` is yielded
        whose ``image_urls`` holds the ``data-original`` URLs of the images
        below it, resolved against the response URL.  Each ``href`` matched by
        the pagination XPath is then requested with this same callback.
        """
        for group in response.xpath("//ul[@class='list-group']"):
            sources = group.xpath(".//a/img/@data-original").extract()
            yield DoutuItem(
                image_urls=[response.urljoin(src) for src in sources]
            )

        # extract() always returns a list (empty when nothing matches), so
        # iterating it directly covers the "no next page" case.
        hrefs = response.xpath(
            '//*[@id="pic-detail"]/div/div[2]/div[3]/ul/li[13]/a/@href'
        ).extract()
        print(hrefs)
        for href in hrefs:
            # urljoin resolves relative hrefs against the crawled page and
            # leaves absolute ones intact, unlike prefixing a fixed host.
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
Ejemplo n.º 9
0
    def parse(self, response):  # response: the source fetched for the requested URL
        """Yield one ``DoutuItem`` per ``<a class="col-xs-6 col-sm-3">`` link.

        ``img_url`` is the link's ``img/@data-original`` and ``name`` the text
        of its ``<p>``; either is ``None`` when the node is missing.
        """
        link_xpath = '//a[@class="col-xs-6 col-sm-3"]'
        for anchor in response.xpath(link_xpath):
            entry = DoutuItem()  # instantiate the item container
            original = anchor.xpath('./img/@data-original').extract_first()
            entry['img_url'] = original
            caption = anchor.xpath('./p/text()').extract_first()
            entry['name'] = caption

            yield entry
Ejemplo n.º 10
0
 def parse_item(self, response):
     """Log the page URL and return a ``DoutuItem`` with its ``file_urls``.

     ``file_urls`` is the list of every ``data-original`` attribute found
     below ``#pic-detail/div/div/div``.
     """
     self.logger.info('hi,this is an item page! %s', response.url)
     result = DoutuItem()
     originals_xpath = '//*[@id="pic-detail"]/div/div/div//@data-original'
     result['file_urls'] = response.xpath(originals_xpath).extract()
     return result