Example #1
0
 def parse_item(self, response):
     item = MeiziItem()
     try:
         item['img_urls'] = response.xpath(
             "//div[@class='main-image']/p/a/img/@src")[0].extract()
         item['img_name'] = response.xpath(
             "//div[@class='currentpath']/a[2]/text()")[0].extract()
         yield item
     except Exception:
         pass
Example #2
0
	def parse(self, response):
		image_url = response.xpath('//*[@id="content"]/a/img/@src').extract()
		print('image_url',image_url[0])
		print("*"*40)
		item = MeiziItem()
		item['image_url'] =image_url[0]
		yield item
		for i in range(3,60):
			new_url=self.base_url+str(i)
			yield scrapy.Request(new_url,callback=self.parse)
Example #3
0
    def parse_item(self, response):
        item_loader = ItemLoader(item=MeiziItem(), response=response)

        # 标题
        item_loader.add_xpath('title', '//h2/a/text()')
        # 图片链接
        item_loader.add_xpath('image', "//div[@id='picture']/p/img/@src",
                              Identity())
        # 帖子链接
        item_loader.add_xpath('link', response.url)

        return item_loader.load_item()
Example #4
0
    def meizi_link(self, response):
        title = response.xpath(
            '//div[@class="metaRight"]/h2/a/text()').extract()
        images = response.xpath(
            '//div[@class="postContent"]/div[@id="picture"]/p/img/@src'
        ).extract()

        for image in images:
            item = MeiziItem()
            item['titles'] = title
            item['image'] = image
            item['url'] = response.url
            yield item
Example #5
0
    def ImageItem(self, response):
        image_item = MeiziItem()
        if not "postContent" in response.text:
            image_item["imageurl"] = response.css(
                "div#picture p img::attr(src)").extract()
        else:
            image_item["imageurl"] = response.css(
                "div.postContent p img::attr(src)").extract()

        match_fav = re.match('.*?(\d+).*', response.url)
        image_item["url"] = match_fav

        return image_item
Example #6
0
 def parse_item(self,response):
     item = MeiziItem()
     # max_num 为页面最后一张图片的位置
     # /html/body/div[2]/div[1]/div[4]/a[5]/span
     max_num = response.xpath("descendant::div[@class='main']/div[@class='content']/div[@class='pagenavi']/a[last()-1]/span/text()").extract_first(default="N/A")
     item['name'] = response.xpath("./*//div[@class='main']/div[1]/h2/text()").extract_first(default="N/A")
     item['url'] = response.url
     for num in range(1, int(max_num)):
         # page_url 为每张图片所在的页面地址
         page_url = response.url + '/' + str(num)
         yield Request(page_url, callback=self.img_url)
     item['images_urls'] = self.img_urls
     print("*****************************************************************")
     print(item)
     yield item
Example #7
0
    def parse_image(self, response):
        """
        将数据传入item
        :param response: 
        :return: 
        """
        item = MeiziItem()
        sel = Selector(response)

        tags = sel.xpath("//meta[@name='keywords']/@content").extract_first().strip()  # 图片标签
        # tags = sel.css("meta[name='keywords']::attr(content)").extract().strip()
        image_urls = sel.xpath("//div[@id='picture']/p/img/@src").extract()  # 图片链接
        # image_urls = sel.css("div#picture p img::attr(src)").extract()

        # 将tags,image_urls存入item中
        item['tags'] = tags
        item['image_urls'] = image_urls
        yield item
Example #8
0
    def parse_per_meiziji(self, response):

        #真正妹子图的url
        real_mzi_url = response.css(
            '.main .content .main-image p a img::attr(src)').extract()
        item = MeiziItem()
        item['image_urls'] = real_mzi_url
        item['name'] = response.css(
            '.main .content .currentpath .main-title::text').extract_first()
        yield item

        #最后一个链接的提示符,如果是下一页,那么还是这个主题,所以继续添加,
        # 如果是下一组,那么跳出当前处理过程,不去继续爬取
        next = ''.join(
            response.css('.main .content .pagenavi a span::text').extract()
            [-1:])[:3]

        if '下一页' == next:
            url_next_pic = ''.join(
                response.css(
                    '.main .content .pagenavi a::attr(href)').extract()[-1:])
            yield Request(url_next_pic, self.parse_per_meiziji)