Example #1
 def get_image_url(self,response):
     item = MzituItem()
     item['name'] = response.meta['name']
     image_urls = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract()
     for image_url in image_urls:
         item['image_url'] = image_url
         yield item
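Note on Example #1: every pass of the loop mutates and re-yields the same MzituItem instance, so all yielded references point at one object and a later write can leak into an earlier yield once a pipeline holds the item. A minimal rework, assuming MzituItem declares name and image_url fields, creates a fresh item per URL:

    def get_image_url(self, response):
        name = response.meta['name']
        image_urls = response.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract()
        for image_url in image_urls:
            item = MzituItem()   # fresh item each time, not one shared instance
            item['name'] = name
            item['image_url'] = image_url
            yield item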
Example #2
    def parse(self, response):
        print("================================================")
        for _li in response.xpath("//*[@id=\"pins\"]/li"):
            item = MzituItem()

            _link = _li.xpath("a/@href").extract()[0]
            _thumb = _li.xpath("a/img/@data-original").extract()[0]
            _title = _li.xpath("span/a/text()").extract()[0]
            _time = _li.xpath("span/text()").extract()[0]

            item['title'] = re.sub(r'[?\\*|“<>:/]', '', _title)
            item['thumb'] = _thumb
            item['time'] = _time
            item['link'] = _link

            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n')
            print(_link)
            yield item

            yield scrapy.Request(_link,
                                 meta={'item': item},
                                 callback=self.parseContent,
                                 headers=self.headers)

        print("================================================")
Example #3
    def parsePage(self, response):
        try:
            img_srcs = []
            img_data = MzituItem()
            title = response.xpath(
                '//h2[@class="main-title"]/text()').extract_first()
            img_src = response.xpath(
                '//div[@class="main-image"]/p/a/img/@src').extract_first()
            total = response.xpath(
                '//div[contains(@class,"pagenavi")]/a[last()-1]/span/text()'
            ).extract_first()
            str1 = img_src.rsplit('0', 1)[0]
            for i in range(1, int(total) + 1):
                if i < 10:
                    i = '0' + str(i) + '.jpg'
                    next_img = str1 + i
                else:
                    next_img = str1 + str(i) + '.jpg'
                img_srcs.append(next_img)
            img_data['title'] = title
            img_data['img_list'] = img_srcs
            yield img_data

        except Exception as e:
            print(e)
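Example #3 derives the URL stem by splitting on the literal character '0' (str1 = img_src.rsplit('0', 1)[0]), which breaks whenever the path contains another zero. A sketch of a sturdier variant, assuming the same '.../NN.jpg' naming scheme, splits on the path separator and pads with str.zfill:

    base, first = img_src.rsplit('/', 1)    # '.../01.jpg' -> ('...', '01.jpg')
    ext = first.rsplit('.', 1)[1]            # 'jpg'
    img_srcs = ['{}/{}.{}'.format(base, str(i).zfill(2), ext)
                for i in range(1, int(total) + 1)]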
Example #4
 def pic_download_next(self, response):
     item = MzituItem()
     pic_name = scrapy.Selector(response).xpath(
         '//div[@class="main-image"]/p/a/img/@alt').extract()[0]
     # Same as above: map illegal filename characters to placeholder letters
     trantab = str.maketrans(r'\/:*?"<>|', 'abcdefghi')
     item['pic_name'] = pic_name.translate(trantab)
     item['pic_url'] = scrapy.Selector(response).xpath(
         '//div[@class="main-image"]/p/a/img/@src').extract()[0]
     # Hand the item off to the pipeline
     yield item
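The str.maketrans table in Example #4 maps each character that Windows forbids in filenames to a placeholder letter instead of deleting it. A quick standalone check of that behaviour:

    trantab = str.maketrans(r'\/:*?"<>|', 'abcdefghi')
    print('a/b:c?.jpg'.translate(trantab))   # -> 'abbcce.jpg'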
Example #5
 def parse_item(self, response):
     header = {
         "User-agent":
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
     }
     item = MzituItem()
     item["name"] = response.css(".main-title::text").extract()
     item["url"] = response.url
     item['image_urls'] = response.css(
         ".main-image img::attr(src)").extract()
     time.sleep(random.randint(3, 6))
     yield Request(response.url, headers=header)
     yield item
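time.sleep() in Example #5 stalls Scrapy's single-threaded Twisted reactor, pausing the whole crawl rather than just this page; re-requesting response.url is also dropped by the dupefilter unless dont_filter=True is passed. The idiomatic throttle lives in settings.py (the values here are illustrative):

    DOWNLOAD_DELAY = 3               # base delay between requests, in seconds
    RANDOMIZE_DOWNLOAD_DELAY = True  # actual wait is 0.5x..1.5x of the base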
Example #6
 def tuji_parse(self,response):
     item=MzituItem()
     ref=response.meta['ref']
     tuji_url=response.meta['tuji_url']
     tuji_page_num=response.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract_first()
     for i in range(int(tuji_page_num)):
         if i+1==1:
             url=tuji_url
         else:
             url=tuji_url+'/%s'%(i+1)
         item['img_referer']=url
         # print('Album page %s - url:' % i, url)
         yield scrapy.Request(url=url,headers={'referer':ref},callback=self.img_parse,meta={'item':item})
Example #7
 def parse_xijie(self, response):
     sel = Selector(response)
     item = MzituItem()
     rawdate1 = sel.xpath('//div[@class="month_Year"]/text()').extract()[0]
     rawdate2 = sel.xpath('//div[@class="day"]/text()').extract()[0]
     date = rawdate1[-4:] + '-' + rawdate1[:2] + '-' + rawdate2
     title = sel.xpath('//div[@class="metaRight"]/h2/a/text()').extract()[0]
     for_pic = sel.xpath('//div[@id="picture"]//img')
     for yige in for_pic:
         item['date'] = date
         item['title'] = title
         item['image_urls'] = [yige.xpath('./@src').extract()[0]]
         yield item
Example #8
    def get_img_url(self, response):
        '''
        Find the image download links
        in the response for page_url.
        '''
        item = MzituItem()
        item['name'] = response.meta['name']

        # Find the image download URLs; note that one page may hold two images
        pic = response.xpath('//div[@class="main-image"]//img/@src').extract()

        for url in pic:
            item['img_urls'] = url
            yield item
Example #9
 def parse_item(self, response):
     item = MzituItem()
     # max_num is the page number of the last image in the set
     max_num = response.xpath(
         '//div[@class="pagenavi"]/a[last()-1]/span/text()').extract_first(
             default="N/A")
     item['name'] = response.xpath(
         '//div[@class="content"]/h2/text()').extract_first(default="N/A")
     item['url'] = response.url
     for num in range(1, int(max_num)):
         # page_url is the address of the page each image sits on
         page_url = response.url + '/' + str(num)
         yield scrapy.Request(page_url, callback=self.img_url)
     item['image_urls'] = self.img_urls
     yield item
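Example #9 yields its item before any of the per-page requests have run, so self.img_urls is shared spider state that is still empty or stale at yield time; int(max_num) also raises ValueError when the "N/A" default fires, and range(1, int(max_num)) skips the last page. A sketch of the usual restructuring, assuming the same MzituItem fields, yields one item per page from the leaf callback instead:

    def parse_item(self, response):
        name = response.xpath(
            '//div[@class="content"]/h2/text()').extract_first(default="N/A")
        max_num = response.xpath(
            '//div[@class="pagenavi"]/a[last()-1]/span/text()').extract_first()
        for num in range(1, int(max_num or 1) + 1):
            yield scrapy.Request(response.url + '/' + str(num),
                                 meta={'name': name},
                                 callback=self.img_url)

    def img_url(self, response):
        item = MzituItem()
        item['name'] = response.meta['name']
        item['url'] = response.url
        item['image_urls'] = response.xpath(
            '//div[@class="main-image"]//img/@src').extract()
        yield item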
Example #10
    def parse_item(self, response):

        # sel = Selector(response)
        #
        # name = sel.xpath("//div[@class='main-image']/p/a/img/@alt").extract()[0]
        # print(name)

        l = ItemLoader(item=MzituItem(), response=response)
        l.add_xpath('image_urls', "//div[@class='main-image']/p/a/img/@src",
                    Identity())
        l.add_xpath('name', "//div[@class='main-image']/p/a/img/@alt",
                    Identity())
        # l.add_value('name', name)

        return l.load_item()
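Identity() in Example #10 is the pass-through processor from scrapy.loader.processors (itemloaders.processors in current releases); it keeps each extracted list intact instead of collapsing it to a single value. For comparison, roughly the same result without an ItemLoader:

    item = MzituItem()
    item['image_urls'] = response.xpath(
        "//div[@class='main-image']/p/a/img/@src").extract()
    item['name'] = response.xpath(
        "//div[@class='main-image']/p/a/img/@alt").extract()
    return item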
Example #11
    def img_url(self, response):
        item = MzituItem()
        item['name'] = self.page_name
        item['url'] = self.page_url
        item['image_urls'] = self.image_urls

        img_urls = response.xpath(
            "descendant::div[@class='main-image']/descendant::img/@src"
        ).extract()
        for img_url in img_urls:
            item['image_urls'].append(img_url)
        yield item
Example #12
    def parse(self, response):
        item = MzituItem()
        #item['title'] = response.xpath('//h2[@class="main-title"]/text()')[0] .extract()
        item['title'] = response.xpath(
            '//h2[@class="main-title"]/text()')[0].extract().split('(')[0]
        item['img'] = response.xpath(
            '//div[@class="main-image"]/p/a/img/@src')[0].extract()
        item['name'] = response.xpath('//div[@class="main-image"]/p/a/img/@src'
                                      )[0].extract().split('/')[-1]
        yield item

        next_url = response.xpath(
            '//div[@class="pagenavi"]/a/@href')[-1].extract()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)
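Note that the indexed [-1].extract() in Example #12 raises IndexError when no pagenavi link exists, so the is not None check never fires. A hedged variant that also resolves relative hrefs via response.follow() (available since Scrapy 1.4):

    next_urls = response.xpath('//div[@class="pagenavi"]/a/@href').extract()
    if next_urls:   # empty list when there are no nav links
        yield response.follow(next_urls[-1], callback=self.parse)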
Example #13
    def get_parse(self, response):
        # print(response.request.headers['User-Agent'])
        item = MzituItem()
        item['title'] = response.xpath(
            '//div[@class="main-image"]//img/@alt').extract()[0]
        item['imgurl'] = response.xpath(
            '//div[@class="main-image"]//img/@src').extract()
        yield item

        # print(item)

        next_page = response.xpath(
            "//a/span[contains(text(),'下一页»')]/../@href")  # ".." selects the current node's parent
        if next_page:
            url = next_page[0].extract()
            yield scrapy.Request(url, callback=self.get_parse)
Example #14
 def second_handler(self, response):
     # Second-level page
     item = MzituItem()
     # Get the page count so every page link can be visited
     offset = int(
         response.xpath('//div[@class="pagenavi"]/a/span/text()')
         [4].extract())
     # Build the page links and visit each one
     for i in [
             response.url + "/{}".format(str(x))
             for x in range(1, offset + 1)
     ]:
         item['Referer'] = i
         # Pass the item along via meta and visit the third-level page
         yield scrapy.Request(url=i,
                              meta={'meta_1': item},
                              callback=self.parse_ponse)
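Example #14 threads one shared item through meta into every page request, so the parse_ponse callbacks all mutate the same object as responses arrive. A common workaround, sketched with the names from the snippet above, gives each request its own copy:

    import copy

    for i in [response.url + "/{}".format(x) for x in range(1, offset + 1)]:
        page_item = copy.deepcopy(item)   # each request carries its own item
        page_item['Referer'] = i
        yield scrapy.Request(url=i, meta={'meta_1': page_item},
                             callback=self.parse_ponse)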
Example #15
 def parse_get_image(self,response):
     #print('11111111111111111111111111111111'+response.url)
     item_detail = response.meta["item"]
     current = int(response.meta["current"])
     imageurl = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract()[0]
     #print('22222222222222222222222222222222' + imageurl)
     item = MzituItem()
     item["mzi_name"] = item_detail["mzi_name"]
     item["mzi_link"] = item_detail["mzi_link"]
     item["mzi_time"] = item_detail["mzi_time"]
     item["mzi_view"] = item_detail["mzi_view"]
     item["mzi_image"] = imageurl
     if current == 1:
         item['mzi_index'] = 0
     else:
         item['mzi_index'] = current - 1
     yield item
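The index branch at the end of Example #15 collapses to one expression, since both arms evaluate to current - 1:

    item['mzi_index'] = current - 1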
Example #16
 def parser_item(self, response):
     item = MzituItem()
     # Get each album's name. extract_first(default="N/A") returns the first
     # element of the XPath result, or "N/A" when nothing matches.
     # Here I use string slicing instead.
     name = response.selector.xpath(
         '/html/body/div[2]/div[1]/div[1]/text()[3]').extract()
     item['name'] = name[0][3:-1]
     item['url'] = response.url
     all_page = response.selector.xpath(
         '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract()
     for page in range(1, int(all_page[0]) + 1):
         # Get the page address of each photo
         url = response.url + '/' + str(page)
         yield scrapy.Request(url=url, callback=self.img_url)
     # Once the loop above has run, img_url below has filled the image URL list, so store it in the item
     item['image_url'] = self.img_urls
     yield item
Example #17
 def detail_parse(self, response):
     item = MzituItem()
     date_str = response.css(
         'body > div.main > div.content > div.main-meta > span:nth-child(2)'
     ).extract_first()
     item['month'] = re.search(r' (\d{4}-\d{2})-\d{2} ', date_str).group(1)
     item['date'] = re.search(r' (\d{4}-\d{2}-\d{2}) ', date_str).group(1)
     item['title'] = response.css(
         'body > div.main > div.content > h2::text').extract_first()
     item['url'] = response.url
     item['type'] = response.css(
         'body > div.main > div.content > div.main-meta > span:nth-child(1) > a::text'
     ).extract_first()
     item['tags'] = ' '.join(
         response.css(
             'body > div.main > div.content > div.main-tags > a::text').
         extract())
     return item
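extract_first() on the bare span selector in detail_parse returns the element's outer HTML, which is why the date regexes anchor on the surrounding spaces. Selecting the text node directly is less brittle (a sketch; the selector path is assumed unchanged):

    date_str = response.css(
        'div.main-meta > span:nth-child(2)::text').extract_first(default='')
    m = re.search(r'(\d{4}-\d{2}-\d{2})', date_str)
    if m:
        item['date'] = m.group(1)
        item['month'] = m.group(1)[:7]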
Example #18
    def get_img_url(self, response):
        '''
        Find the image download links
        in the response for page_url.
        :param response:
        :return:
        '''

        item = MzituItem()
        item['name'] = response.meta['name']
        item['base_urls'] = response.meta['base_urls']

        # Find the image download URLs
        pic = response.xpath('//div[@class="main-image"]//img/@src').extract()

        for url in pic:
            item['img_urls'] = url
            yield item
Example #19
    def parse_next(self,response):

        node_list = response.xpath('//ul[@id="pins"]/li')
        items = []
        for node in node_list:
            item = MzituItem()
            mzi_name = node.xpath('./span[1]/a/text()').extract()[0]
            mzi_link = node.xpath('./span[1]/a/@href').extract()[0]
            mzi_time = node.xpath('./span[2]/text()').extract()[0]
            mzi_view = node.xpath('./span[3]/text()').extract()[0]

            item["mzi_name"] = mzi_name
            item["mzi_link"] = mzi_link
            item["mzi_time"] = mzi_time
            item["mzi_view"] = mzi_view

            items.append(item)

        for item in items:
            yield scrapy.Request(item["mzi_link"], meta={"item": item}, callback=self.parse_detail)
Example #20
    def parse_detail(self,response):

        item_detail = response.meta["item"]
        imageurl = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract()[0]
        imagelist = response.xpath('//div[@class="pagenavi"]/a/span/text()').extract()[-2]
        url_src = ''
        file_name_type = imageurl.split('/')
        file_name = file_name_type[len(file_name_type) - 1]
        print('%s---------%s' % (file_name, len(file_name)))
        # str.find returns -1 when absent, matching the > -1 check; str.index would raise ValueError
        if len(file_name) <= 9 and file_name.find('01.jpg') > -1:
            baseURl = imageurl.split('01.jpg')[0]
            image_src = ''
            item = MzituItem()
            for i in range(1, int(imagelist) + 1):
                if i == 1:
                    item["mzi_index"] = 0
                else:
                    item['mzi_index'] = i - 1
                if i < 10:
                    image_src = baseURl + '0' + str(i) + '.jpg'
                else:
                    image_src = baseURl + str(i) + '.jpg'
                item["mzi_name"] = item_detail["mzi_name"]
                item["mzi_link"] = item_detail["mzi_link"]
                item["mzi_time"] = item_detail["mzi_time"]
                item["mzi_view"] = item_detail["mzi_view"]
                item["mzi_image"] = image_src

                yield item
        else:
            for i in range(1, int(imagelist) + 1):
                if i == 1:
                    url_src = item_detail["mzi_link"]
                else:
                    url_src = '%s/%s' % (item_detail["mzi_link"], str(i))

                # print(url_src)
                yield scrapy.Request(url_src, meta={"item": item_detail, "current": str(i)},
                                     callback=self.parse_get_image)
Example #21
 def pic_download(self, response):
     item = MzituItem()
     pic_name = scrapy.Selector(response).xpath(
         '//div[@class="main-image"]/p/a/img/@alt').extract()[0]
     trantab = str.maketrans(r'\/:*?"<>|', 'abcdefghi')
     # Replace illegal characters again: earlier this was for creating the
     # directory, here it writes the image's local category path into the item
     item['pic_name'] = pic_name.translate(trantab)
     # Grab the image URL on the current page
     item['pic_url'] = scrapy.Selector(response).xpath(
         '//div[@class="main-image"]/p/a/img/@src').extract()[0]
     # Hand the item off to the pipeline
     yield item
     # Find the number of pages in this set
     url_num = scrapy.Selector(response).xpath(
         '//div[@class="pagenavi"]/a[last()-1]/span/text()').extract()[0]
     # Iterate starting from the second page
     for i in range(2, int(url_num) + 1):
         link = '{}/{}'.format(response.url, i)
         if link not in final_page_link:
             # Record it
             final_page_link.append(link)
             # Hand off to the follow-up handler; this must not call back into this function (see the readme for why)
             yield scrapy.Request(link, callback=self.pic_download_next)
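The module-level final_page_link list in Example #21 duplicates work the framework already does: Scrapy's scheduler drops previously seen request URLs through its dupefilter by default. Under that assumption the loop reduces to:

    for i in range(2, int(url_num) + 1):
        yield scrapy.Request('{}/{}'.format(response.url, i),
                             callback=self.pic_download_next)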
Example #22
 def parse_item(self, response):
     sel = Selector(response)
     item = MzituItem()
     item['classification'] = sel.xpath(
         '/html/body/div[2]/div[1]/div[1]/a[2]/text()').extract_first(
             default="N/A")
     item['title'] = sel.xpath('/html/body/div[2]/div[1]/div[1]/text()[3]'
                               ).extract_first(default="N/A")
     item['name'] = sel.xpath(
         '/html/body/div[2]/div[1]/div[4]/span[1]/text()').extract_first(
             default="N/A")
     max_num = sel.xpath(
         './*//div[@class="pagenavi"]/a[last()-1]/span/text()'
     ).extract_first(default="N/A")
     print(max_num)
     item['url'] = response.url
     for num in range(1, int(max_num) + 1):
         n_url = response.url + '/' + str(num)
         print('pic-' + n_url)
         yield scrapy.Request(n_url, callback=self.imgurl)
     item['image_urls'] = self.urls
     print('ij')
     yield item