def get_image_url(self, response):
    item = MzituItem()
    item['name'] = response.meta['name']
    image_urls = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract()
    for image_url in image_urls:
        item['image_url'] = image_url
        yield item

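# Every callback in this file assumes a MzituItem declared in the project's
# items.py; the exact fields vary per spider. A minimal sketch covering just
# the two fields used above (field names come from that code, the class body
# itself is an assumption):
import scrapy

class MzituItem(scrapy.Item):
    name = scrapy.Field()
    image_url = scrapy.Field()
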
def parse(self, response):
    print("================================================")
    for _li in response.xpath('//*[@id="pins"]/li'):
        item = MzituItem()
        _link = _li.xpath("a/@href").extract()[0]
        _thumb = _li.xpath("a/img/@data-original").extract()[0]
        _title = _li.xpath("span/a/text()").extract()[0]
        _time = _li.xpath("span/text()").extract()[0]
        item['title'] = re.sub(r'[?\\*|“<>:/]', '', _title)
        item['thumb'] = _thumb
        item['time'] = _time
        item['link'] = _link
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n')
        print(_link)
        yield item
        yield scrapy.Request(_link, meta={'item': item},
                             callback=self.parseContent, headers=self.headers)
    print("================================================")

def parsePage(self, response):
    try:
        img_srcs = []
        img_data = MzituItem()
        title = response.xpath('//h2[@class="main-title"]/text()').extract_first()
        img_src = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract_first()
        total = response.xpath(
            '//div[contains(@class,"pagenavi")]/a[last()-1]/span/text()').extract_first()
        # strip everything from the last '0' onward (assumes the first image
        # ends in a zero-padded counter like ".../01.jpg")
        str1 = img_src.rsplit('0', 1)[0]
        for i in range(1, int(total) + 1):
            # image names are zero-padded to two digits
            if i < 10:
                next_img = str1 + '0' + str(i) + '.jpg'
            else:
                next_img = str1 + str(i) + '.jpg'
            img_srcs.append(next_img)
        img_data['title'] = title
        img_data['img_list'] = img_srcs
        yield img_data
    except Exception as e:
        print(e)

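# The manual zero-padding above can be written in one line; a sketch, assuming
# the two-digit "01.jpg" naming holds for every gallery:
#
#   next_img = '{}{:02d}.jpg'.format(str1, i)
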
def pic_download_next(self, response):
    item = MzituItem()
    pic_name = scrapy.Selector(response).xpath(
        '//div[@class="main-image"]/p/a/img/@alt').extract()[0]
    # same as pic_download: map characters that are illegal in file names
    trantab = str.maketrans(r'\/:*?"<>|', 'abcdefghi')
    item['pic_name'] = pic_name.translate(trantab)
    item['pic_url'] = scrapy.Selector(response).xpath(
        '//div[@class="main-image"]/p/a/img/@src').extract()[0]
    # hand the item off to the pipeline
    yield item

def parse_item(self, response):
    header = {
        "User-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
    }
    item = MzituItem()
    item["name"] = response.css(".main-title::text").extract()
    item["url"] = response.url
    item['image_urls'] = response.css(".main-image img::attr(src)").extract()
    # NOTE: time.sleep() blocks Scrapy's event loop; DOWNLOAD_DELAY is the
    # non-blocking way to throttle (see the settings sketch below)
    time.sleep(random.randint(3, 6))
    # NOTE: re-requesting response.url will be dropped by the dupefilter
    # unless dont_filter=True is passed
    yield Request(response.url, headers=header)
    yield item

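# A non-blocking alternative to the sleep above, via settings.py (these are
# standard Scrapy settings; the values here are just an example):
#
#   DOWNLOAD_DELAY = 3
#   RANDOMIZE_DOWNLOAD_DELAY = True  # actual delay is 0.5x-1.5x DOWNLOAD_DELAY
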
def tuji_parse(self, response):
    item = MzituItem()
    ref = response.meta['ref']
    tuji_url = response.meta['tuji_url']
    tuji_page_num = response.xpath(
        '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract_first()
    for i in range(int(tuji_page_num)):
        if i + 1 == 1:
            url = tuji_url  # page 1 is the gallery URL itself
        else:
            url = tuji_url + '/%s' % (i + 1)
        item['img_referer'] = url
        # print('gallery page %s - url --' % i, url)
        yield scrapy.Request(url=url, headers={'referer': ref},
                             callback=self.img_parse, meta={'item': item})

def parse_xijie(self, response):
    sel = Selector(response)
    item = MzituItem()
    rawdate1 = sel.xpath('//div[@class="month_Year"]/text()').extract()[0]
    rawdate2 = sel.xpath('//div[@class="day"]/text()').extract()[0]
    date = rawdate1[-4:] + '-' + rawdate1[:2] + '-' + rawdate2
    title = sel.xpath('//div[@class="metaRight"]/h2/a/text()').extract()[0]
    for_pic = sel.xpath('//div[@id="picture"]//img')
    for yige in for_pic:
        item['date'] = date
        item['title'] = title
        item['image_urls'] = [yige.xpath('./@src').extract()[0]]
        yield item

def get_img_url(self, response):
    '''Pull the image download links out of the page_url response.'''
    item = MzituItem()
    item['name'] = response.meta['name']
    # find the download address; note a page may carry two images
    pic = response.xpath('//div[@class="main-image"]//img/@src').extract()
    for url in pic:
        item['img_urls'] = url
        yield item

def parse_item(self, response):
    item = MzituItem()
    # max_num is the number of the last image page in the gallery
    max_num = response.xpath(
        '//div[@class="pagenavi"]/a[last()-1]/span/text()').extract_first(default="N/A")
    item['name'] = response.xpath(
        '//div[@class="content"]/h2/text()').extract_first(default="N/A")
    item['url'] = response.url
    for num in range(1, int(max_num)):
        # page_url is the page holding a single image
        page_url = response.url + '/' + str(num)
        yield scrapy.Request(page_url, callback=self.img_url)
    # NOTE: self.img_urls is shared spider state filled in by the img_url
    # callback; scheduling is asynchronous, so those requests may not have
    # completed when the item is yielded here
    item['image_urls'] = self.img_urls
    yield item

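# The img_url callback referenced above is not included in this snippet
# (a related helper of the same name appears further down); a minimal sketch
# of what it presumably does -- only the method name comes from the code
# above, the body is an assumption:
def img_url(self, response):
    # append each image src to the spider-level list that parse_item
    # later copies into item['image_urls'] (self.img_urls = [] in __init__)
    src = response.xpath('//div[@class="main-image"]//img/@src').extract_first()
    if src:
        self.img_urls.append(src)
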
def parse_item(self, response):
    l = ItemLoader(item=MzituItem(), response=response)
    # Identity() keeps the extracted list as-is instead of joining it
    l.add_xpath('image_urls', "//div[@class='main-image']/p/a/img/@src", Identity())
    l.add_xpath('name', "//div[@class='main-image']/p/a/img/@alt", Identity())
    return l.load_item()

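# Imports this loader variant relies on (paths for Scrapy 1.x; from Scrapy 2.x
# onward the processors live in the separate itemloaders package):
#
#   from scrapy.loader import ItemLoader
#   from scrapy.loader.processors import Identity
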
def img_url(self, response):
    item = MzituItem()
    item['name'] = self.page_name
    item['url'] = self.page_url
    # NOTE: this assigns the shared spider-level list and then mutates it,
    # so every yielded item references the same growing list
    item['image_urls'] = self.image_urls
    img_urls = response.xpath(
        "descendant::div[@class='main-image']/descendant::img/@src").extract()
    for img_url in img_urls:
        item['image_urls'].append(img_url)
    yield item

def parse(self, response):
    item = MzituItem()
    # drop the "(NN)" page counter from the title
    item['title'] = response.xpath(
        '//h2[@class="main-title"]/text()')[0].extract().split('(')[0]
    item['img'] = response.xpath(
        '//div[@class="main-image"]/p/a/img/@src')[0].extract()
    item['name'] = response.xpath(
        '//div[@class="main-image"]/p/a/img/@src')[0].extract().split('/')[-1]
    yield item
    # follow the last pagenavi link; test the selector list before indexing,
    # since [-1].extract() either raises or returns a string, never None
    next_page = response.xpath('//div[@class="pagenavi"]/a/@href')
    if next_page:
        yield scrapy.Request(next_page[-1].extract(), callback=self.parse)

def get_parse(self, response):
    item = MzituItem()
    item['title'] = response.xpath(
        '//div[@class="main-image"]//img/@alt').extract()[0]
    item['imgurl'] = response.xpath(
        '//div[@class="main-image"]//img/@src').extract()
    yield item
    # '下一页' is the site's "next page" label; ".." steps up to the parent
    # of the matched span to reach the <a> and its href
    next_page = response.xpath("//a/span[contains(text(),'下一页»')]/../@href")
    if next_page:
        url = next_page[0].extract()
        yield scrapy.Request(url, callback=self.get_parse)

def second_handler(self, response):
    # second-level page
    item = MzituItem()
    # read the gallery's page count from the pagenavi links
    offset = int(response.xpath(
        '//div[@class="pagenavi"]/a/span/text()')[4].extract())
    # build every page URL of the gallery and visit each one
    for i in [response.url + "/{}".format(str(x)) for x in range(1, offset + 1)]:
        item['Referer'] = i
        # pass the item along via meta to the third-level page
        yield scrapy.Request(url=i, meta={'meta_1': item}, callback=self.parse_ponse)

def parse_get_image(self, response):
    item_detail = response.meta["item"]
    current = int(response.meta["current"])
    imageurl = response.xpath(
        '//div[@class="main-image"]/p/a/img/@src').extract()[0]
    item = MzituItem()
    item["mzi_name"] = item_detail["mzi_name"]
    item["mzi_link"] = item_detail["mzi_link"]
    item["mzi_time"] = item_detail["mzi_time"]
    item["mzi_view"] = item_detail["mzi_view"]
    item["mzi_image"] = imageurl
    # both branches of the original reduce to the zero-based index current - 1
    item['mzi_index'] = current - 1
    yield item

def parser_item(self, response):
    item = MzituItem()
    # Get the gallery name. extract_first(default="N/A") would return the
    # first xpath match, or "N/A" when nothing matches; here string slicing
    # is used instead.
    name = response.selector.xpath(
        '/html/body/div[2]/div[1]/div[1]/text()[3]').extract()
    item['name'] = name[0][3:-1]
    item['url'] = response.url
    all_page = response.selector.xpath(
        '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract()
    for page in range(1, int(all_page[0]) + 1):
        # each page URL holds one photo
        url = response.url + '/' + str(page)
        yield scrapy.Request(url=url, callback=self.img_url)
    # Once the requests above have run, img_url has appended every photo URL
    # to self.img_urls, so the list goes into the item here. NOTE: scheduling
    # is asynchronous, so the list may still be incomplete at this point.
    item['image_url'] = self.img_urls
    yield item

def detail_parse(self, response):
    item = MzituItem()
    date_str = response.css(
        'body > div.main > div.content > div.main-meta > span:nth-child(2)'
    ).extract_first()
    item['month'] = re.search(r' (\d{4}-\d{2})-\d{2} ', date_str).group(1)
    item['date'] = re.search(r' (\d{4}-\d{2}-\d{2}) ', date_str).group(1)
    item['title'] = response.css(
        'body > div.main > div.content > h2::text').extract_first()
    item['url'] = response.url
    item['type'] = response.css(
        'body > div.main > div.content > div.main-meta > span:nth-child(1) > a::text'
    ).extract_first()
    item['tags'] = ' '.join(response.css(
        'body > div.main > div.content > div.main-tags > a::text').extract())
    return item

def get_img_url(self, response):
    '''
    Pull the image download links out of the page_url response.
    :param response:
    :return:
    '''
    item = MzituItem()
    item['name'] = response.meta['name']
    item['base_urls'] = response.meta['base_urls']
    # find the image download address
    pic = response.xpath('//div[@class="main-image"]//img/@src').extract()
    for url in pic:
        item['img_urls'] = url
        yield item

def parse_next(self, response):
    node_list = response.xpath('//ul[@id="pins"]/li')
    items = []
    for node in node_list:
        item = MzituItem()
        item["mzi_name"] = node.xpath('./span[1]/a/text()').extract()[0]
        item["mzi_link"] = node.xpath('./span[1]/a/@href').extract()[0]
        item["mzi_time"] = node.xpath('./span[2]/text()').extract()[0]
        item["mzi_view"] = node.xpath('./span[3]/text()').extract()[0]
        items.append(item)
    for item in items:
        yield scrapy.Request(item["mzi_link"], meta={"item": item},
                             callback=self.parse_detail)

def parse_detail(self, response):
    item_detail = response.meta["item"]
    imageurl = response.xpath(
        '//div[@class="main-image"]/p/a/img/@src').extract()[0]
    imagelist = response.xpath(
        '//div[@class="pagenavi"]/a/span/text()').extract()[-2]
    file_name = imageurl.split('/')[-1]
    print('%s---------%s' % (file_name, len(file_name)))
    # If the first image is named like "01.jpg", the rest of the gallery can
    # be built by counting. find() replaces the original index(), which would
    # raise ValueError instead of returning -1 when the pattern is missing.
    if len(file_name) <= 9 and file_name.find('01.jpg') != -1:
        baseURl = imageurl.split('01.jpg')[0]
        item = MzituItem()
        for i in range(1, int(imagelist) + 1):
            item['mzi_index'] = i - 1  # zero-based
            if i < 10:
                image_src = baseURl + '0' + str(i) + '.jpg'
            else:
                image_src = baseURl + str(i) + '.jpg'
            item["mzi_name"] = item_detail["mzi_name"]
            item["mzi_link"] = item_detail["mzi_link"]
            item["mzi_time"] = item_detail["mzi_time"]
            item["mzi_view"] = item_detail["mzi_view"]
            item["mzi_image"] = image_src
            yield item
    else:
        # fall back to visiting each page and scraping the image URL from it
        for i in range(1, int(imagelist) + 1):
            if i == 1:
                url_src = item_detail["mzi_link"]
            else:
                url_src = '%s/%s' % (item_detail["mzi_link"], str(i))
            yield scrapy.Request(url_src, meta={"item": item_detail, "current": str(i)},
                                 callback=self.parse_get_image)

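# Worked example of the branch test above: for an imageurl ending in
# ".../01.jpg", file_name is "01.jpg" (length 6 <= 9, find() returns 0), so
# the counting branch is taken; a name like "20180101a.jpg" (length 13) falls
# through to the page-by-page requests.
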
def pic_download(self, response):
    item = MzituItem()
    pic_name = scrapy.Selector(response).xpath(
        '//div[@class="main-image"]/p/a/img/@alt').extract()[0]
    # As before, replace characters that are illegal in paths; earlier this
    # was done to create the directory, here it builds the local storage path
    # that goes into the item
    trantab = str.maketrans(r'\/:*?"<>|', 'abcdefghi')
    item['pic_name'] = pic_name.translate(trantab)
    item['pic_url'] = scrapy.Selector(response).xpath(
        '//div[@class="main-image"]/p/a/img/@src').extract()[0]
    # hand the item off to the pipeline
    yield item
    # find the page count for this gallery
    url_num = scrapy.Selector(response).xpath(
        '//div[@class="pagenavi"]/a[last()-1]/span/text()').extract()[0]
    # walk the remaining pages, starting from page 2
    for i in range(2, int(url_num) + 1):
        link = '{}/{}'.format(response.url, i)
        if link not in final_page_link:
            # remember the link so it is only scheduled once
            final_page_link.append(link)
            # hand off to pic_download_next; this function cannot be its own
            # callback here, see the readme for why
            yield scrapy.Request(link, callback=self.pic_download_next)

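# final_page_link is not defined in this snippet; presumably a module-level
# dedup list declared next to the spider class, e.g.:
#
#   final_page_link = []  # page links already scheduled, shared across calls
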
def parse_item(self, response):
    sel = Selector(response)
    item = MzituItem()
    item['classification'] = sel.xpath(
        '/html/body/div[2]/div[1]/div[1]/a[2]/text()').extract_first(default="N/A")
    item['title'] = sel.xpath(
        '/html/body/div[2]/div[1]/div[1]/text()[3]').extract_first(default="N/A")
    item['name'] = sel.xpath(
        '/html/body/div[2]/div[1]/div[4]/span[1]/text()').extract_first(default="N/A")
    max_num = sel.xpath(
        './*//div[@class="pagenavi"]/a[last()-1]/span/text()').extract_first(default="N/A")
    print(max_num)
    item['url'] = response.url
    for num in range(1, int(max_num) + 1):
        n_url = response.url + '/' + str(num)
        print('pic-' + n_url)
        yield scrapy.Request(n_url, callback=self.imgurl)
    # NOTE: self.urls is shared spider state filled in by the imgurl callback;
    # those requests may not have completed when the item is yielded here
    item['image_urls'] = self.urls
    yield item