def parse_all_shops(self, response):
    # Grab the <li> node of every shop in the search result list
    shops = response.xpath('//div[@class="shopsearch"]/div[@class="content"]/ul/li')
    logging.debug(shops)
    for shop in shops:
        logging.debug("=============================")
        logging.debug(shop.get())
        # Handle each shop separately, pulling out its name, image and address
        item = XiaohuaItem()
        # Shop name
        name = shop.xpath('./div[@class="pic"]/a/img/@alt').extract()[0]
        # Shop image URL
        picUrl = shop.xpath('./div[@class="pic"]/a/img/@src').extract()[0]
        # Shop URL
        url = shop.xpath('./div[@class="pic"]/a/@href').extract()[0]
        # Shop address
        address = shop.xpath('./div[@class="txt"]/div[@class="tag-addr"]/span/text()').extract()[0]
        item['name'] = name
        item['pic'] = picUrl
        item['url'] = url
        item['address'] = address
        logging.debug(item)
        logging.debug("=============================")
        # Yield the scraped data
        yield item
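# The snippets in this collection all instantiate XiaohuaItem from a project
# items.py that is not included here. The sketch below is an assumption: it
# merely declares every field name these parse methods assign to, so the
# snippets run; the real projects define their own (likely smaller) classes.
import scrapy

class XiaohuaItem(scrapy.Item):
    # Fields used by the shop/picture parsers
    name = scrapy.Field()
    pic = scrapy.Field()
    url = scrapy.Field()
    address = scrapy.Field()
    addr = scrapy.Field()
    image_urls = scrapy.Field()
    imgUrl = scrapy.Field()
    img_url = scrapy.Field()
    img_name = scrapy.Field()
    src = scrapy.Field()
    alt = scrapy.Field()
    title = scrapy.Field()
    # Fields used by the parse_one/parse_two/parse_three chain
    siteURL = scrapy.Field()
    pageURL = scrapy.Field()
    detailURL = scrapy.Field()
    fileName = scrapy.Field()
    path = scrapy.Field()
    # Fields used by the post parser
    nickname = scrapy.Field()
    content = scrapy.Field()
    support = scrapy.Field()
    not_support = scrapy.Field()
    collect = scrapy.Field()
    message = scrapy.Field()
    share = scrapy.Field()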
def pic_parse(self, response):
    links = response.xpath('//div[@class="picbox"]/a/img/@src').extract()
    for link in links:
        # One item per image; ImagesPipeline expects image_urls to be a list
        # of absolute URLs, not a bare string
        item = XiaohuaItem()
        item['image_urls'] = [response.urljoin(link)]
        yield item
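# pic_parse only fills in image_urls; the actual download is done by Scrapy's
# built-in ImagesPipeline. A minimal settings.py sketch to enable it (the
# store path is a placeholder, not from the original project; the pipeline
# also requires Pillow to be installed):
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './images'  # assumed download directory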
def parse(self, response):
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    if response.status == 200:
        contents = response.xpath(
            '//div[@class="content-left"]/div[@class="one-cont"]')
        for content in contents:
            # One item per post; reusing a single item across yields would
            # make every yielded item point at the same object
            item = XiaohuaItem()
            item['nickname'] = self.join_list(
                content.xpath('./div[1]/div/a/i/text()').extract())
            item['content'] = self.join_list(
                content.xpath('./p[@class="fonts"]/a/text()').extract())
            item['support'] = int(self.join_list(
                content.xpath('./ul/li[1]/span/text()').extract()))
            item['not_support'] = int(self.join_list(
                content.xpath('./ul/li[2]/span/text()').extract()))
            item['collect'] = int(self.join_list(
                content.xpath('./ul/li[3]/span/text()').extract()))
            item['message'] = int(self.join_list(
                content.xpath('./ul/li[4]/a/span/text()').extract()))
            item['share'] = int(self.join_list(
                content.xpath('./ul/li[5]/span/text()').extract()))
            print(item)
            yield item
    # Follow the next page until page 5
    if self.page < 5:
        self.page += 1
        yield scrapy.Request(self.base_url.format(self.page),
                             headers=headers,
                             callback=self.parse)
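# The parse method above calls self.join_list on every xpath result, but the
# helper is not part of this collection. A plausible one-line sketch, assuming
# it just merges the list of text fragments returned by extract() and trims
# whitespace:
def join_list(self, lst):
    # Concatenate the extracted text nodes into a single clean string
    return ''.join(lst).strip()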
def parse(self, response):
    allPics = response.xpath('//div[@class="img"]/a')
    for pic in allPics:
        item = XiaohuaItem()
        name = pic.xpath('./img/@alt').extract()[0]
        src = pic.xpath('./img/@src').extract()[0]
        addr = 'http://www.xiaohuar.com' + src
        item['name'] = name
        item['addr'] = addr
        yield item
def parse(self, response):
    for book in response.css('div.img'):
        # One item per image; ImagesPipeline expects a list of absolute URLs
        item = XiaohuaItem()
        item['image_urls'] = [response.urljoin(book.xpath('./a/img/@src').extract_first())]
        item['name'] = book.xpath('./a/img/@alt').extract_first()
        yield item
    # Queue the remaining list pages; the scheduler's dupefilter drops repeats
    for i in range(2, 50):
        nextPage = response.urljoin("http://www.xiaohuar.com/list-1-" + str(i) + ".html")
        yield scrapy.Request(nextPage, callback=self.parse)
def parse_three(self, response):
    item = XiaohuaItem()
    # item2 passed down from parse_two via meta
    item3 = response.meta['item2']
    # Regex out the image's real address (detailURL) from the download button
    pattern = re.compile(
        r'<li class="pic-down h-pic-down"><a target="_blank" class="down-btn" href=\'(.*?)\'>.*?</a>',
        re.S)
    match = re.search(pattern, response.text)
    if match is None:
        # Page layout changed or no download button; nothing to yield
        return
    item['detailURL'] = match.group(1)
    item['path'] = item3['path']
    item['fileName'] = item3['fileName']
    yield item
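# parse_three yields items carrying detailURL plus a target path, but nothing
# in this collection performs the download; the original project's pipeline
# is not shown. A minimal sketch of such a pipeline, assuming the item fields
# above (class name and use of requests are assumptions, not the original):
import os
import requests

class SaveImagePipeline:
    def process_item(self, item, spider):
        # Fetch the real image URL and write the bytes to the precomputed path
        resp = requests.get(item['detailURL'], timeout=10)
        os.makedirs(os.path.dirname(item['path']) or '.', exist_ok=True)
        with open(item['path'], 'wb') as f:
            f.write(resp.content)
        return item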
def parse_last(self, response):
    item = XiaohuaItem()
    item['alt'] = response.xpath('//div[@id="big-pic"]//img/@alt').extract()[0].strip()
    image_links = response.xpath('//div[@id="big-pic"]//img/@src').extract()
    item['src'] = []
    for src in image_links:
        if '.jpg' in src:
            item['src'].append(src)
    yield item
def parse_detail(self, response):
    title = response.meta['title']
    div_list = response.xpath('//*[@id="swiper1"]/div/div')
    num = 1
    for div in div_list:
        img_url = 'http://www.521609.com' + div.xpath('.//img/@src').extract_first().strip()
        # print('img_url:', img_url)
        item = XiaohuaItem()
        item['title'] = title
        item['img_name'] = str(num).zfill(2) + '.jpg'
        yield scrapy.Request(url=img_url, callback=self.prase_img, meta={'item': item})
        num += 1
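# The prase_img callback referenced above is not part of this collection. A
# plausible sketch, assuming it simply writes the downloaded image bytes into
# a per-title directory (the directory layout is an assumption):
import os

def prase_img(self, response):
    item = response.meta['item']
    os.makedirs(item['title'], exist_ok=True)
    # response.body holds the raw image bytes for this Request
    with open(os.path.join(item['title'], item['img_name']), 'wb') as f:
        f.write(response.body)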
def parse(self, response):
    # Grab the <li> list that wraps each picture
    li_list = response.xpath('//*[@id="content"]//div[@class="index_img list_center"]/ul/li')
    for li in li_list:
        # Pull out the picture's address and name
        img_url = self.base_url + li.xpath('.//img/@src').extract_first()
        img_name = li.xpath('.//img/@alt').extract_first() + '.jpg'
        item = XiaohuaItem()  # instantiate the item
        # Pack the data into the item; fields can only be set with
        # ['img_url']-style indexing, not attribute access
        item['img_url'] = img_url
        item['img_name'] = img_name
        yield item
    # Splice in each page number and recurse into parse to cover every page
    for i in range(1, 12):
        new_url = self.urls % i
        yield scrapy.Request(url=new_url, callback=self.parse)
def parse_two(self, response):
    # item1 passed down from parse_one via meta
    item2 = response.meta['item1']
    # Extract the page count ("共N页") from the page source with a regex;
    # fall back to 6 pages if the marker is missing
    pattern = re.compile(r'共(.*?)页', re.S)
    match = re.search(pattern, response.text)
    Num = match.group(1) if match else 6
    items = []
    for i in range(1, int(Num) + 1):
        # Note where the instance is created: one item per picture
        item = XiaohuaItem()
        item['fileName'] = item2['fileName']
        # Build each picture's storage path
        item['path'] = item['fileName'] + str(i) + '.jpg'
        # Build each picture's entry link, used to dig the original image
        # URL out of its page source
        item['pageURL'] = response.url[:-5] + '_' + str(i) + '.html'
        items.append(item)
    for item in items:
        yield Request(url=item['pageURL'], meta={'item2': item}, callback=self.parse_three)
def parse_one(self, response):
    # One big list to hold all the items
    items = []
    pattern = re.compile(r'<div class="title".*?<a.*?href="(.*?)">(.*?)</a></span></div>', re.S)
    mains = re.findall(pattern, response.text)
    for main in mains:
        # Create an instance and fill in its fields
        item = XiaohuaItem()
        item['siteURL'] = main[0]
        item['title'] = main[1]
        item['fileName'] = self.base + item['title']
        items.append(item)
    for item in items:
        # Create the folder for each title
        fileName = item['fileName']
        # if not os.path.exists(fileName):
        #     os.makedirs(fileName)
        # Pass the item into the next level via meta
        yield Request(url=item['siteURL'], meta={'item1': item}, callback=self.parse_two)
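# The parse_one/parse_two/parse_three chain assumes module-level imports that
# this collection never shows. The set below covers every name those three
# methods reference; XiaohuaItem's import path is a guess at the usual Scrapy
# project layout:
import re
import os  # only needed if the commented-out folder creation is re-enabled
from scrapy import Request
# from myproject.items import XiaohuaItem  # assumed project-specific path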
def parse(self, response):
    if response.url.startswith('http://www.xiaohuar.com/list-'):
        picList = response.xpath('//div[@class="img"]/a')
        for pic in picList:
            item = XiaohuaItem()
            item['name'] = pic.xpath('./img/@alt').extract()[0]
            item['imgUrl'] = 'http://www.xiaohuar.com' + pic.xpath('./img/@src').extract()[0]
            yield item
    urls = response.xpath('//a/@href').extract()
    for url in urls:
        if url.startswith("http://www.xiaohuar.com/list-") and url not in self.urlSet:
            self.urlSet.add(url)
            yield self.make_requests_from_url(url)
def parse(self, response):
    # Locate every post block; a node can be pinned down by just one of its
    # class attributes
    allin = response.xpath('//div[@class="p-cell cellItem nofavorite backwater"]')
    for pic in allin:
        # Handle each post separately, pulling out its name and image address
        item = XiaohuaItem()
        item['name'] = pic.xpath('./h3/span[@class="cellTit"]/a/text()').extract_first()
        addr = pic.xpath('./div[@class="pic"]/a/img/@src').extract()[0]
        item['addr'] = 'http://www.xiaohua100.cn' + addr
        yield item
    # Bump pageno in the current URL to request the next page
    page_add = re.search(r'pageno=(\d+)', response.url).group(1)
    page_add = 'pageno=' + str(int(page_add) + 1)
    next_url = re.sub(r'pageno=\d+', page_add, response.url)
    yield Request(next_url, headers=self.headers)
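# The methods in this collection lean on spider attributes defined elsewhere
# in their projects (page, base_url, urls, urlSet, base, headers, start_urls).
# A minimal skeleton showing how such a spider is declared; every attribute
# value below is an assumed placeholder, not taken from the original projects:
import scrapy

class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    # Placeholder values; each real project defines its own
    start_urls = ['http://www.xiaohuar.com/list-1-1.html']
    base_url = 'http://www.example.com/index/{}'  # page template for .format(page)
    urls = 'http://www.example.com/list-%d.html'  # page template for the % operator
    page = 1
    urlSet = set()  # dedup set for already-followed list pages
    headers = {'User-Agent': 'Mozilla/5.0'}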