def parse(self, response):
    # Get the <a> tag of every image
    allPics = response.xpath('//div[@class="img"]/a')
    for pic in allPics:
        # Extract each image's information
        item = PicItem()
        name = pic.xpath('./img/@alt').extract()[0]
        addr = pic.xpath('./img/@src').extract()[0]
        addr = 'http://www.xiaohuar.com' + addr
        item['name'] = name
        item['addr'] = addr
        # Yield the scraped data
        yield item

    # Find the next page in the pager
    navPageList = response.xpath('//div[@id="page"]/div[@class="page_num"]/a')
    for navPage in navPageList:
        txt = navPage.xpath('./text()').extract()[0]
        url = navPage.xpath('./@href').extract()[0]
        if txt == '下一页':  # the link labelled "next page"
            print(url)
            yield scrapy.Request(url, callback=self.parse)
            break
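# The parsers in this section all assume a PicItem item class. Its definition
# is not shown in the source; a minimal sketch of what the project's items.py
# might contain for the name/addr snippets (other snippets below declare their
# own fields such as src, href, title, image_urls, pic_url):
import scrapy

class PicItem(scrapy.Item):
    name = scrapy.Field()  # image title, taken from the <img> alt text
    addr = scrapy.Field()  # absolute image URL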
def parse(self, response):
    # Only scrape image data from pages whose URL starts with the list prefix
    if response.url.startswith('http://www.xiaohuar.com/list-'):
        # Get the <a> tag of every image
        allPics = response.xpath('//div[@class="img"]/a')
        for pic in allPics:
            # Handle each image separately: take its name and address
            item = PicItem()
            name = pic.xpath('./img/@alt').extract()[0]
            addr = pic.xpath('./img/@src').extract()[0]
            addr = 'http://www.xiaohuar.com' + addr
            item['name'] = name
            item['addr'] = addr
            # Yield the scraped data
            yield item

    # Collect the URLs of all <a> tags
    urls = response.xpath('//a/@href').extract()
    for url in urls:
        if url.startswith("http://www.xiaohuar.com/list-"):
            # Only follow URLs not yet in the url_set collection
            if url not in XhSpider.url_set:
                # Record the URL in the dedup set
                XhSpider.url_set.add(url)
                # Emit a request for the URL
                yield self.make_requests_from_url(url)
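# XhSpider.url_set above (and in several snippets below) is a class-level set
# used to deduplicate list URLs. A minimal sketch of the spider skeleton this
# implies; the name and start_urls values are assumptions, only url_set is
# actually inferred from the code:
import scrapy

class XhSpider(scrapy.Spider):
    name = 'xh'
    start_urls = ['http://www.xiaohuar.com/list-1-1.html']  # assumed entry point
    # Shared across all parse() calls, so each list URL is requested only once
    url_set = set()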
def parse(self, response):
    # Only take the name and address when the page URL starts with
    # http://www.xiaohuar.com/list-
    if response.url.startswith("http://www.xiaohuar.com/list-"):
        allPics = response.xpath('//div[@class="img"]/a')
        for pic in allPics:
            # Handle each image separately: take its name and address
            item = PicItem()
            name = pic.xpath('./img/@alt').extract()[0]
            addr = pic.xpath('./img/@src').extract()[0]
            addr = 'http://www.xiaohuar.com' + addr
            item['name'] = name
            item['addr'] = addr
            # Yield the scraped data
            yield item

    # Collect all link addresses
    urls = response.xpath("//a/@href").extract()
    for url in urls:
        # Follow addresses that start with http://www.xiaohuar.com/list-
        # and are not in the set yet
        if url.startswith("http://www.xiaohuar.com/list-"):
            if url not in XhSpider.url_set:
                XhSpider.url_set.add(url)
                # The default callback is parse; a callback can also be set
                # explicitly via scrapy.http.Request:
                # from scrapy.http import Request
                # Request(url, callback=self.parse)
                yield self.make_requests_from_url(url)
def parse(self, response):
    pic_item = PicItem()
    img_nodes = response.css('div.topic-list')
    pic_item['href'] = img_nodes.css('a::attr(href)').extract()
    pic_item['src'] = img_nodes.css('img::attr(src)').extract()
    pic_item['title'] = img_nodes.css('span::text').extract()
    yield pic_item
def parse(self, response):
    # If the page address starts with http://www.xiaohuar.com/list-,
    # take each image's name and address
    if response.url.startswith("http://www.xiaohuar.com/list-"):
        allPics = response.xpath('//div[@class="img"]/a')
        for pic in allPics:
            # Process each image separately, taking out its name and address
            item = PicItem()
            name = pic.xpath('./img/@alt').extract()[0]
            addr = pic.xpath('./img/@src').extract()[0]
            addr = 'http://www.xiaohuar.com' + addr
            item['name'] = name
            item['addr'] = addr
            # Yield the crawled data
            yield item

    # Get all the address links
    urls = response.xpath("//a/@href").extract()
    for url in urls:
        # If an address starts with http://www.xiaohuar.com/list- and is
        # not in the collection yet, request it
        if url.startswith("http://www.xiaohuar.com/list-"):
            if url not in XhSpider.url_set:
                XhSpider.url_set.add(url)
                # The default callback is parse; it can also be assigned
                # explicitly:
                # from scrapy.http import Request
                # Request(url, callback=self.parse)
                yield self.make_requests_from_url(url)
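# Note: make_requests_from_url() used above was deprecated in Scrapy 1.4 and
# removed in Scrapy 2.0. On current Scrapy versions the same step is written
# with an explicit Request, as the comment in the snippet already hints:
#     from scrapy.http import Request
#     yield Request(url, callback=self.parse)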
def parse(self, response):
    allpics = response.xpath('//div[@class="img"]/a')
    for pic in allpics:
        item = PicItem()
        name = pic.xpath('./img/@alt').extract()[0]
        addr = pic.xpath('./img/@src').extract()[0]
        # Only prepend the host for relative addresses
        if 'http' not in addr:
            addr = 'http://www.xiaohuar.com' + addr
        item['name'] = name
        item['addr'] = addr
        yield item

    # Collect all link addresses
    urls = response.xpath('//a/@href').extract()
    for url in urls:
        if url.startswith('http://www.xiaohuar.com/list-'):
            if url not in XhSpider.url_set:
                XhSpider.url_set.add(url)
                yield self.make_requests_from_url(url)
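# The 'http' in addr check above is a rough test for an absolute URL. Scrapy's
# response.urljoin() handles relative and absolute addresses uniformly, so the
# address handling could equally be written as (equivalent sketch):
#     addr = response.urljoin(pic.xpath('./img/@src').extract()[0])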
def parse_detail(self, response):
    urls = response.xpath('//div[@class="entry-content"]/p/img/@src').extract()
    for url in urls:
        # image_urls is expected to be a list of URLs
        it1 = PicItem()
        it1['image_urls'] = [url]
        yield it1
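# image_urls is the field name that scrapy.pipelines.images.ImagesPipeline
# reads by default, so this snippet presumably relies on that pipeline to do
# the downloading. A minimal settings.py sketch, assuming that setup (the
# store path is a placeholder):
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './images'  # placeholder download directory
# The item class then needs both fields the pipeline uses:
#     image_urls = scrapy.Field()  # input: list of URLs to fetch
#     images = scrapy.Field()      # output: download results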
def parse(self, response):
    allPics = response.xpath("//div[@class='img']/a")
    for pic in allPics:
        item = PicItem()
        name = pic.xpath("./img/@alt").extract()[0]
        addr = pic.xpath("./img/@src").extract()[0]
        addr = "http://www.xiaohuar.com" + addr
        item["name"] = name
        item["addr"] = addr
        yield item
def parse(self, response):
    # Pull the lazy-loaded image sources out of the raw HTML with a regex
    # (requires `import re` at module level)
    s_re = re.compile(r'img class="lazy" src="(.*?)"')
    srcs = re.findall(s_re, response.text)
    item = PicItem()
    item['src'] = srcs
    yield item
def parse(self, response):
    # Get the <a> tags of all images under the #imgid container
    allPics = response.xpath('//*[@id="imgid"]/a')
    for pic in allPics:
        # Handle each image separately: take its address
        item = PicItem()
        # pic is the <a> element itself, so read its own href attribute
        addr = pic.xpath('./@href').extract()[0]
        addr = 'http://image.baidu.com' + addr
        item['addr'] = addr
        # Yield the scraped data
        yield item
def parse(self, response):
    # Get the <a> tag of every word-list entry
    allPics = response.xpath('//td[@class="wordbook-wordlist-name"]/a')
    for pic in allPics:
        # Handle each entry separately: take its address
        item = PicItem()
        addr = pic.xpath('./@href').extract()[0]
        addr = 'http://www.xiaohuar.com' + addr
        item['addr'] = addr
        # Yield the scraped data
        yield item
def parse(self, response):
    # Get the <a> tag of every image
    allPics = response.xpath('//div[@class="img"]/a')
    for pic in allPics:
        # Handle each image separately: take its name and address
        item = PicItem()
        name = pic.xpath('./img/@alt').extract()[0]
        addr = pic.xpath('./img/@src').extract()[0]
        addr = 'http://www.xiaohuar.com' + addr
        item['name'] = name
        item['addr'] = addr
        # Yield the scraped data
        yield item
def parse(self, response):
    # Follow each thumbnail to its detail page
    into = response.xpath('//div[@class="post-module-thumb"]/a/@href').extract()
    for u in into:
        item1 = PicItem()
        item1['detailed'] = u
        yield scrapy.Request(item1['detailed'], callback=self.parse_detail)

    # Follow the pager when its markup contains a "next" button
    ifnxt = response.xpath('//div[@class="btn-pager"]').extract()[0]
    if "empty button" in ifnxt:
        nxt = response.xpath('//div[@class="btn-pager"]/a/@href').extract()[0]
        yield scrapy.Request(nxt, callback=self.parse)
def parse(self, response):
    print("status:" + str(response.status))
    pics = response.xpath('//img[@class="img-fluid"]')
    for pic in pics:
        item = PicItem()
        src = pic.xpath('./@src').extract()
        name = pic.xpath('./@alt').extract()
        # Some images carry no alt text, so guard before indexing
        if len(name) != 0:
            item['name'] = name[0]
        item['src'] = src[0]
        yield item
def parse(self, response):
    item = PicItem()
    imgurls = response.css(".post img::attr(src)").extract()
    item['imgurl'] = imgurls
    yield item
def parse(self, response):
    # Only process this page if the URL database says it has not been crawled
    if not XhSpider.url_list_db.query(response.url, 1):
        XhSpider.url_list_db.update(response.url)
        if response.url.startswith("https://www.zbjuran.com/mei/"):
            allPics = response.xpath('//center/div[@class="picbox"]')
            for pic in allPics:
                # Handle each image separately: take its name and address
                item = PicItem()
                if len(pic.xpath('./img/@src')) >= 1:
                    addr = pic.xpath('./img/@src').extract()[0]
                else:
                    addr = pic.xpath('./p/img/@src').extract()[0]
                # Prefer the page title as the name, then the alt text,
                # then a name derived from the address
                name_1 = response.xpath('//div[@class="title"]/h2/text()').extract()[0]
                name_2 = addr.replace('/', '_').replace(':', '_')
                name = ""
                name_obj = pic.xpath('./img/@alt')
                if len(name_obj) >= 1:
                    name = name_obj.extract()[0]
                if len(name_1) >= 1:
                    item['name'] = name_1
                elif len(name) >= 1:
                    item['name'] = name
                else:
                    item['name'] = name_2
                if addr.startswith('/'):
                    addr = "https://www.zbjuran.com" + addr
                item['addr'] = addr
                # Yield the scraped data
                yield item
        print("2.get img ing ...... ", sys._getframe().f_lineno)

        # Collect all link addresses
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            url_arr = url.split("_")
            # Queue detail pages under /mei/ that are not in the database yet
            if url.startswith("/mei/") and url.endswith(".html"):
                url_whole = "https://www.zbjuran.com" + url
                if not XhSpider.url_list_db.query(url_whole):
                    XhSpider.url_list_db.insert(url_whole)
                    # The default callback is parse; it can also be set via
                    # scrapy.http.Request(url, callback=self.parse)
                    print("add", url_whole)
                    yield self.make_requests_from_url(url_whole)
            # Pagination links such as 2.html are relative to the current page
            elif url_arr[0].isdigit():
                u_arr = response.url.split("/")
                u_arr.pop()
                u_arr.append(url)
                url_whole = "/".join(u_arr)
                if not XhSpider.url_list_db.query(url_whole):
                    print("add", url_whole)
                    XhSpider.url_list_db.insert(url_whole)
                    yield self.make_requests_from_url(url_whole)
            # Absolute .html links are queued as-is
            elif url.startswith("http") and url.endswith(".html"):
                url_whole = url
                if not XhSpider.url_list_db.query(url_whole):
                    XhSpider.url_list_db.insert(url_whole)
                    print("add", url_whole)
                    yield self.make_requests_from_url(url_whole)

    print("3.get waiting href ... ")
    # Pull the next queued URL; anything longer than 10 characters is a
    # plausible URL, an empty result means the site is exhausted
    url_whole = XhSpider.url_list_db.query_data()
    if len(url_whole) > 10:
        yield self.make_requests_from_url(url_whole)
    else:
        print("4. we finished this site")
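# XhSpider.url_list_db above is some persistent URL store with query(),
# update(), insert() and query_data() methods. Its implementation is not in
# the source; a hypothetical in-memory stand-in that matches the calls used:
class UrlListDb:
    def __init__(self):
        self.seen = {}  # url -> crawled flag

    def query(self, url, crawled_only=0):
        # True if the URL is known (or, with crawled_only, already crawled)
        if crawled_only:
            return self.seen.get(url, False)
        return url in self.seen

    def insert(self, url):
        self.seen.setdefault(url, False)  # queued, not yet crawled

    def update(self, url):
        self.seen[url] = True  # mark as crawled

    def query_data(self):
        # Return one queued-but-uncrawled URL, or '' when none are left
        for url, crawled in self.seen.items():
            if not crawled:
                return url
        return ''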
def parse_url(self, response):
    pic_list = response.xpath("//img/@src").extract()
    for pic in pic_list:
        # Create a fresh item per URL; reusing one mutable item across
        # yields would leave every emitted item holding the last URL
        items = PicItem()
        items['pic_url'] = pic
        yield items