def parse(self, response):
    """Scrape job postings (company, title, deadline, ...) from a JobKorea
    listing page, store each record in MongoDB, and yield the item.
    """
    self.log('I just visited: ' + response.url)
    infos = response.xpath('//*[@id="devStarterForm"]/div[2]/ul//li')
    for info in infos:
        item = TutorialItem()
        item['company_name'] = info.xpath('div[1]/div[1]/a/text()')[0].extract()
        link = info.xpath('div[1]/div[1]/a/@href')[0].extract()
        # Normalize the company link: site-relative paths get the host
        # prefix. The original if/elif left 'company_info' unset for links
        # starting with anything other than '/' or 'h'; the else branch
        # now covers every case.
        if link.startswith('/'):
            self.log('first char is /')
            item['company_info'] = 'www.jobkorea.co.kr' + link
        else:
            item['company_info'] = link
        item['title'] = info.xpath('div[2]/div[1]/a/span/text()')[0].extract()
        # The deadline can carry several different class names.
        item['deadline'] = info.xpath('div[4]/span[@class="day"]/text() | div[4]/span[@class="day schedule"]/text() | div[4]/span[@class="day tomorrow"]/text() | div[4]/span[@class="day today"]/text()').extract()
        item['achievement'] = info.xpath('div[3]/span[1]/text()')[0].extract()
        item['career'] = info.xpath('div[3]/strong/text()')[0].extract()
        item['area'] = info.xpath('div[3]/span[2]/text()')[0].extract()
        item['job'] = info.xpath('div[2]/div[2]/span/text()').extract()
        # Build a FRESH dict per item: the original kept writing into one
        # shared self.dictionary, so keys set by a previous item that this
        # item lacks would leak into the stored record.
        self.dictionary = {key: item.get(key) for key in item}
        self.collection.insert(self.dictionary, manipulate=False)
        yield item
def parse(self, response):
    """Extract textbook metadata (title, version, grade, subject,
    publisher) from the detail page and yield it as one TutorialItem."""
    base = ".//*[@id='content_bg']/div[2]/div[1]/div[1]/div[2]"
    # Field name -> xpath relative to the page; all values are kept as
    # the raw extracted lists, exactly as before.
    field_paths = {
        "title": base + "/p/text()",
        "version": base + "/ul[1]/li[1]/a/text()",
        "grade": base + "/ul[1]/li[2]/a/text()",
        "subject": base + "/ul[1]/li[3]/a/text()",
        "publishing": base + "/ul[1]/li[4]/text()",
    }
    Tutorial = TutorialItem()
    for field, path in field_paths.items():
        Tutorial[field] = response.selector.xpath(path).extract()
    yield Tutorial
def parse(self, response):
    """Scrape name/price/link for every book on the listing page and chain
    a detail-page request carrying the partially-filled item in meta."""
    try:
        for book in response.css('article.product_pod'):
            items = TutorialItem()
            # Book name comes from the anchor's title attribute.
            items['name'] = book.xpath('./h3/a/@title').extract_first()
            # Price via CSS class selector.
            items['price'] = book.css('p.price_color::text').extract_first()
            # Detail-page link, made absolute against the current URL.
            href = response.urljoin(book.xpath('./div[1]/a/@href').extract_first())
            items['href'] = href
            yield scrapy.Request(url=href,
                                 meta={'items': items},
                                 callback=self.pare_detail,
                                 dont_filter=True)
    except Exception as err:
        # The original used the Python-2-only `except BaseException, err:`
        # syntax (a SyntaxError on Python 3) and caught BaseException,
        # which would swallow KeyboardInterrupt/SystemExit too.
        print(err)
def parse(self, response):
    """Yield a single TutorialItem whose 'title' field holds the raw
    extracted div at /html/body/div[8]/div[2]/div."""
    # NOTE(review): the original computed a `filename` from the URL that
    # was never used; removed as dead code.
    item = TutorialItem()
    item['title'] = response.xpath("/html/body/div[8]/div[2]/div").extract()
    yield item
def parse(self, response):
    """Scrape a Tieba thread list: title, author and reply count per
    thread row, logging each record as it is produced."""
    for line in response.xpath('//li[@class="j_thread_list clearfix"]'):
        item = TutorialItem()
        item['title'] = line.xpath(
            './/div[contains(@class, "threadlist_title pull_left j_th_tit")]/a/text()'
        ).extract()
        item['author'] = line.xpath(
            './/div[contains(@class, "threadlist_author pull_right")]//span[contains(@class, "frs-author-name-wrap")]/a/text()'
        ).extract()
        item['reply'] = line.xpath(
            './/div[contains(@class, "col2_left j_threadlist_li_left")]/span/text()'
        ).extract()
        # One structured log call replaces the original noisy
        # 'sss…'/'qqq…' separator lines.
        self.logger.info({
            'title': item['title'],
            'author': item['author'],
            'reply': item['reply']
        })
        yield item
def getList(self, response):
    """Collect every chapter URL from the volume listing and schedule a
    content request per chapter, tagging each with its 1-based chapter ID.
    """
    item = TutorialItem()
    volumes = response.xpath('//div[@class="volume"]')
    heading = response.xpath('//div[@class="volume"][1]/h3/text()').extract()[1].strip()
    # When the first volume is the extras section ("作品相关"), skip it
    # and start collecting from the second volume instead.
    first_volume = 2 if heading == '作品相关' else 1
    url_list = []
    for idx in range(first_volume, len(volumes) + 1):
        url_list.extend(
            response.xpath(
                '//div[@class="volume"][{}]//ul//li//a/@href'.format(idx)
            ).extract())
    for chapter_id, url in enumerate(url_list, start=1):
        item['chapterID'] = chapter_id
        yield scrapy.Request(url='https:' + url,
                             callback=self.getContent,
                             meta={'key': chapter_id},
                             errback=self.errback_httpbin)
def parse(self, response):
    """Scrape title/link/time/click-count from each forum table row and
    persist the record via saveInDb (no items are yielded)."""
    print("receive data")
    host = 'http://www.t66y.com/'
    selector = Selector(response)
    anchor = 'td[@style="text-align:left;padding-left:8px"]/h3/a[@target="_blank"]'
    for row in selector.xpath('//tr[@class="tr3 t_one"]'):
        titles = row.xpath(anchor + '/text()')
        # Rows without a title anchor (ads, separators) are skipped.
        if len(titles) == 0:
            continue
        record = TutorialItem()
        # As in the original per-field loops, the LAST match wins and a
        # field stays unset when there is no match at all.
        record['title'] = titles[-1].extract()
        links = row.xpath(anchor + '/@href')
        if len(links) > 0:
            record['link'] = host + links[-1].extract()
        times = row.xpath('td/a[@class="f10"]/text()')
        if len(times) > 0:
            record['publicTime'] = times[-1].extract()
        counts = row.xpath('td[@class="tal f10 y-style"]/text()')
        if len(counts) > 0:
            record['clickTimes'] = counts[-1].extract()
        self.saveInDb(record)
def detail_parse(self, response):
    """Museum exhibit detail page: extract picture URL, name and body text
    into one TutorialItem; abort (return nothing) when data is missing."""
    items = TutorialItem()
    baseurl = 'https://www.tjnhm.com/'
    items['midx'] = '16'
    # The cover image sits either inside a <span> or directly under <p>;
    # try both locations. (Original evaluated the first xpath twice and
    # used `!= None` / `== None`; hoisted and switched to `is None`.)
    url = response.xpath('/html/body/div[2]/div[1]/div[2]/p[1]/span/img/@src').get()
    if url is None:
        url = response.xpath('/html/body/div[2]/div[1]/div[2]/p[1]/img/@src').get()
    if url is None:
        return
    items['pic'] = baseurl + url
    items['name'] = response.xpath(
        '/html/body/div[2]/div[1]/div[2]/h1/text()').get().strip()
    parts = []
    for span in response.xpath('//*[@id="aboutus_text"]//span'):
        text = span.xpath('text()').get()
        if text is None:
            # Original behavior preserved: abort the whole item on the
            # first text-less span.
            return
        parts.append(text.strip())
    items['text'] = ''.join(parts)
    print(items['name'])
    print(items['text'])
    print(items['pic'])
    yield items
def parse(self, response):
    """Parse a JSON classified-ads response, yield each ad wrapped as
    {'object': item}, and re-POST for the next page while pages are full."""
    payload = json.loads(response.text)
    ads_list = payload["ads"]
    for ad in ads_list:
        item = TutorialItem()
        if "subject" in ad:
            item["subject"] = ad["subject"]
        if "price" in ad:
            item["price"] = ad["price"][0]
        if "category_name" in ad:
            item["category"] = ad["category_name"]
        # City is nested under an optional "location" object.
        if "city" in ad.get("location", {}):
            item["location"] = ad["location"]["city"]
        item["url"] = ad["url"]
        yield {'object': item}
        self.i += 1
        print(self.i)
    # A full page (35 ads) suggests more results: advance the offset and
    # request the next page with the same POST form.
    if len(ads_list) == 35:
        self.frmdata["offset"] += 35
        yield scrapy.Request(self.url,
                             callback=self.parse,
                             method='POST',
                             body=json.dumps(self.frmdata),
                             headers=self.headr,
                             dont_filter=True)
def parse(self, response):
    """Scrape quote blocks (author/content/tags) and follow pagination.

    NOTE(review): a large block of commented-out earlier drafts (file
    dump, list-accumulating variant) was deleted as dead code.
    """
    self.logger.info('A response from %s just arrived!', response.url)
    for node in response.xpath("//div[@class='col-md-8']/div[@class='quote']"):
        item = TutorialItem()
        item["author"] = node.xpath("./span[2]/small/text()").extract()
        item["content"] = node.xpath("./span[1]/text()").extract()
        item["tags"] = node.xpath('./div[@class="tags"]/a/text()').extract()
        yield item
    # Follow the "next" pagination link back into this parser. (The
    # original guarded with `if href:`, but a Selector is always truthy,
    # so the check was dead.)
    for href in response.xpath('//li[@class="next"]/a/@href'):
        yield scrapy.Request(response.urljoin(href.extract()), callback=self.parse)
def parse(self, response):
    """Yield one TutorialItem (author, text, tags) per quote block."""
    quote_blocks = response.css('div.quote')
    for block in quote_blocks:
        record = TutorialItem()
        # .get()/.getall() are the modern aliases of
        # extract_first()/extract() — identical behavior.
        record['author'] = block.css('small.author::text').get()
        record['text'] = block.css('span.text::text').get()
        record['tags'] = block.css('div.tags a.tag::text').getall()
        yield record
def parse_list(self, response):
    """Flipkart product-list page: build a partially-filled item per
    product tile and chain a detail request carrying it in meta."""
    hxs = Selector(response)
    titles = hxs.select(
        "//div[contains(@class,'product-unit unit-4 browse-product new-design')]"
    )
    # NOTE(review): the original accumulated an `items` list and a
    # `count1` counter that were never read; both removed as dead code.
    for title in titles:
        item = TutorialItem()
        # NOTE(review): str(...extract()) stringifies the whole result
        # list (its repr), not the first match — looks unintended but is
        # preserved here; confirm against downstream consumers.
        item['model'] = str(
            title.select(".//div[contains(@class,'pu-title')]/a/text()").
            extract()).encode('utf-8').strip()
        item['offer'] = title.select(
            ".//div[contains(@class,'pu-final')]/span/text()").extract()
        item['image'] = title.select(
            ".//div[contains(@class,'pu-visual-section')]/a/img/@data-src")
        item['standard_url'] = "http://www.flipkart.com" + \
            title.select(
                ".//div[contains(@class,'pu-title')]/a/@href")[0].extract()
        request = Request(item['standard_url'], callback=self.new_features)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Scrape Douban Top250 movie entries and follow the next page.

    Fix: the original created ONE TutorialItem before the loop and
    mutated/yielded it repeatedly — with asynchronous downstream
    processing every yielded reference sees the last movie's data.
    """
    selector = Selector(response)
    Movies = selector.xpath('//div[@class="info"]')
    for eachMovie in Movies:
        item = TutorialItem()  # fresh item per movie
        title = eachMovie.xpath('div[@class="hd"]/a/span/text()').extract()  # may span several <span> tags
        item['title'] = "".join(title)
        movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
        item['movieInfo'] = ';'.join(movieInfo)
        item['star'] = eachMovie.xpath(
            'div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
        quote = eachMovie.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        # The quote can be absent; fall back to an empty string.
        item['quote'] = quote[0] if quote else ''
        yield item
    nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
    # Page 10 is the last page and has no "next" link.
    if nextLink:
        yield Request(urljoin(response.url, nextLink[0]), callback=self.parse)
def parse(self, response):
    """Scrape quote text/author/tags with BeautifulSoup and follow the
    next page via a CSS selector."""
    # Name the parser explicitly: bare BeautifulSoup(text) emits a
    # "no parser was explicitly specified" warning and may pick a
    # different parser (and thus a different tree) per machine.
    bs_obj = BeautifulSoup(response.text, 'html.parser')
    items = bs_obj.find_all('div', {'class': 'quote'})
    for item in items:
        item_field = TutorialItem()  # one output item per quote block
        text = item.find('span', {
            'itemprop': 'text',
            'class': 'text'
        }).text
        author = item.find('small', {'class': 'author'}).text
        tags = [tag.text for tag in item.find_all('a', {'class': 'tag'})]
        item_field['text'] = text
        item_field['author'] = author
        item_field['tags'] = tags
        self.logger.info(item_field['text'])
        yield item_field
    # Pagination via the response's own CSS selector (simpler and safer
    # than navigating the BeautifulSoup tree).
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    if next_page:
        next_url = response.urljoin(next_page)  # make the link absolute
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Yield one item holding all text under #b_footerItems and #sbox."""
    result = TutorialItem()
    footer_text = response.xpath('//*[@id="b_footerItems"]//text()').extract()
    sbox_text = response.xpath('//*[@id="sbox"]//text()').extract()
    result["identification"] = footer_text
    result["name"] = sbox_text
    yield result
def parse(self, response):
    """Parse a JD search-result page into a list of product items
    (title, price, description, images, source URL)."""
    wrap = './/div[@class="gl-i-wrap"]'
    listing = response.xpath('//ul[@class="gl-warp clearfix"]')
    results = []
    for li in listing.xpath('.//li[@class="gl-item"]'):
        entry = TutorialItem()
        price = li.xpath(wrap + '/div[@class="p-price"]/strong/i/text()').extract()
        title = li.xpath(wrap + '/div[@class="p-name p-name-type-2"]/a/em/text()').extract()
        description = li.xpath(wrap + '/div[@class="p-name p-name-type-2"]/a/@title').extract()
        # The image URL may be lazy-loaded: fall back to data-lazy-img
        # when @src is absent.
        pictures = li.xpath(wrap + '/div[@class="p-img"]/a/img/@src').extract()
        if not pictures:
            pictures = li.xpath(wrap + '/div[@class="p-img"]/a/img/@data-lazy-img').extract()
        source_url = li.xpath(wrap + '/div[@class="p-name p-name-type-2"]/a/@href').extract()
        entry['sourceName'] = "jd"
        entry['title'] = title[0].encode('utf-8')
        entry['price'] = price[0] if price else 0
        entry['description'] = description[0].encode('utf-8')
        # pic1 and pic2 intentionally hold the same URL, as before.
        entry['pic1'] = pictures[0].encode('utf-8')
        entry['pic2'] = pictures[0].encode('utf-8')
        entry['sourceURL'] = source_url[0].encode('utf-8')
        results.append(entry)
    return results
def parse(self, response):
    """Museum collection page: extract name, (last) picture URL and the
    concatenated paragraph text into one TutorialItem."""
    # response.status is an int; the original compared it to the STRING
    # '404', so the not-found guard could never fire.
    if response.status == 404:
        return
    baseurl = 'http://www.19371213.com.cn/collection/zdwwjs/201811'
    item = TutorialItem()
    item['midx'] = '44'
    item['name'] = response.xpath(
        '/html/body/div[6]/div/div/div/div[2]/div/div/article/section/div[2]/header/h4/text()'
    ).get().strip()
    # As in the original, the LAST image wins, and 'pic' stays unset when
    # there are no images at all.
    for img in response.xpath(
            '/html/body/div[6]/div/div/div/div[2]/div/div/article/section/div[2]/div/div/div//img'
    ):
        # @src is a relative path like './xxx.jpg'; drop the leading dot.
        item['pic'] = baseurl + img.xpath('@src').get()[1:]
    parts = []
    for para in response.xpath(
            '/html/body/div[6]/div/div/div/div[2]/div/div/article/section/div[2]/div/div/div//p'
    ):
        text = para.xpath('text()').get()
        if text is not None:
            parts.append(text.strip())
    item['text'] = ''.join(parts)
    print(item['name'])
    print(item['text'])
    print(item['pic'])
    yield item
def parse(self, response):
    """Douban Top250: yield one item per movie, then follow pagination.

    Fixes: (1) the original reused a single TutorialItem across the loop
    (all yielded references end up pointing at the last movie's data);
    (2) `star` was already the extracted rating string, so the original
    `star[0]` stored only its first CHARACTER (e.g. '9' out of '9.6').
    """
    selector = Selector(response)
    movies = selector.xpath('//div[@class="info"]')
    for movie in movies:
        item = TutorialItem()  # fresh item per movie
        title = movie.xpath('div[@class="hd"]/a/span/text()').extract()
        fullTitle = ''
        for each in title:
            fullTitle += each
        item['title'] = fullTitle
        movieInfo = movie.xpath('div[@class="bd"]/p/text()').extract()
        item['movieInfo'] = ';'.join(movieInfo).replace(' ', '').replace('\n', '')
        star = movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        item['star'] = star  # full rating string, not just its first char
        quote = movie.xpath('div[@class="bd"]/p/span/text()').extract()
        item['quote'] = quote[0] if quote else ''
        yield item
    nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
    if nextPage:
        nextPage = nextPage[0]
        print(self.url + str(nextPage))
        yield Request(self.url + str(nextPage), callback=self.parse)
def parse(self, response):
    """Collect (url, title) for every article in the sort/filter results
    and return them as a list of items.

    NOTE(review): several commented-out draft fragments (alternate xpaths,
    debug prints, a triple-quoted debug loop) were deleted as dead code.
    """
    sel = Selector(response)
    sites = sel.xpath('//*[@id="js-sort-filter-results"]/section/article')
    items = []
    for site in sites:
        item = TutorialItem()
        url = site.xpath('a/@href').extract()
        title = site.xpath('a/div[2]/h3/text()').extract()
        # Values are kept as lists of utf-8-encoded byte strings, as the
        # original did.
        item['url'] = [u.encode('utf-8') for u in url]
        item['title'] = [t.encode('utf-8') for t in title]
        items.append(item)
    return items
def parse_next_again(self, response):
    # Scrape a product page: the product name plus its availability label,
    # recording out-of-stock status ("Notify me") as item['count'] = 1.
    # NOTE: Python 2 only — the bare `print s` statement and the
    # 'string_escape' codec do not exist on Python 3.
    item = TutorialItem()
    name = response.xpath(
        '//div[@class = "add-to-cart"]/div[1]/p[1]/text()').extract()[0]
    # The availability label may live in three different places; later
    # matches deliberately override earlier ones (span, then anchor, then
    # the second div's span as a last resort).
    x = response.xpath(
        '//div[@class = "add-to-cart"]/span/text()').extract()
    y = response.xpath('//div[@class = "add-to-cart"]/a/text()').extract()
    if x != []:
        sale = x
    if y != []:
        sale = y
    if x == [] and y == []:
        sale = response.xpath(
            '//div[@class = "add-to-cart"]/div[2]/span/text()').extract()
    #color = response.xpath('//div[]')
    item['name'] = name
    item['sale'] = sale[0].strip()
    # Python-2 round trip through unicode-escape/string_escape to obtain a
    # plain byte string for the comparison below.
    s = sale[0].strip().encode('unicode-escape').decode('string_escape')
    print s
    if s == "Notify me":
        # "Notify me" button => product is out of stock.
        item['count'] = 1
    else:
        item['count'] = 0
    #print name
    # NOTE(review): `list` here is a module-level list that shadows the
    # builtin — presumably a running list of scraped names; confirm.
    list.append(name)
    #print(list)
    yield item
def parse_product(self, response):
    """Product page: extract title and issue year/month, then chain a
    request to the price service with the partially-filled item in meta."""
    phonename = response.xpath(
        '//div/ul[contains(@class, "parameter2 p-parameter-list")]/li[1]/@title'
    ).extract_first()
    spec_dd = response.xpath('//div[contains(@class, "Ptable-item")][1]/dl/dd/text()')
    # Release date split across two regexes: "NNNN年" and "NN月".
    issueyear = spec_dd.re(u'[1-9][0-9][0-9][1-9]\u5e74')
    issuemonth = spec_dd.re(u'[1-9][0-9]*\u6708')
    # The SKU id is the numeric part of ".../<digits>.html".
    sku_ids = re.findall(r"(\d+)\.html$", response.url)
    priceUrl = "https://p.3.cn/prices/mgets?&skuIds=J_" + str(sku_ids[0])
    item = TutorialItem()
    item['phonename'] = phonename
    item['issueyear'] = issueyear
    item['issuemonth'] = issuemonth
    item['itemurl'] = response.url
    request = scrapy.Request(priceUrl, method="GET", callback=self.parse_price)
    request.meta['item'] = item
    logging.log(logging.DEBUG, request)
    yield request
def parse_next_one(self, response):
    # Scrape product name + availability; count=0 means "BUY NOW" (in
    # stock), count=1 otherwise.
    # NOTE: Python 2 only — the bare `print s` statement and the
    # 'string_escape' codec do not exist on Python 3.
    name = response.xpath(
        '//div[@class = "add-to-cart"]/div[1]/p[1]/text()').extract(
        )[0].strip()
    # "Lite" models expose the label in a <span>; other models in the
    # second <div>'s <span>.
    if name.find('Lite') != -1:
        sale = response.xpath('//div[@class = "add-to-cart"]/span/text()'
                              ).extract()[0].strip()
    else:
        sale = response.xpath(
            '//div[@class = "add-to-cart"]/div[2]/span/text()').extract(
            )[0].strip()
    # NOTE(review): `sale` is a string here, never [], so this fallback is
    # unreachable — and extract()[0] above would already raise IndexError
    # when the node is missing. Probably the emptiness check was meant to
    # happen BEFORE indexing; confirm intended behavior.
    if sale == []:
        sale = response.xpath(
            '//div[@class = "add-to-cart"]/div[2]/text()').extract(
            )[0].strip()
    item = TutorialItem()
    item['name'] = name
    item['sale'] = sale
    # Python-2 byte-string round trip for the comparison below.
    s = sale.encode('unicode-escape').decode('string_escape')
    print s
    if s == "BUY NOW":
        # "BUY NOW" button => in stock.
        item['count'] = 0
    else:
        item['count'] = 1
    #print name
    # NOTE(review): `list` is a module-level list shadowing the builtin.
    list.append(name)
    yield item
def parse(self, response):
    """Emit title/link/desc for every <li> found under any <ul>."""
    entries = response.xpath('//ul/li')
    for entry in entries:
        result = TutorialItem()
        result['title'] = entry.xpath('a/text()').extract()
        result['link'] = entry.xpath('a/@href').extract()
        result['desc'] = entry.xpath('text()').extract()
        yield result
def parse(self, response):
    """Yield text/author items per quote, then follow the next page."""
    for quote in response.css('div.quote'):
        item = TutorialItem(
            text=quote.css('span.text::text').get(),
            author=quote.css('small.author::text').get(),
        )
        yield item
    next_page = response.css('li.next a::attr(href)').get()
    # response.follow resolves relative links against the current page.
    if next_page is not None:
        yield response.follow(next_page, self.parse)
def parse(self, response):
    """Douban Top250 list: one item per movie entry, then follow the
    "next" pagination link."""
    entries = response.xpath(
        "//div[@class='article']/ol[@class='grid_view']/li")
    for entry in entries:
        movie_item = TutorialItem()
        movie_item["number"] = entry.xpath(
            ".//div[@class='item']/div[@class='pic']/em/text()"
        ).extract_first()
        movie_item["title"] = entry.xpath(
            ".//div[@class='info']/div[@class='hd']/a/span[@class='title'][1]/text()"
        ).extract_first()
        # Collapse all whitespace; as in the original, the LAST paragraph
        # fragment wins.
        for fragment in entry.xpath(
                ".//div[@class='item']/div[@class='info']/div[@class='bd']/p[1]/text()"
        ).extract():
            movie_item["desc"] = "".join(fragment.split())
        movie_item["star"] = entry.xpath(
            ".//div[@class='item']/div[@class='info']/div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()"
        ).extract_first()
        movie_item["intro"] = entry.xpath(
            ".//div[@class='item']/div[@class='info']/div[@class='bd']/p[@class='quote']/span[@class='inq']/text()"
        ).extract_first()
        movie_item["link"] = entry.xpath(
            ".//div[@class='item']/div[@class='pic']/a/@href"
        ).extract_first()
        yield movie_item
    next_link = response.xpath(
        "//div[@class='paginator']/span[@class='next']/a/@href").extract()
    if next_link:
        yield scrapy.Request("https://movie.douban.com/top250" + next_link[0],
                             callback=self.parse)
def parse(self, response):
    """Baike article page: enqueue every in-site /item/ link found in the
    raw HTML, then yield this page's title and summary paragraph."""
    # Parenthesized print works identically on Python 2 and 3 here.
    print("crwal: " + response.url)
    page_url = response.url
    # Regex over the raw response body picks up relative article links.
    for link in response.selector.re(r'href="(/item/[\w%]+)"'):
        yield Request(urlparse.urljoin(page_url, link))
    title = response.css('div .lemmaWgt-lemmaTitle-title h1::text'
                         ).extract_first('error: not found')
    summary = ''
    for fragment in response.css('div .lemma-summary div.para ::text').extract():
        summary += fragment
    tt_item = TutorialItem()
    tt_item['url'] = page_url
    tt_item['title'] = title
    tt_item['para'] = summary
    yield tt_item
def callmefordebug(self, response):
    """Find attachment download anchors and yield one file item per anchor
    for the files pipeline.

    Fixes: inside the per-anchor loop the original used ABSOLUTE xpaths
    ('//a[@class="bizDownload"]/...'), which search the whole document,
    so every iteration picked up the FIRST anchor's name and id; it also
    reused a single TutorialItem across all yields.
    """
    self.log('call me')
    anchors = response.xpath('//a[@class="bizDownload"]')
    if anchors:
        all_names = ''.join(
            response.xpath('//a[@class="bizDownload"]/text()').extract())
        self.log('---***---download file : %s***---s' % all_names)
        for anchor in anchors:
            # Relative xpaths: this anchor's own text and id.
            filen = anchor.xpath('./text()').extract()[0]
            urlid = anchor.xpath('./@id').extract()[0]
            self.log('*****download file urlid %s' % urlid)
            url = 'http://www.ccgp.gov.cn/oss/download?uuid=' + urlid
            file_item = TutorialItem()  # fresh item per link; hooks into the file pipeline
            file_item['file_urls'] = [url]
            file_item['file_name'] = filen
            self.log('*****download file Saved file %s' % filen)
            yield file_item
def parse(self, response):
    """Douban Top250: yield one item per movie, then follow "next".

    Fix: the original instantiated ONE TutorialItem before the loop and
    mutated/yielded it repeatedly — asynchronously processed items would
    all reflect the last movie. `print url` also became the parenthesized
    form, which is valid on both Python 2 and 3.
    """
    selector = Selector(response)
    Movies = selector.xpath('//div[@class="info"]')
    for eachMoive in Movies:
        item = TutorialItem()  # fresh item per movie
        title = eachMoive.xpath('div[@class="hd"]/a/span/text()').extract()
        fullTitle = ''
        for each in title:
            fullTitle += each
        movieInfo = eachMoive.xpath('div[@class="bd"]/p/text()').extract()
        # Note: 'star' keeps the raw extracted LIST, as the original did.
        star = eachMoive.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        quote = eachMoive.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item['title'] = fullTitle
        item['movieInfo'] = ';'.join(movieInfo)
        item['star'] = star
        item['quote'] = quote[0] if quote else ''
        yield item
    nextLink = selector.xpath('//span[@class="next"]/a/@href')
    if nextLink:
        url = response.urljoin(nextLink[0].extract())
        print(url)
        yield Request(url, self.parse)
def parse(self, response): item = TutorialItem() item["job_title"] = response.css( "span.listing-company-name a::text").getall() item["location"] = response.css( "span.listing-location a::text").getall() yield item
def parse(self, response):
    """Lianjia new-home listings: one item per block; paginate up to pg5.

    Fixes: the bare `except:` used only to default page_number is replaced
    by meta.get(); the inner try/excepts around extract_first() could
    never fire (extract_first returns None instead of raising), so the
    intended 'none' fallback is now an explicit None check.
    """
    # First call has no 'page_number' in meta; start the follow-ups at 2.
    page_number = response.meta.get('page_number', 2)
    try:
        house_list = response.xpath(
            "//div[@class='resblock-list-container clearfix']//ul[2]/li")
        for i_items in house_list:
            item = TutorialItem()
            item['title'] = i_items.xpath(".//div//div[1]//a/text()").extract_first()
            # Three address fragments concatenated; a missing fragment
            # (None) raises TypeError, which the outer except logs — same
            # behavior as before.
            item['address'] = (
                i_items.xpath(".//div//div[2]//span[1]/text()").extract_first()
                + i_items.xpath(".//div//div[2]//span[2]/text()").extract_first()
                + i_items.xpath(".//div//div[2]//a/text()").extract_first())
            total = i_items.xpath(
                ".//div//div[@class='resblock-price']//div[@class='second']/text()"
            ).extract_first()
            item['total'] = total if total is not None else 'none'
            item['unitprice'] = i_items.xpath(
                ".//div//div[@class='resblock-price']//div[@class='main-price']//span[1]//text()"
            ).extract_first()
            face = i_items.xpath(
                ".//div//div[@class='resblock-area']//span//text()"
            ).extract_first()
            item['buildingface'] = face if face is not None else 'none'
            yield item
        print(page_number)
        # Follow pages 2..4 (stop once page_number reaches 5).
        if page_number < 5:
            url = "https://sz.fang.lianjia.com/loupan/pg" + str(page_number)
            print(url)
            page_number = page_number + 1
            yield scrapy.Request(url=url,
                                 meta={"page_number": page_number},
                                 callback=self.parse)
    except Exception as e:
        self.logger.error("error=" + str(response.url) + ", " + str(e))