def start_requests(self):
    # this month
    thismonth = datetime.date.today().month
    # modellist
    with open('blm/' + self.dbname + '/modellist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        modellist = [row for row in reader]
    # citylist
    with open('blm/' + self.dbname + '/citylist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        citylist = [row for row in reader]
    # Python 2 integer division: step is the integer chunk size for this worker
    step = len(modellist) / self.parts + 1
    starti = self.part * step
    if self.part == self.parts - 1:
        step = len(modellist) - starti
    # urllist
    for city in citylist[2:3]:
        for model in modellist[starti:(starti + step)]:
            for year in range(int(model['min_reg_year']), int(model['max_reg_year']) + 1):
                for month in [1, 7, 12]:
                    if year == 2018:
                        month = min(thismonth, month)
                    date = str(year) + '-' + str(month)
                    usedyear = float((datetime.date(2017, 4, 15) - datetime.date(year, month, 1)).days) / 365
                    mile = round(usedyear * 5 / 3, 2)
                    if mile >= 45 or mile <= 0.1:
                        mileagelist = [0.1, 45]
                    else:
                        mileagelist = [0.1, mile, 45]
                    for mile in mileagelist:
                        url = "https://dingjia.che300.com/app/EvalResult/getPreSaleRate?callback=jQuery18303745581165454668_1491989508797" \
                              "&prov=" + str(city['provid']) + "&city=" + str(city['cityid']) + \
                              "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                              "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                        url1 = 'https://dingjia.che300.com/app/EvalResult/allProvPrices?callback=jQuery18307234983962413968_1492479620941' + \
                               "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                               "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                        if not dfcheck(self.df, url, self.tag):
                            meta = dict()
                            meta['provid'] = city['provid']
                            meta['cityid'] = city['cityid']
                            meta['salesdescid'] = model['salesdescid']
                            meta['regDate'] = date
                            meta['mile'] = str(mile)
                            yield scrapy.Request(url=url, meta={"datainfo": meta}, callback=self.parse)
                        if not dfcheck(self.df, url1, self.tag):
                            meta = dict()
                            meta['salesdescid'] = model['salesdescid']
                            meta['regDate'] = date
                            meta['mile'] = str(mile)
                            yield scrapy.Request(url=url1, meta={"datainfo": meta}, callback=self.parse_allprov)

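# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): a minimal dfcheck helper.
# Every spider in this section calls dfcheck(self.df, key, self.tag) before
# yielding a request, apparently as a "was this key already scraped?" test
# against the data collected so far. The real implementation is not shown
# here; the DataFrame layout, the 'key' column name, and the unused `tag`
# argument below are assumptions made only for illustration.
import pandas as pd

def dfcheck(df, key, tag):
    """Return True if `key` is already present in the scraped-data frame."""
    if not isinstance(df, pd.DataFrame):
        # e.g. self.df == 'none' before any data exists, as hinted by the
        # commented-out "self.df == 'none'" check in one parse method below
        return False
    if 'key' not in df.columns:  # assumed column name
        return False
    return (df['key'] == key).any()
# ---------------------------------------------------------------------------
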
def start_requests(self):
    # this month
    thismonth = datetime.date.today().month
    # modellist
    with open('blm/' + self.dbname + '/modellist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        modellist = [row for row in reader]
    # citylist
    with open('blm/' + self.dbname + '/citylist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        citylist = [row for row in reader]
    step = len(modellist) / self.parts + 1
    starti = self.part * step
    if self.part == self.parts - 1:
        step = len(modellist) - starti
    # urllist
    for city in citylist[2:3]:
        for model in modellist[starti:(starti + step)]:
            for year in range(int(model['min_reg_year']), int(model['max_reg_year']) + 1):
                # only the month range differs for the current year
                if year == 2018:
                    monthlist = range(1, datetime.datetime.now().month + 1)
                else:
                    monthlist = range(1, 13)
                for month in monthlist:
                    date = str(year) + '-' + str(month)
                    mile = 0.2
                    url = "https://dingjia.che300.com/app/EvalResult/getResidualAnalysis?prov=" \
                          + str(city['provid']) + "&city=" + str(city['cityid']) + "&series=" + str(model['familyid']) + "&model=" + \
                          str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                    if not dfcheck(self.df, url, self.tag):
                        meta = dict()
                        meta['provid'] = city['provid']
                        meta['cityid'] = city['cityid']
                        meta['salesdescid'] = model['salesdescid']
                        meta['regDate'] = date
                        meta['mile'] = str(mile)
                        yield scrapy.Request(url=url, meta={"datainfo": meta}, callback=self.parse)

def parse(self, response):
    # car_item
    x = response.xpath('//ul[@class="car-pic-form-box car-box-list clear"]/li')
    for temp in x:
        urlbase = temp.xpath("a/@href").extract_first()
        # urlbase.strip()
        urltemp = str(urlbase.strip())
        url = "http://" + urltemp[2:len(urltemp)]
        # num = urlbase.find("html") + 4
        # urlbase = urlbase[0:num]
        datasave1 = temp.extract()
        # url = response.urljoin(urlbase)
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
    # next page
    next_page = response.xpath(u'//div[@class="pages-box"]/a[contains(text(),"下一页")]/@href')
    if next_page:
        urlbase = str(next_page.extract_first())
        urlbase2 = "http://so.iautos.cn"
        url = urlbase2 + urlbase
        yield scrapy.Request(url, self.parse)

def select4_parse(self, response):
    counts = response.xpath('//h1[@class="historyRecord_title"]/text()').re('\d+')
    if counts:
        counts = int(counts[0])
        if counts <= 6000:
            for href in response.xpath('//div[@class="carsItem carItem"]'):
                urlbase = href.xpath('a/@href').extract_first()
                datasave1 = href.extract()
                url = response.urljoin(urlbase)
                if not dfcheck(self.df, url, self.tag):
                    yield scrapy.Request(url, meta={'datasave1': datasave1}, callback=self.parse_car)
            # next page
            next_page = response.xpath('//a[@class="next"]/@href').extract_first()
            if next_page:
                url = response.urljoin(next_page)
                yield scrapy.Request(url, self.select4_parse)
        else:
            for href in response.xpath(u'//dd[@click_type="sale-transmissions"]/div/ul/li/a[not(contains(text(),"不限"))]/@href'):
                url = response.urljoin(href.extract())
                yield scrapy.Request(url, self.select5_parse)

def select2_parse(self, response):
    # listing count
    counts = response.xpath('//h1[@class="historyRecord_title"]/text()').re('\d+')
    print(counts)
    if counts:
        counts = int(counts[0])
        if counts <= 6000:
            for href in response.xpath('//div[@class="carsItem carItem"]'):
                urlbase = href.xpath('a/@href').extract_first()
                datasave1 = href.extract()
                url = response.urljoin(urlbase)
                if not dfcheck(self.df, url, self.tag):
                    yield scrapy.Request(url, meta={'datasave1': datasave1}, callback=self.parse_car)
            # next page
            next_page = response.xpath('//a[@class="next"]/@href').extract_first()
            if next_page:
                url = response.urljoin(next_page)
                yield scrapy.Request(url, self.select2_parse)
        else:
            for href in response.xpath('//dl[@class="clearfix"]/dd/a/@href'):
                url = response.urljoin(href.extract())
                yield scrapy.Request(url, self.select4_parse)

def select4_parse(self, response):
    counts = response.xpath('//strong[@class="fc-org"]/text()').extract_first()
    if counts:
        counts = float(counts[0:-1])
        if counts <= 3000:
            for href in response.xpath('//dl[contains(@class,"list-pic clearfix cursor_pointer")]'):
                urlbase = href.xpath('dt/div[2]/div/a/@href').extract_first()
                datasave1 = href.extract()
                url = response.urljoin(urlbase)
                if not dfcheck(self.df, url, self.tag):
                    yield scrapy.Request(url, meta={'datasave1': datasave1}, callback=self.parse_car)
            # next page
            next_page = response.xpath('//a[@class="next"]/@href')
            if next_page:
                url = response.urljoin(next_page.extract_first())
                yield scrapy.Request(url, self.select4_parse)
        else:
            for href in response.xpath('//dd[@class="ddmyprice"]/a[not(@class="cur")]/@href'):
                url = response.urljoin(href.extract())
                yield scrapy.Request(url, self.select5_parse)

def select3_parse(self, response):
    counts = response.xpath('//a[@name="view_v"]/h4/text()').re('\d+')[0] \
        if response.xpath('//a[@name="view_v"]/h4/text()').re('\d+') else "0"
    if counts:
        counts = float(counts)
        if counts <= 4000:
            temp = response.xpath('//li[@class="con caritem conHeight"]')
            for x in temp:
                urlbase = x.xpath('//a[@class="aimg"]/@href').extract_first()
                url = response.urljoin(urlbase)
                datasave1 = x.extract()
                # urlbase = str(x.xpath('//a[@class="aimg"]/@href').extract_first())
                # url = "http://www.xin.com" + urlbase
                if not dfcheck(self.df, url, self.tag):
                    yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
            next_page = response.xpath(u'//a[contains(text(),"下一页")]')
            if next_page:
                urlbase = next_page.xpath('@href').extract_first()
                # urlbase = str(next_page.extract()[0])
                url = response.urljoin(urlbase)
                # url = "http://www.xin.com" + urlbase
                yield scrapy.Request(url, self.select3_parse)
        else:
            for href in response.xpath('//div[@class="select-menu"]/div[@class="menu menu6"]/dd/a'):
                urlbase = href.xpath('@href').extract_first()
                # urlbase = str(href.extract())
                url = response.urljoin(urlbase)
                # url = "http://www.xin.com" + urlbase
                yield scrapy.Request(url, self.select4_parse)

def parse(self, response):
    for i in range(150000, 250000):
        # for i in range(self.start, self.start + self.num):
        url = "http://www.akd.cn/car/" + str(i) + "/"
        if not dfcheck(self.df, str(i), self.tag):
            yield scrapy.Request(url, callback=self.parse_car)

def select3_parse(self,response): print("select3") counts = response.xpath('//div[@class="totalCarsNum"]/span/text()').re('\d+')[0] listok = True if counts: counts = float(counts) if counts > 2400: listok = False if listok: x = response.xpath('//div[@class="carShow"]/div') for temp in x: urlbase = temp.xpath('a[@class="car-link"]/@href').extract_first() url = response.urljoin(urlbase) datasave1 = temp.extract() if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car) next_page = response.xpath('//span[contains(text(),">")]/../@href').extract_first() if next_page: url = response.urljoin(next_page) yield scrapy.Request(url, self.select3_parse) else: x = response.xpath('//ul[@class="sc-option-list"]/li') for temp in x: urlbase = temp.xpath('a/@href').extract_first() url = response.urljoin(urlbase) yield scrapy.Request(url, self.select4_parse)
def parse(self, response):
    print(123123213)
    counts = response.xpath('//div[@class="totalCarsNum"]/span/text()').re('\d+')[0]
    listok = True
    if counts:
        counts = float(counts)
        if counts > 2400:
            listok = False
    if listok:
        x = response.xpath('//div[@class="carShow"]/div')
        for temp in x:
            urlbase = temp.xpath('a[@class="car-link"]/@href').extract_first()
            url = response.urljoin(urlbase)
            datasave1 = temp.extract()
            if not dfcheck(self.df, url, self.tag):
                yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
        next_page = response.xpath('//span[contains(text(),">")]/../@href').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, self.parse)
    else:
        price = response.xpath('//div[@class="price-sc clearfix lab_block"]/a')
        for temp in price[1:9]:
            urlbase = temp.xpath('@href').extract_first()
            url = response.urljoin(urlbase)
            yield scrapy.Request(url, self.select2_parse)

def select2_parse(self, response):
    # logging.log(msg="do this step2", level=logging.INFO)
    # print(response.body)
    counts = response.xpath('//div[@class="info_funcs_right"]/span/i/text()')
    listok = True
    if counts:
        counts = float(counts.extract_first())
        if counts > 3500:
            listok = False
    if listok:
        for href in response.xpath('//ul[@class="car_list ac_container"]/li/div[@class="col col2"]'):
            url = str(href.xpath('a/@href').extract_first())
            datasave1 = href.extract()
            # url = response.urljoin(urlbase)
            if not dfcheck(self.df, url, self.tag):
                yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
        # next page
        next_page = response.xpath('//div[@class="pager"]/a[@class="next"]/@href')
        if next_page:
            url = response.urljoin(next_page.extract_first())
            yield scrapy.Request(url, self.select2_parse)
    else:
        for href in response.xpath(u'//dt[contains(text(),"类型:")]/../dd/a/@href')[1:14]:
            url = str(href.extract())
            yield scrapy.Request(url, self.select3_parse)

def list_parse(self, response):
    print(response)
    for href in response.xpath('//li[@class="span6 list-item car-item"]'):
        datasave1 = href.extract()
        urlbase = href.xpath('a/@href').extract_first()
        url = response.urljoin(urlbase)
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url, cookies=self.cookies_for_request, headers=self.headers,
                                 meta={'datasave1': datasave1}, callback=self.parse_car)
    next_page = response.xpath('//a[@rrc-event-name="switchright"]/@href').extract_first()
    if not next_page:
        time.sleep(0.5)
        try:
            page = int(response.xpath('//li[@class="active"]/a[@href="javascript:void(0);"]/text()').extract_first()) + 1
        except Exception as e:
            print(e)
            # retry the same page; no explicit callback, so Scrapy routes it to self.parse
            yield scrapy.Request(url=response.url, dont_filter=True)
            return
        location = response.url.find("ershouche") + 9
        newpage = response.url[0:location] + "/p" + str(page)
        print(newpage)
        url = response.urljoin(newpage)
    else:
        url = response.urljoin(next_page)
    yield scrapy.Request(url, self.list_parse, cookies=self.cookies_for_request, headers=self.headers)

def start_requests(self):
    cars = []
    for i in range(40000, self.carnum):
        urlbase = 'http://www.kx.cn/chejia/' + str(i)
        if not dfcheck(self.df, urlbase, self.tag):
            car = scrapy.Request(urlbase, meta={'datasave1': 'zero'})
            cars.append(car)
    return cars

def start_requests(self):
    # this month
    thismonth = datetime.date.today().month
    # modellist
    with open('blm/' + self.dbname + '/modellist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        modellist = [row for row in reader]
    # citylist
    with open('blm/' + self.dbname + '/citylist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        citylist = [row for row in reader]
    step = len(modellist) / self.parts + 1
    starti = self.part * step
    if self.part == self.parts - 1:
        step = len(modellist) - starti
    # urllist
    # for model in modellist:
    #     if model["salesdescid"] == "1127558":
    #         index = modellist.index(model)
    for city in citylist[2:3]:
        for model in modellist[29682:29683]:
            # if model["salesdescid"] == "1127558":
            for year in range(1999, 2019):
                monthlist = range(1, 13)
                for month in monthlist:
                    date = str(year) + '-' + str(month)
                    # Python 2: range() returns a list, so the concatenation below works
                    for mile in range(1, 121) + [0.2]:
                        if mile % 2 == 0:
                            mile = mile / 2
                        else:
                            mile = float(mile) / 2.0
                        # url = "https://dingjia.che300.com/app/EvalResult/getPreSaleRate?callback=jQuery183006719160584858153_1534397702297" \
                        #       "&prov=" + str(city['provid']) + "&city=" + str(city['cityid']) + \
                        #       "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                        #       "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                        url1 = 'https://dingjia.che300.com/app/EvalResult/allProvPrices?callback=jQuery183006719160584858153_1534397702298' + \
                               "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                               "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                        # if not dfcheck(self.df, url + "-" + update_code, self.tag):
                        #     meta = dict()
                        #     meta['provid'] = city['provid']
                        #     meta['cityid'] = city['cityid']
                        #     meta['salesdescid'] = model['salesdescid']
                        #     meta['regDate'] = date
                        #     meta['mile'] = str(mile)
                        #     yield scrapy.Request(url=url, meta={"datainfo": meta}, callback=self.parse)
                        # update_code is assumed to be defined elsewhere (e.g. a batch identifier on the spider or module)
                        if not dfcheck(self.df, url1 + "-" + update_code, self.tag):
                            meta = dict()
                            meta['salesdescid'] = model['salesdescid']
                            meta['regDate'] = date
                            meta['mile'] = str(mile)
                            yield scrapy.Request(url=url1, meta={"datainfo": meta}, callback=self.parse_allprov)

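# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): unwrapping the JSONP reply.
# The che300 endpoints above are requested with a callback=jQuery18... query
# parameter, so the body comes back as JSONP, e.g. "jQuery18...({...});",
# rather than bare JSON. The real parse / parse_allprov callbacks are not
# shown in this section; this helper only illustrates one plausible way to
# recover the JSON payload. The regex and error handling are assumptions.
import json
import re

def jsonp_to_dict(body):
    """Extract the JSON object from a JSONP body such as 'cb({...});'."""
    match = re.search(r'\((\{.*\})\)\s*;?\s*$', body, re.DOTALL)
    if match is None:
        return None
    return json.loads(match.group(1))
# ---------------------------------------------------------------------------
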
def parse(self, response):
    # self.start = 0 if self.df == 'none' else len(self.df) - 1
    # self.counts = self.start
    # global carnum
    # self.size = min(self.carnum - self.counts, self.size)
    for i in range(self.counts, self.counts + self.size + 1):
        url = "http://www.chemao.com/show" + str(i) + ".html"
        if not dfcheck(self.df, str(i), self.tag):
            yield scrapy.Request(url, callback=self.parse_car)

def parse(self, response): for href in response.xpath('//ul[@class="carList"]/li'): urlbase = href.xpath("a/@href").extract_first() url = response.urljoin(urlbase) datasave1 = href.extract() if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url,meta={"datasave1":datasave1},callback= self.parse_car) next_page = response.xpath(u'//a[contains(text(),"下一页")]/@href') if next_page: url_next = response.urljoin(next_page.extract_first()) yield scrapy.Request(url_next, self.parse) else: # as list for id in range(1, 290000): urlbase = 'http://www.carking001.com/ershouche/detail/' + str(id) + '.html' url = response.urljoin(urlbase) if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url,meta={"datasave1":"zero"} ,callback=self.parse_car)
def start_requests(self):
    # this month
    thismonth = datetime.date.today().month
    # modellist
    with open('blm/' + self.dbname + '/che168modellist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        modellist = [row for row in reader]
    # citylist
    with open('blm/' + self.dbname + '/che168citylist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        citylist = [row for row in reader]
    # valuelist
    with open('blm/' + self.dbname + '/che168valuelist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        valuelist = [row for row in reader]
    step = len(modellist) / self.parts + 1
    starti = self.part * step
    if self.part == self.parts - 1:
        step = len(modellist) - starti
    # urllist
    for city in citylist[8:9]:
        for model in modellist[starti:(starti + step)]:
            # Python 2: range() returns a list, so the extra years can be appended
            for year in range(int(model['min_reg_year']), int(model['max_reg_year']) + 1) + [1995, 2017]:
                if year == 2017 and year != int(model['max_reg_year']):
                    monthlist = [12]
                elif year == 2017 and year == int(model['max_reg_year']):
                    monthlist = [1, thismonth, 12]
                elif year == 1995:
                    monthlist = [1]
                else:
                    monthlist = [1, 5, 6, 12]
                for month in monthlist:
                    date = str(year) + '/' + str(month) + '/01'
                    mile = 1
                    # mileagelist = [1]
                    # for mile in mileagelist:
                    # Valuation scenarios exposed by the page's GetPinGuData calls:
                    #   trade-in at a 4S dealership: GetPinGuData('2.03v', 'PingGuCallBack2', 'uahm10033')
                    #   sell to an individual:       GetPinGuData('2.09v', 'PingGuCallBack3', 'uahm10034')
                    #   sell to a dealer:            GetPinGuData('2.04v', 'PingGuCallBack1', 'uahm10035')
                    #   certified car:               GetPinGuData('2.07v', 'PingGuCallBack1', 'uahm10036')
                    #   dealer car:                  GetPinGuData('2.08v', 'PingGuCallBack2', 'uahm10037')
                    #   private car:                 GetPinGuData('2.09v', 'PingGuCallBack3', 'uahm10038')
                    for value in valuelist:
                        url = 'https://cacheapi.che168.com/Assess/UsedCarAssess.ashx?_appid=m.m&_sign=&_encoding=gb2312&pid=' \
                              + str(city['provid']) + "&cid=" + str(city['cityid']) + "&mileage=" + str(mile) + \
                              "&firstregtime=" + date + "&specid=" + str(model['autohomeid']) + "&_appversion=" + \
                              value['_appversion'] + "&mark=" + value['mark'] + "&_callback=" + value['_callback']
                        if not dfcheck(self.df, url, self.tag):
                            meta = dict()
                            meta['provid'] = city['provid']
                            meta['cityid'] = city['cityid']
                            meta['autohomeid'] = model['autohomeid']
                            meta['regDate'] = date
                            meta['milage'] = str(mile)
                            meta['type'] = value['type']
                            yield scrapy.Request(url=url, meta={"datainfo": meta}, callback=self.parse)

def start_requests(self):
    with open('blm/' + self.dbname + '/modellist.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        modellist = [row for row in reader]
    for model in modellist:
        i = model['salesdescid']
        url = 'https://dingjia.che300.com/app/CarDetail/getModelConfigure/' + str(i)
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url=url, meta={"datainfo": {"salesdescid": i}}, callback=self.parse)

def parse(self, response):
    datacheck = len(response.xpath("//html").extract_first())
    if datacheck > 20000:
        # as list
        for href in response.xpath('//li[@class="item"]'):
            urlbase = href.xpath("a/@href").extract_first()
            datasave1 = href.extract()
            url = response.urljoin(urlbase)
            if not dfcheck(self.df, url, self.tag):
                yield scrapy.Request(url, meta={'datasave1': datasave1}, callback=self.parse_car)

def parse(self, response): # car_item for href in response.xpath('//ul[@class="carlist-content"]/li'): datasave1 = href.extract() urlbase = href.xpath('a/@href').extract_first() url = response.urljoin(urlbase) if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url,meta={"datasave1":datasave1},callback= self.parse_car) # next page next_page=response.xpath(u'//a[contains(text(),"下一页")]/@href').extract_first() if next_page: url=response.urljoin(next_page) yield scrapy.Request(url, self.parse)
def parse(self, response):
    # print(response.body.decode('gbk'))
    for href in response.xpath('//div[@class="search_car_lb"]/dl'):
        urlbase = href.xpath('dt/a/@href').extract_first()
        datasave1 = href.extract()
        url = response.urljoin(urlbase)
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
    next_page = response.xpath(u'//a[contains(text()," >> ")]')
    if next_page:
        nexturl = next_page.xpath('@href').extract_first()
        url = response.urljoin(nexturl)
        # note: reuses datasave1 from the last list item; raises NameError if the list above was empty
        yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse)

def parse_list(self, response): # car_item for href in response.xpath('//div[@class="Datu_cars"]/div'): urlbase = href.xpath('div/a/@href').extract_first() datasave1 = href.extract() url = response.urljoin(urlbase) if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url,meta={"datasave1":datasave1},callback= self.parse_car) # next page next_page = response.xpath('//a[@class="num"]/@href').extract_first() if next_page: url = response.urljoin(next_page) yield scrapy.Request(url, self.parse_list)
def select5_parse(self,response): print("select5") x = response.xpath('//div[@class="carShow"]/div') for temp in x: urlbase = temp.xpath('a[@class="car-link"]/@href').extract_first() url = response.urljoin(urlbase) datasave1 = temp.extract() if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car) next_page = response.xpath('//span[contains(text(),">")]/../@href').extract_first() if next_page: url = response.urljoin(next_page) yield scrapy.Request(url, self.select5_parse)
def select4_parse(self, response):
    x = response.xpath('//ul[@class="list-ad-items"]/li')
    for temp in x:
        datasave1 = temp.extract()
        url = str(temp.xpath('a/@href').extract_first())
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
    next_page = response.xpath(u'//a[contains(text(),"下一页")]/@href')
    if next_page:
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url, self.select4_parse)

def parse_list(self, response): for href in response.xpath('//div[@class="car_filter_list"]/ul/li'): urlbase = href.xpath('@data-jslink').extract_first() datasave1 = href.extract() url = response.urljoin(urlbase) if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car) # next page next_page = response.xpath('//a[@data-eqselog="/list@etype=click@page=next"]/@href') if next_page: url = response.urljoin(next_page.extract_first()) yield scrapy.Request(url, self.parse_list)
def select8_parse(self, response):
    for href in response.xpath('//div[@class="carsItem carItem"]'):
        urlbase = href.xpath('a/@href').extract_first()
        datasave1 = href.extract()
        url = response.urljoin(urlbase)
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url, meta={'datasave1': datasave1}, callback=self.parse_car)
    # next page
    next_page = response.xpath('//a[@class="next"]/@href').extract_first()
    if next_page:
        url = response.urljoin(next_page)
        yield scrapy.Request(url, self.select8_parse)

def parse_list(self, response): # car_item for href in response.xpath('//div[contains(@class,"cheyuan")]/ul/li'): urlbase = href.xpath("span/a/@href").extract_first() datasave1 = href.extract() url = response.urljoin(urlbase) if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car) # next page next_page = response.xpath(u'//a[contains(text(),"下一页")]/@href') if next_page: url = response.urljoin(next_page.extract_first()) yield scrapy.Request(url, self.parse_list)
def select9_parse(self, response):
    for href in response.xpath('//dl[contains(@class,"list-pic clearfix cursor_pointer")]'):
        urlbase = href.xpath('dt/div[2]/div/a/@href').extract_first()
        datasave1 = href.extract()
        url = response.urljoin(urlbase)
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url, meta={'datasave1': datasave1}, callback=self.parse_car)
    # next page
    next_page = response.xpath('//a[@class="next"]/@href')
    if next_page:
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url, self.select9_parse)

def parse_list(self, response):
    # if len(response.xpath('//html').extract_first()) >= 70000:
    for href in response.xpath('//div[contains(@class,"list-items pt15 list-duibi")]'):
        urlbase = href.xpath('div[1]/span/a/@href').extract_first()
        datasave1 = href.extract()
        url = response.urljoin(urlbase)
        if not dfcheck(self.df, url, self.tag):
            yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
    nextpage = response.xpath('//a[@class="arrow-rh fl"]/@href')
    if nextpage:
        url = response.urljoin(nextpage.extract_first())
        yield scrapy.Request(url, callback=self.parse_list)

def parse(self, response): # car_item for href in response.xpath('//div[@id="container_base"]/div'): url = href.xpath('./div[@class="item_main clearfix"]/h2/a/@href' ).extract_first() # urlbase = href.xpath("div[1]/a/@href").extract_first() datasave1 = href.extract() # url = response.urljoin(urlbase) if not (dfcheck(self.df, url, self.tag)): yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car) # next page next_page = response.xpath('//a[@class="pages-next"]/@href') if next_page: url = response.urljoin(next_page.extract_first()) yield scrapy.Request(url, callback=self.parse, dont_filter=True)