def parse(self, response):
    """Parse a low-fare JSON response and yield one item per direct flight.

    The same POST request is re-issued (and ``self.isOK`` set to ``False``)
    when the body is not valid JSON, when it carries an ``errors`` list that
    is not a station-code rejection, or when it carries a top-level
    ``message`` / ``statusCode``.

    When the payload has no ``currencyCode`` the leading days that have no
    fares are queued on ``self.task`` and a new request is issued for the
    remaining date window.  Otherwise every single-leg fare is emitted as a
    ``WowSpiderItem``; fares whose route has no ``ADMIN_FEES`` entry in the
    response currency are queued on ``self.task`` instead.

    :param response: Scrapy response whose ``meta['meta_data']`` holds the
        request ``payload`` plus ``invalid``, ``begin_dt``, ``add_day`` and
        ``duration``.
    :returns: generator of ``scrapy.Request`` and ``WowSpiderItem`` objects.
    """
    # Error messages meaning the airport pair itself is rejected, so a retry
    # cannot succeed.
    invalid_route_errors = (
        'nsk-server:InvalidStationCode',
        'nsk-server:AuthorizationStationCategoryNotAllowed',
    )
    meta_data = response.meta.get('meta_data')

    def retry():
        # Build a request identical to the one that produced this response.
        return scrapy.Request(self.start_urls[0],
                              method='POST',
                              headers=self.custom_settings.get('HEADERS'),
                              body=meta_data.get('payload'),
                              callback=self.parse,
                              meta={'meta_data': meta_data},
                              errback=self.errback)

    self.isOK = True
    try:
        data = json.loads(response.body)
    except (TypeError, ValueError):
        self.isOK = False
        yield retry()
        return

    errors = data.get('errors')
    if errors:
        self.isOK = False
        # The original `== 'a' or 'b'` test was always true, which made the
        # retry below unreachable; a membership test is what was intended.
        if errors[0].get('message') in invalid_route_errors:
            print('Invalid airfield pair')
            return
        print('get data error')
        yield retry()
        return

    if data.get('message') or data.get('statusCode'):
        print(data.get('message'))
        self.isOK = False
        yield retry()
        return

    currency = data.get('data').get('currencyCode')
    if not currency:
        # Count the consecutive leading days that have no fares at all.
        num_day = 0
        for market in data.get('data').get('lowFareDateMarkets'):
            if market.get('lowFares'):
                break
            num_day += 1

        begin_dt = meta_data.get('begin_dt')
        add_day = meta_data.get('add_day') + num_day
        duration = meta_data.get('duration') - num_day
        invalid = meta_data.get('invalid')
        first_day = datetime.strptime(invalid.get('date'), '%Y%m%d')
        for offset in range(num_day):
            day = first_day + timedelta(days=offset)
            # Queue a copy per day: appending the one shared dict would leave
            # every queued entry showing whatever date was written last.
            self.task.append(dict(invalid, date=day.strftime('%Y%m%d')))

        if duration <= 0:
            print('No flight on current date')
            return

        # NOTE(review): time_add_5j is defined elsewhere; it is used here as
        # returning the (begin, end) date strings of the next window.
        begin_dt, end_dt = pubUtil.time_add_5j(begin_dt, add_day, duration)
        payload = json.loads(meta_data.get('payload'))
        payload['Criteria'][0]['BeginDate'] = "%sT00:00:00" % begin_dt
        payload['Criteria'][0]['EndDate'] = "%sT00:00:00" % end_dt
        payload = json.dumps(payload)
        invalid['date'] = begin_dt.replace('-', '')
        next_meta = dict(
            invalid=invalid,
            payload=payload,
            begin_dt=begin_dt,
            add_day=meta_data.get('add_day'),
            duration=duration
        )
        print('No flight today,to requests new time:%s,%s' % (begin_dt, end_dt))
        yield scrapy.Request(self.start_urls[0],
                             method='POST',
                             headers=self.custom_settings.get('HEADERS'),
                             body=payload,
                             callback=self.parse,
                             meta={'meta_data': next_meta},
                             errback=self.errback)
        return

    # One market per day in the requested window.
    for market in data.get('data').get('lowFareDateMarkets'):
        # Flights offered on that day.
        for low_fare in market.get('lowFares'):
            # Skip connecting itineraries.
            legs = low_fare.get('legs')
            if len(legs) > 1:
                continue
            leg = legs[0]
            carrier = leg.get('carrierCode')
            flightNumber = '%s%s' % (carrier, leg.get('flightNumber'))
            deptime = time.strptime(leg.get('departureTime'), '%Y-%m-%dT%H:%M:%S')
            depTime = time.mktime(deptime)
            arrtime = time.strptime(leg.get('arrivalTime'), '%Y-%m-%dT%H:%M:%S')
            arrTime = time.mktime(arrtime)
            depAirport = leg.get('origin')
            arrAirport = leg.get('destination')

            admin_tax = self.custom_settings.get('ADMIN_FEES').get(
                '%s%s' % (depAirport, arrAirport))
            # Check for a missing entry first; the original dereferenced
            # admin_tax before testing it and crashed on unknown routes.
            if not admin_tax or admin_tax.get('currency') != currency:
                self.task.append(dict(meta_data.get('invalid'),
                                      date=time.strftime('%Y%m%d', deptime)))
                continue

            fares = low_fare.get('passengers').get('ADT')
            netFare = fares.get('fareAmount')
            adultTax = fares.get('taxesAndFeesAmount') + float(admin_tax.get('tax'))
            adultPrice = netFare + adultTax
            maxSeats = low_fare.get('availableCount')
            cabin = low_fare.get('bookingClasses')[0]

            item = WowSpiderItem()
            item['flightNumber'] = flightNumber
            item['depTime'] = depTime
            item['arrTime'] = arrTime
            item['fromCity'] = self.portCitys.get(depAirport, depAirport)
            item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
            item['depAirport'] = depAirport
            item['arrAirport'] = arrAirport
            item['currency'] = currency
            item['adultPrice'] = adultPrice
            item['adultTax'] = adultTax
            item['netFare'] = netFare
            item['maxSeats'] = maxSeats
            item['cabin'] = cabin
            item['carrier'] = carrier
            item['isChange'] = 1
            item['segments'] = '[]'
            item['getTime'] = time.time()
            yield item
def parse(self, response):
    """Turn an ``AirAvailabilityData`` JSON response into flight items.

    An empty body ends the callback immediately.  ``AirAvailabilityData`` is
    normally iterated directly; when it is a dict only its ``'1'`` entry is
    used.  Entries with more than one ``flight`` segment are skipped.  The
    per-seat price is ``prix`` divided by the ``SEAT`` setting minus 76, and
    two bundle prices derived from it are stored in ``segments``.

    :param response: Scrapy response with a JSON text body.
    :returns: generator of ``WowSpiderItem`` objects.
    """
    if not response.text:
        print("not flight")
        return

    payload = json.loads(response.text)
    days_data = payload.get('AirAvailabilityData')
    # Special-case result: a dict instead of a list; only key '1' is read.
    if type(days_data) is dict:
        days_data = [payload.get('AirAvailabilityData').get('1')]

    for day_data in days_data:
        # More than one segment means a connecting itinerary.
        legs = day_data.get('flight')
        if len(legs) > 1:
            print("is change")
            continue

        leg = legs[0]
        carrier = leg.get('Carrier')
        flightNumber = carrier + leg.get('Flight')
        depTime = time.mktime(time.strptime(leg.get('Depart'), '%Y-%m-%dT%H:%M:00'))
        arrTime = time.mktime(time.strptime(leg.get('Arrivee'), '%Y-%m-%dT%H:%M:00'))
        depAirport = leg.get('From')
        arrAirport = leg.get('To')
        maxSeats = int(leg.get('Stock'))
        adultPrice = day_data.get('prix') / self.custom_settings.get("SEAT") - 76
        currency = day_data.get('deviseGet')
        adultTax = 0
        netFare = adultPrice - adultTax
        getTime = time.time()

        # Bundle prices; both stay 0 when the base price is 0.
        easy_price = 0
        flex_price = 0
        if adultPrice != 0:
            easy_price = adultPrice + 76
            flex_price = adultPrice + 76 + 87
        segments = [[easy_price, maxSeats], [flex_price, maxSeats]]

        item = WowSpiderItem()
        fields = (
            ('flightNumber', flightNumber),
            ('depTime', depTime),
            ('arrTime', arrTime),
            ('fromCity', self.portCitys.get(depAirport, depAirport)),
            ('toCity', self.portCitys.get(arrAirport, arrAirport)),
            ('depAirport', depAirport),
            ('arrAirport', arrAirport),
            ('currency', currency),
            ('adultPrice', adultPrice),
            ('adultTax', adultTax),
            ('netFare', netFare),
            ('maxSeats', maxSeats),
            ('cabin', 'X'),
            ('carrier', carrier),
            ('isChange', 1),
            ('segments', json.dumps(segments)),
            ('getTime', getTime),
        )
        for key, value in fields:
            item[key] = value
        yield item
def parse(self, response):
    """Parse a ``flights`` JSON response and yield one item per direct flight.

    A body that is not valid JSON triggers a re-issue of the POST request
    built from ``meta['meta_data']``.  A payload without a first ``flights``
    entry ends the callback.  Flights are skipped when the first fare's
    ``sellKey`` contains two or more ``^``-separated parts or when ``stops``
    is greater than zero.  For the remaining flights the fare with the lowest
    ``total`` supplies price, tax, base fare, fare class and ``sellKey``; the
    ``Express`` fare total is stored in ``segments``.

    :param response: Scrapy response with a JSON body.
    :returns: generator of ``scrapy.Request`` and ``WowSpiderItem`` objects.
    """
    self.isOK = True
    try:
        json_dict = json.loads(response.body)
    except (TypeError, ValueError):
        # Not JSON (the original comment mentions a 503 page): request again.
        meta_data = response.meta.get('meta_data')
        yield scrapy.Request(self.start_urls[0],
                             callback=self.parse,
                             method='POST',
                             meta={'meta_data': meta_data},
                             body=json.dumps(meta_data.get('payload')),
                             errback=self.errback)
        return

    try:
        flight_data = json_dict.get('flights')[0]
    except (AttributeError, IndexError, KeyError, TypeError):
        # Same text as the former `print 'no airport', value` statement, but
        # valid on both Python 2 and Python 3.
        print('no airport %s' % (response.meta.get('invalid'),))
        return

    arrAirport = flight_data.get('destination')
    depAirport = flight_data.get('origin')
    currency = flight_data.get('currencyCode')

    for flight in flight_data.get('flights'):
        # Connection check: the stops counter alone proved unreliable, so the
        # sell key of the first fare is inspected as well.
        sell_key = flight.get('fares')[0].get('sellKey')
        if sell_key and len(sell_key.split('^')) >= 2:
            continue
        if flight.get('stops') > 0:
            continue

        carrier = flight.get('carrierCode')
        flightNumber = '%s%s' % (carrier, flight.get('flightNumber'))
        depTime = time.mktime(time.strptime(flight.get('std'), '%Y-%m-%d %H:%M:%S'))
        arrTime = time.mktime(time.strptime(flight.get('sta'), '%Y-%m-%d %H:%M:%S'))
        maxSeats = self.custom_settings.get('SEAT')

        # Several fares may exist: keep the cheapest (a later fare wins a
        # tie) and remember the total of the fare named 'Express'.
        adultPrice, adultTax, netFare, cabin = float('inf'), 0, 0, 0
        express_total = 0
        for fare in flight.get('fares'):
            if fare.get('name') == 'Express':
                express_total = fare.get('total')
            price = fare.get('total')
            if price > adultPrice:
                continue
            adultPrice = price
            adultTax = fare.get('tax')
            netFare = fare.get('base')
            cabin = fare.get('fareClass')
            info = {'farekey': fare.get('sellKey')}
        segments = [[express_total, maxSeats]]

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = 1
        item['segments'] = json.dumps(segments)
        item['getTime'] = time.time()
        item['info'] = json.dumps(info)
        yield item
def parse(self, response):
    """Extract flight items from an HTML page of ``flight-item`` rows.

    A page titled ``Internal Server Error`` causes the POST request to be
    re-issued.  A page without flight rows, or a row without a detail link,
    queues ``meta_data['invalid']`` on ``self.task`` and stops.  Rows whose
    carrier attribute lists several comma-separated codes are skipped.  The
    arrival date rolls over to the next day when the arrival hour is smaller
    than the departure hour.

    :param response: Scrapy HTML response; ``meta['meta_data']`` provides
        ``payload``, ``invalid`` and ``flight_time``.
    :returns: generator of ``scrapy.Request`` and ``WowSpiderItem`` objects.
    """
    self.isOK = True
    page_title = response.xpath('//title/text()')[0].extract()
    if page_title == 'Internal Server Error':
        self.isOK = False
        print(page_title)
        yield scrapy.Request(self.start_urls[0],
                             method='POST',
                             headers=self.custom_settings.get('HEADERS'),
                             body=response.meta.get('meta_data').get('payload'),
                             callback=self.parse,
                             meta={'meta_data': response.meta.get('meta_data')},
                             errback=self.errback)
        return

    flight_rows = response.xpath('//div[@id="tbl-depart-flights"]/div[@class="flight-item"]')
    # No flights on that day: record the task as invalid.
    if len(flight_rows) == 0:
        self.task.append(response.meta.get('meta_data').get('invalid'))
        print('no flight')
        return

    for flight in flight_rows:
        detail_links = flight.xpath('.//*[@class="detail"]/a')
        # A row without flight details also counts as "no flight today".
        if not detail_links:
            self.task.append(response.meta.get('meta_data').get('invalid'))
            print('No flight to day')
            return

        link = detail_links[0]
        carrier = link.xpath('./@carriercode').extract()[0]
        if len(carrier.split(',')) > 1:
            print('is change')
            continue
        flightNumber = carrier + str(link.xpath('./@flightnumber').extract()[0])

        dt = response.meta.get('meta_data').get('flight_time')
        dep_clock = link.xpath('./@departuretime').extract()[0]
        dep_tupletime = time.strptime(dt + 'T' + dep_clock, '%Y-%m-%dT%H:%M')
        depTime = time.mktime(dep_tupletime)

        # Arrival on the following day when its hour precedes the departure hour.
        arr_clock = link.xpath('./@arrivaltime').extract()[0]
        if int(dep_clock.split(':')[0]) > int(arr_clock.split(':')[0]):
            arr_dt = pubUtil.time_add_num(dt, 1) + 'T' + arr_clock
        else:
            arr_dt = dt + 'T' + arr_clock
        arr_tupletime = time.strptime(arr_dt, '%Y-%m-%dT%H:%M')
        arrTime = time.mktime(arr_tupletime)

        depAirport = link.xpath('./@departure').extract()[0]
        arrAirport = link.xpath('./@arrival').extract()[0]
        aircraftType = link.xpath('./@equipmenttype').extract()[0]
        duration = dataUtil.time_standard(link.xpath('./@traveltime').extract()[0])

        price = flight.xpath('.//*[@class="flight-price"]')
        if price:
            adultPrice_str = price[1].xpath('./span[@class="fare-price"]/text()').extract()
        else:
            print('no seat')
            adultPrice_str = ''

        if adultPrice_str:
            currency_unit = price.xpath('normalize-space(./span[@class="currency"]/text())').extract()[0]
            if currency_unit == '$':
                adultPrice = float(adultPrice_str[0])
            else:
                # Other currencies use '.' for thousands and ',' for decimals.
                adultPrice = float(adultPrice_str[0].replace('.', '').replace(',', '.'))
            adultTax = 0
            netFare = adultPrice - adultTax
            currency = self.custom_settings.get('CURRENCY_CACHE').get(currency_unit, currency_unit)
            # No seat count found on the page so far; use the requested seats.
            maxSeats = self.custom_settings.get('SEAT')
        else:
            # Sold out: emit a zero-priced placeholder.
            adultPrice = 0
            currency = 'RPG'
            netFare = 0
            adultTax = 0
            maxSeats = 0
            print('no seat')

        isChange = 1
        cabin = 'X'
        # Built as in the original, although the item stores '[]' below.
        segments = dict(
            flightNumber=flightNumber,
            aircraftType=aircraftType,
            number=1,
            departureTime=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(depTime)),
            destinationTime=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(arrTime)),
            airline=carrier,
            dep=depAirport,
            dest=arrAirport,
            seats=maxSeats,
            duration=duration,
            depTerminal=''
        )
        getTime = time.time()

        item = WowSpiderItem()
        fields = (
            ('flightNumber', flightNumber),
            ('depTime', depTime),
            ('arrTime', arrTime),
            ('fromCity', self.portCitys.get(depAirport, depAirport)),
            ('toCity', self.portCitys.get(arrAirport, arrAirport)),
            ('depAirport', depAirport),
            ('arrAirport', arrAirport),
            ('currency', currency),
            ('adultPrice', adultPrice),
            ('adultTax', adultTax),
            ('netFare', netFare),
            ('maxSeats', maxSeats),
            ('cabin', cabin),
            ('carrier', carrier),
            ('isChange', isChange),
            ('segments', '[]'),
            ('getTime', getTime),
        )
        for key, value in fields:
            item[key] = value
        yield item
def parse(self, response):
    """Parse flights embedded in ``console.log([...]);`` calls of a page.

    The page is read from ``response.request.meta['response']`` (the
    original comment says a middleware puts it there).  When that text
    cannot be read, an alert mail is attempted, a random cookie from
    ``COOKIE_LIST`` replaces the one in the shared ``HEADERS`` setting and
    the POST request is re-issued.  Each ``console.log`` array with exactly
    one flight becomes a ``WowSpiderItem``; the ``E1`` fare supplies the
    headline price and the ``E2``/``E3`` fares are stored in ``segments``.

    :param response: Scrapy response; ``meta['meta_data']['data']`` is the
        POST body used for the retry.
    :returns: generator of ``scrapy.Request`` and ``WowSpiderItem`` objects.
    """
    # The fetched page is stored in the request meta (explained in the middleware).
    res = response.request.meta.get('response')
    try:
        flight_list = re.compile(r"console.log\(\[(.*?)\]\);").findall(res.text)
    except Exception:
        try:
            # The original format string had no placeholder, so the `%`
            # always raised and the mail was never sent.
            log_mail.log_mail('TRcookie:%s失效' % response.request.headers.get('Cookie'))
        except Exception:
            # Alerting is best effort; report the failure and keep going.
            traceback.print_exc()
        # Replace the cookie globally.
        cookie = random.choice(self.custom_settings.get('COOKIE_LIST'))
        self.custom_settings.get('HEADERS')['Cookie'] = 'kuhang_=%s;' % cookie
        print('之前cookie失效,使用cookie:%s进行访问' % cookie)
        meta_data = response.meta.get('meta_data')
        yield scrapy.Request(self.start_urls[0],
                             callback=self.parse,
                             headers=self.custom_settings.get('HEADERS'),
                             method='POST',
                             meta={'meta_data': meta_data},
                             body=meta_data.get('data'),
                             errback=self.errback)
        return

    if not flight_list:
        # No flights on that day.
        return

    for raw_flights in flight_list:
        flights = json.loads('[' + raw_flights + ']')
        # More than one entry means a connecting itinerary.
        if len(flights) > 1:
            continue
        flight = flights[0]
        arrAirport = flight.get('ArrivalStation')
        depAirport = flight.get('DepartureStation')
        carrier = flight.get('CarrierCode')
        flightNumber = '%s%s' % (carrier, flight.get('FlightNumber').replace(' ', ''))
        depTime = time.mktime(time.strptime(flight.get('STD'), '%Y-%m-%dT%H:%M:%S'))
        arrTime = time.mktime(time.strptime(flight.get('STA'), '%Y-%m-%dT%H:%M:%S'))

        # Several fare bundles exist; start from an all-zero price table.
        fares = flight.get('Fares')
        price_dict = {
            'E1': [0, 0],
            'E2': [0, 0],
            'E3': [0, 0],
            'J': [0, 0],
        }
        adultPrice, adultTax, netFare, cabin, maxSeats, currency = 0, 0, 0, 'X', 0, None
        for key, fare in fares.items():
            adult_Tax, net_Fare = 0, 0
            prices = fare.get('PaxFare').get('ADT').get('BookingServiceCharge')
            for price in prices:
                # 'FarePrice' is the net fare; every other charge counts as tax.
                if price.get('ChargeType') == 'FarePrice':
                    net_Fare = price.get('Amount')
                else:
                    adult_Tax = adult_Tax + price.get('Amount')
                # NOTE(review): the source indentation was lost; the `else`
                # is paired with `if currency` so the first charge seen sets
                # the currency and a later mismatch stops the scan.
                if currency:
                    if currency != price.get('CurrencyCode'):
                        break
                else:
                    currency = price.get('CurrencyCode')
            try:
                adult_Price = net_Fare + adult_Tax
            except TypeError:
                print('%s %s %s' % (net_Fare, adult_Tax, json.dumps(prices)))
                traceback.print_exc()
                print('%s %s %s' % ('6' * 66, json.dumps(fares), '6' * 66))
                return
            cabin_ = fare.get('FareBasisCode')[0]
            # NOTE(review): maxSeats ends up as the count of whichever fare
            # is iterated last, not necessarily E1 - confirm this is intended.
            maxSeats = fare.get('AvailableCount')
            price_dict[key] = [adult_Price, maxSeats]
            if key == 'E1':
                adultPrice, adultTax, netFare, cabin = adult_Price, adult_Tax, net_Fare, cabin_

        segments = [
            list(price_dict.get('E2')),
            list(price_dict.get('E3')),
        ]

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = 1
        item['segments'] = json.dumps(segments)
        item['getTime'] = time.time()
        yield item
def parse(self, response):
    """Extract flight items from the ``config`` JSON embedded in a page.

    The JSON is cut out of the page with a regular expression.  When it is
    absent the page text is printed and nothing is yielded; when it has no
    ``Availability`` node ``meta['invalid']`` is queued on ``self.task``.
    Flights with more than one segment are skipped.  For each remaining
    flight the cheapest recommendation that references it supplies price,
    tax, net fare (each divided by 3), seats left and booking class.

    :param response: Scrapy response with the page text.
    :returns: generator of ``WowSpiderItem`` objects.
    """
    self.isOK = True
    config_pattern = re.compile(r'config\s:\s([\s\S]*), pageEngine : pageEngine')
    matches = config_pattern.findall(response.text)
    if not len(matches):
        print('data not')
        print('response:' + response.text)
        return

    dict_data = json.loads(matches[0].strip('\n '))
    Availability = jsonpath(dict_data, '$..Availability')
    if not Availability:
        self.task.append(response.meta.get('invalid'))
        return

    # List of flights offered on that day.
    flight_list = jsonpath(Availability[0], '$..proposedFlightsGroup')[0]
    for flight_data in flight_list:
        # Connection check (marked in the original as subject to change).
        if len(flight_data.get('segments')) > 1:
            continue

        airline_code = jsonpath(flight_data, '$..airline')[0].get('code')
        flightNumber = airline_code + jsonpath(flight_data, '$..flightNumber')[0]

        date_format = '%b %d, %Y %I:%M:%S %p'
        deptime = time.strptime(jsonpath(flight_data, '$..beginDate')[0], date_format)
        depTime = time.mktime(deptime)
        arrtime = time.strptime(jsonpath(flight_data, '$..endDate')[0], date_format)
        arrTime = time.mktime(arrtime)

        begin_location = jsonpath(flight_data, '$..beginLocation')[0]
        end_location = jsonpath(flight_data, '$..endLocation')[0]
        fromCity = begin_location.get('cityCode')
        toCity = end_location.get('cityCode')
        depAirport = begin_location.get('locationCode')
        arrAirport = end_location.get('locationCode')
        currency = jsonpath(Availability, '$..currencyBean')[0].get('code')

        final_price, adultPrice, adultTax, netFare = sys.maxint, 0, 0, 0
        maxSeats, cabin = 0, ''
        # Find the cheapest recommendation that contains this flight.
        flight_id = flight_data.get('proposedBoundId')
        for recommendation in jsonpath(Availability, '$..recommendationList')[0]:
            price = jsonpath(recommendation, '$..boundAmount')[0]
            price_current = price.get('totalAmount')
            if price_current >= final_price:
                continue
            for flightGroup in jsonpath(recommendation, '$..flightGroupList')[0]:
                if flightGroup.get('flightId') != flight_id:
                    continue
                # NOTE(review): the divisor 3 is presumably the number of
                # passengers requested - confirm against the request builder.
                adultPrice = price.get('totalAmount') / 3
                final_price = price_current
                adultTax = price.get('tax') / 3
                netFare = price.get('amountWithoutTaxAndFee') / 3
                maxSeats = flightGroup.get('numberOfSeatsLeft')
                cabin = flightGroup.get('rbd')

        carrier = airline_code
        isChange = len(flight_data.get('segments'))
        flightTime = jsonpath(flight_data, '$..flightTime')[0]
        # NOTE(review): the second duration field is `flightTime % 60000`,
        # which can exceed two digits; the unit of flightTime is not visible
        # here, so the formula is kept unchanged - confirm it.
        segments = dict(
            flightNumber=flightNumber,
            aircraftType=jsonpath(flight_data, '$..equipment')[0].get('code'),
            number=1,
            departureTime=time.strftime('%Y-%m-%d %H:%M:%S', deptime),
            destinationTime=time.strftime('%Y-%m-%d %H:%M:%S', arrtime),
            airline=carrier,
            dep=depAirport,
            dest=arrAirport,
            seats=maxSeats,
            duration='%02d:%02d' % (flightTime / 60000, flightTime % 60000),
            depTerminal='')
        getTime = time.time()

        item = WowSpiderItem()
        fields = (
            ('flightNumber', flightNumber),
            ('depTime', depTime),
            ('arrTime', arrTime),
            ('fromCity', self.portCitys.get(depAirport, fromCity)),
            ('toCity', self.portCitys.get(arrAirport, toCity)),
            ('depAirport', depAirport),
            ('arrAirport', arrAirport),
            ('currency', currency),
            ('adultPrice', adultPrice),
            ('adultTax', adultTax),
            ('netFare', netFare),
            ('maxSeats', maxSeats),
            ('cabin', cabin),
            ('carrier', carrier),
            ('isChange', isChange),
            ('segments', segments),
            ('getTime', getTime),
        )
        for key, value in fields:
            item[key] = value
        yield item
def parse(self, response):
    """Parse a ``Route`` JSON response and yield one item per direct flight.

    A body that is not valid JSON is printed and the callback ends.  Routes
    with more than one leg are skipped.  Cabins are scanned in order and the
    first one whose ``Remain`` is not -1 supplies cabin level, fees, price
    and seat count; when every cabin reports -1 the last one is used.

    :param response: Scrapy response with a JSON body.
    :returns: generator of ``WowSpiderItem`` objects.
    """
    self.isOK = True
    try:
        json_dict = json.loads(response.body)
    except (TypeError, ValueError):
        # Not JSON: show what came back and give up on this response.
        print(response.text)
        return

    flights = json_dict.get('Route')
    if not flights:
        return

    for flight in flights:
        # More than one leg means a connecting itinerary.
        if len(flight) > 1:
            continue
        flight_data = flight[0]
        depAirport = flight_data.get('DepartureAirportCode')
        arrAirport = flight_data.get('ArrivalAirportCode')
        # The response carries no currency; CNY is assumed for now.
        currency = 'CNY'
        flightNumber = flight_data.get('No')
        carrier = flightNumber[:2]
        depTime = time.mktime(
            time.strptime(flight_data.get('DepartureTime'), '%Y-%m-%d %H:%M:%S'))
        arrTime = time.mktime(
            time.strptime(flight_data.get('ArrivalTime'), '%Y-%m-%d %H:%M:%S'))
        getTime = time.time()

        cabin, adultTax, adultPrice, maxSeats, netFare = 'X', 0, 0, 0, 0
        for cabin_data in flight_data.get('AircraftCabins'):
            cabin = cabin_data.get('CabinLevel')
            price_and_seat = cabin_data.get('AircraftCabinInfos')[0]
            adultTax = (price_and_seat.get('AirportConstructionFees')
                        + price_and_seat.get('FuelSurcharge')
                        + price_and_seat.get('OtherFees'))
            netFare = price_and_seat.get('Price')
            adultPrice = adultTax + netFare
            maxSeats = price_and_seat.get('Remain')
            # Stop at the first cabin that reports a usable seat count.
            if maxSeats != -1:
                break

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = 1
        item['segments'] = json.dumps([])
        item['getTime'] = getTime
        item['info'] = json.dumps([])
        yield item
def parse(self, response):
    """Extract flight items from an HTML fare table.

    Flight cells (``fl_date``) and total-fare cells are matched by position.
    When the page has no flight cells the task is queued as invalid and, if
    the page heading is ``Are you human?``, ``self.isCOOKIE`` is cleared and
    the POST request is re-issued.  Fares are read from the ``hp`` block and
    fall back to the ``hpp`` and ``prime`` blocks while the total is ``'0'``.
    All amounts are divided by ``meta_data['maxSeats']``.

    :param response: Scrapy HTML response; ``meta['meta_data']`` provides
        ``form``, ``invalid``, ``year`` and ``maxSeats``.
    :returns: generator of ``scrapy.Request`` and ``WowSpiderItem`` objects.
    """
    self.isCOOKIE = True
    self.isOK = True
    meta_data = response.meta.get('meta_data')

    # Split the page into flight-info cells and price cells.
    time_numbers = response.xpath('//td[contains(@class,"fl_date")]')
    prices = response.xpath('//td[contains(@class,"outward-total-fare-td")]')

    if not time_numbers:
        try:
            page = response.xpath('//h2/text()')[0].extract()
        except Exception:
            page = None
            self.log("no data", 10)
        if page == 'Are you human?':
            self.isCOOKIE = False
            yield scrapy.Request(
                self.start_urls[0],
                method='POST',
                headers=self.custom_settings.get('headers'),
                body=meta_data.get('form'),
                callback=self.parse,
                meta={'meta_data': meta_data},
                errback=self.errback)
        # NOTE(review): the source indentation was lost; the task is queued
        # as invalid on every no-data page, matching the error branch.
        self.task.append(meta_data.get('invalid'))
        return

    def amount(text):
        # Integer value of the first digit-led run, ignoring ',' separators.
        return int(re.search(r"\d.*", text).group().replace(',', ''))

    # `year` is the year of the flights, used to complete the page's dates.
    year = meta_data.get('year')
    for i, time_cell in enumerate(time_numbers):
        # Departure time, arrival time and flight number of this row.
        time_number = time_cell.xpath('./span/text()').extract()
        depTime = time.mktime(
            time.strptime(year + time_number[0], '%Y%m/%d\xa0%H:%M'))
        arrTime = time.mktime(
            time.strptime(year + time_number[1], '%Y%m/%d\xa0%H:%M'))
        flightNumber = time_number[2]
        carrier = re.search(r'\D{2}', time_number[2]).group()

        # The page shows totals for maxSeats passengers.
        maxSeats = meta_data.get('maxSeats')
        row = str(i + 1)
        try:
            price = prices[i].xpath(
                './div[@id="outward_hp_' + row + '_total_fare"]//span/text()').extract()
        except Exception:
            # The original comment attributes this to a bad IP returning a
            # wrong page.  Stop here: falling through would read an unbound
            # or stale `price`.
            self.log('Dangerous error data....', 40)
            self.isOK = False
            return

        if price[3] == '0':
            price = prices[i].xpath(
                '//div[@id="outward_hpp_' + row + '_total_fare"]//span/text()').extract()
            if price[3] == '0':
                price = prices[i].xpath(
                    '//div[@id="outward_prime_' + row + '_total_fare"]//span/text()').extract()
                if price[3] == '0':
                    self.task.append(meta_data.get('invalid'))
                    continue

        netFare = amount(price[0]) / maxSeats
        adultTax = amount(price[1]) / maxSeats
        # A 'promo' class on the fare cell marks a discounted price; the
        # total is divided by 0.7 in that case.
        promo = response.xpath(
            '//td[@id="outward_hp_' + row + '_list"]/@class').extract()[0].split(' ')
        if promo[-1] == 'promo':
            adultPrice = amount(price[3]) / maxSeats / 0.7
            cabin = 'S'
        else:
            cabin = 'X'
            adultPrice = amount(price[3]) / maxSeats

        # An empty currency cell means the page content is bogus.
        if not price[2]:
            return
        currency = self.custom_settings.get('CURRENCY_CACHE').get(price[2])
        depAirport = meta_data.get('invalid').get('depAirport')
        arrAirport = meta_data.get('invalid').get('arrAirport')

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = 1
        item['segments'] = '[]'
        item['getTime'] = time.time()
        yield item
def parse(self, response):
    """Extract flight items from an HTML ``results-list`` page.

    Results with more than one ``fare-amenities`` block are skipped as
    connections.  A result without a numeric flight number queues
    ``meta_data['invalid']`` on ``self.task``.  Times are 12-hour clock
    strings combined with ``meta_data['flight_time']``; an arrival time
    ending in ``+1`` is moved to the next day.  The first character of the
    price text is taken as the currency symbol and the rest as the amount.
    Two bundle prices are stored in ``segments``.

    :param response: Scrapy HTML response; ``meta['meta_data']`` provides
        ``invalid`` and ``flight_time``.
    :returns: generator of ``WowSpiderItem`` objects.
    """
    self.isOK = True
    bundle_price_xpath = ('normalize-space(.//div[@class="%s"]'
                          '//div[@style="font-size: 18px; margin-top: 3px;"]/text())')

    def bundle_price(result, row_class):
        # Price of one bundle row, or 0 when the cell holds no number.  The
        # leading currency symbol and any ',' thousands separators are removed.
        text = result.xpath(bundle_price_xpath % row_class).extract()[0][1:]
        text = text.replace(',', '')
        if text.replace('.', '').isnumeric():
            return float(text)
        return 0

    results = response.xpath("//div[@class='results-list']/div[@*]")
    for result in results:
        # More than one amenities block means a connecting itinerary.
        fare_amenities = result.xpath('.//div[@class="fare-amenities"]')
        if len(fare_amenities) > 1:
            continue
        # The page does not show the carrier; use the default.
        carrier = "B6"
        flight_number = str(fare_amenities.xpath(
            'normalize-space(./ul/li[1]//text()[3])').extract()[0])
        # Keep only the digits (the page decorates the number with '**').
        flight_number = re.compile(r"\d+").search(flight_number)
        if not flight_number:
            self.task.append(response.meta.get('meta_data').get('invalid'))
            print("no flight")
            continue
        flightNumber = carrier + flight_number.group()

        data_summary = result.xpath('./div[1]/ul/li')
        dep_time_airport = data_summary[0].xpath('./span/text()').extract()
        depAirport = dep_time_airport[1]
        dt = response.meta.get('meta_data').get('flight_time')
        dep_tupletime = time.strptime(dt + 'T' + dep_time_airport[0],
                                      '%Y-%m-%dT%I:%M %p')
        depTime = time.mktime(dep_tupletime)

        arr_time_airport = data_summary[2].xpath('./span/text()').extract()
        arrAirport = arr_time_airport[1]
        if arr_time_airport[0][-2:] == "+1":
            # Arrival on the following day.
            arr_dt = pubUtil.time_add_num(dt, 1) + 'T' + arr_time_airport[0][:-2]
        else:
            arr_dt = dt + 'T' + arr_time_airport[0]
        arrTime = time.mktime(time.strptime(arr_dt, '%Y-%m-%dT%I:%M %p'))

        price = data_summary[4].xpath('./span[2]/text()').extract()
        if not price:
            currency = "RPG"
            adultPrice = 0
            maxSeats = 0
        else:
            symbol = price[0][0]
            currency = self.custom_settings.get('CURRENCY_CACHE').get(symbol, symbol)
            # Strip thousands separators; float('1,234.00') would raise.
            adultPrice = float(price[0][1:].replace(',', ''))
            maxSeats = self.custom_settings.get('SEAT')
        adultTax = 0
        netFare = adultPrice - adultTax

        # Bundle prices.
        segments = [
            [bundle_price(result, 'fare-row CN ribbon'), maxSeats],
            [bundle_price(result, 'fare-row BN '), maxSeats],
        ]

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = 'X'
        item['carrier'] = carrier
        item['isChange'] = 1
        item['segments'] = json.dumps(segments)
        item['getTime'] = time.time()
        yield item
def parse(self, response):
    """Parse a JSON availability response and yield one item per direct flight.

    Flights are read from ``segments[0].flights``.  A flight with any stops
    or more than one leg is skipped.  For each remaining flight the fare
    type with the lowest ``adultFarePerPax`` is used; if no fare type has a
    readable price, the request's ``invalid`` entry is appended to
    ``self.task`` and the flight is skipped.  The same entry is appended
    when the flight list is empty.

    :param response: Scrapy response with a JSON body;
        ``response.meta['meta_data']['invalid']`` is read when needed.
    :return: generator of ``WowSpiderItem`` objects.
    """
    data = json.loads(response.body)
    try:
        flights = data.get('segments')[0].get('flights')
    except (AttributeError, TypeError, IndexError, KeyError):
        # Payload does not have the expected shape; nothing to parse.
        print('6' * 66)
        return

    meta_data = response.meta.get('meta_data')

    # No flights on this day.
    if not flights:
        print('No flight to day')
        self.task.append(meta_data.get('invalid'))
        return

    for flight in flights:
        # Skip connecting itineraries.
        legs = flight.get('legs')
        if len(flight.get('stops')) > 0 or len(legs) > 1:
            print('is change:%s' % len(flight.get('stops')))
            continue

        leg = legs[0]
        carrier = leg.get('operatingCarrier')
        flightNumber = '%s%s' % (carrier, leg.get('marketingFlightNum'))
        dep_tupletime = time.strptime(leg.get('departureDate'), '%Y-%m-%dT%H:%M:%S')
        depTime = time.mktime(dep_tupletime)
        arr_tupletime = time.strptime(leg.get('arrivalDate'), '%Y-%m-%dT%H:%M:%S')
        arrTime = time.mktime(arr_tupletime)
        depAirport = leg.get('origin')
        arrAirport = leg.get('destination')

        # Pick the cheapest fare type.  float('inf') replaces sys.maxint,
        # which does not exist on Python 3.
        best_price = float('inf')
        best_fare = {}
        for fare_data in flight.get('fareTypes'):
            try:
                candidate = float(
                    jsonpath(fare_data, '$..adultFarePerPax')[0].replace(',', ''))
            except (TypeError, IndexError, AttributeError, ValueError):
                # No price, or a price that is not a number: ignore this fare.
                continue
            if candidate >= best_price:
                continue
            best_price = candidate
            best_fare = fare_data

        if not best_fare:
            self.task.append(meta_data.get('invalid'))
            continue

        adultPrice = best_price
        currency = jsonpath(best_fare, '$..currencyCode')[0]
        adultTax = float(jsonpath(best_fare, '$..taxPerPax')[0].replace(',', ''))
        netFare = float(jsonpath(best_fare, '$..baseAdultFarePerPax')[0].replace(',', ''))
        cabin = jsonpath(best_fare, '$..fareClass')[0]
        # No seat count has been found in the response; use the requested one.
        maxSeats = self.custom_settings.get('SEAT')
        isChange = 1
        getTime = time.time()

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = isChange
        item['segments'] = '[]'
        item['getTime'] = getTime
        yield item
def parse(self, response):
    """Parse an outbound-flight HTML page and yield one item per direct flight.

    Rows that contain a ``layover`` element are skipped.  When the price
    cannot be read (for example because the flight is sold out) the item is
    still yielded with price 0, currency ``'A'`` and ``maxSeats`` 0.

    :param response: Scrapy response;
        ``response.meta['meta_data']['flight_time']`` supplies the
        ``%Y-%m-%d`` date that is combined with the ``%H:%M`` times on the
        page.
    :return: generator of ``WowSpiderItem`` objects.
    """
    flights = response.xpath(
        '//*[@data-validation-prefix="Please choose your outbound flight."]')
    if not flights:
        print('No flight to day')
        return

    # Compiled once instead of once per row.
    token_re = re.compile(r'\S+')
    digits_re = re.compile(r'\d+')
    meta_data = response.meta.get('meta_data')

    for flight in flights:
        # Skip connecting itineraries.
        if flight.xpath('.//*[@class="layover"]').extract():
            print('is change')
            continue

        raw_number = flight.xpath(
            './/*[@class="flight-information float-left"]/text()').extract()[1]
        number_parts = token_re.findall(raw_number)
        flightNumber = ''.join(number_parts)
        carrier = number_parts[0]

        dt = meta_data.get('flight_time')

        # "<HH:MM> ... <airport>": first token is the time, last the airport.
        dep_time_airport = flight.xpath(
            'normalize-space(.//*[@class="departure-time float-left"])'
        ).extract()[0].split(' ')
        dep_tupletime = time.strptime(dt + 'T' + dep_time_airport[0], '%Y-%m-%dT%H:%M')
        depTime = time.mktime(dep_tupletime)
        depAirport = dep_time_airport[-1]

        arr_time_airport = flight.xpath(
            'normalize-space(.//*[@class="arrival-time float-left"])'
        ).extract()[0].split(' ')
        # A non-empty time-offset element means arrival on the next day.
        if flight.xpath('normalize-space(.//*[@class="time-offset"])').extract()[0]:
            arr_dt = pubUtil.time_add_num(dt, 1) + 'T' + arr_time_airport[0]
        else:
            arr_dt = dt + 'T' + arr_time_airport[0]
        arr_tupletime = time.strptime(arr_dt, '%Y-%m-%dT%H:%M')
        arrTime = time.mktime(arr_tupletime)
        arrAirport = arr_time_airport[-1]

        # Price text is "<amount> ... <currency>" with '.' as the thousands
        # separator and ',' as the decimal mark.
        try:
            fare_currency = flight.xpath(
                'normalize-space(.//*[@class="book-inner"]/text())'
            ).extract()[0].split(' ')
            adultPrice = float(fare_currency[0].replace('.', '').replace(',', '.'))
            currency = fare_currency[-1]
        except (IndexError, ValueError):
            # No readable price: treat the flight as having no tickets.
            print('flight invalid')
            adultPrice = 0
            currency = 'A'
        adultTax = 0
        netFare = adultPrice - adultTax

        seat_text = flight.xpath(
            'normalize-space(.//*[@class="seats-left"])').extract()[-1]
        seat_match = digits_re.search(seat_text)
        if adultPrice == 0:
            maxSeats = 0
        elif seat_match:
            # Convert to int so maxSeats has the same type on every path.
            maxSeats = int(seat_match.group())
        else:
            # No seat count shown: plenty of seats are assumed.
            maxSeats = 9

        isChange = 1
        cabin = 'X'
        getTime = time.time()

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = isChange
        item['segments'] = '[]'
        item['getTime'] = getTime
        yield item
def parse(self, response):
    """Parse a JSON flight list and yield one item per direct flight.

    The payload is expected under the top-level ``data`` key.  Empty
    entries in ``data['flights']`` cause ``response.meta['invalid']`` to be
    appended to ``self.task``.  Flights with more than one segment are
    skipped.  Prices come from the first fare of the first cabin.

    :param response: Scrapy response whose text is JSON.
    :return: generator of ``WowSpiderItem`` objects.
    """
    data = json.loads(response.text).get('data')
    try:
        currency = data.get('currency')
    except AttributeError:
        # 'data' is missing (None) or not a mapping: log and give up.
        print(response.text)
        traceback.print_exc()
        return

    # A missing 'flights' key is treated like an empty list.
    for flight in data.get('flights') or []:
        # An empty entry means no flight that day: mark the task invalid.
        # NOTE(review): 'invalid' is read from response.meta directly here,
        # not from meta['meta_data'] - confirm against the request builder.
        if not flight:
            self.task.append(response.meta.get('invalid'))
            continue

        # Skip connecting itineraries.
        flight_segments = flight.get('segments')
        if len(flight_segments) > 1:
            continue

        flight_segment = flight_segments[0]
        carrier = flight_segment.get('airline').get('code')
        flightNumber = flight_segment.get('flightCode')

        # The last six characters of dateTime are dropped before parsing.
        departure = flight_segment.get('departure')
        deptime_tuple = time.strptime(
            departure.get('dateTime')[:-6], '%Y-%m-%dT%H:%M')
        depTime = time.mktime(deptime_tuple)
        arrival = flight_segment.get('arrival')
        arrtime_tuple = time.strptime(
            arrival.get('dateTime')[:-6], '%Y-%m-%dT%H:%M')
        arrTime = time.mktime(arrtime_tuple)
        depAirport = departure.get('airportCode')
        arrAirport = arrival.get('airportCode')

        # TODO: pick the cheapest fare from the list; the first is used for now.
        first_fare = flight.get('cabins')[0].get('fares')[0]
        cabin = first_fare.get('code')
        adult_fare = first_fare.get('price').get('adult')
        adultPrice = adult_fare.get('total')
        adultTax = adult_fare.get('taxAndFees')
        netFare = adult_fare.get('amountWithoutTax')
        isChange = 1
        getTime = time.time()
        maxSeats = self.custom_settings.get('SEAT')

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = isChange
        item['segments'] = '123:123:1223'
        item['getTime'] = getTime
        yield item
def parse(self, response):
    """Parse a JSON schedule response and yield one item per direct journey.

    Response codes are handled first:

    * ``"-1"`` / ``"100"`` with the message ``"Session Token authentication
      failure."`` clears ``self.isToken`` and re-issues the request; any
      other message is reported as an invalid station and parsing stops.
    * ``"12"`` clears ``self.isOK`` and re-issues the request.

    When ``SchedulesIj`` is empty the request's ``invalid`` entry is
    appended to ``self.task``.  Journeys with more than one segment are
    skipped.  Flight details are taken from the ``JourneySellKey`` string.

    :param response: Scrapy response whose text is JSON;
        ``response.meta['meta_data']`` is read for ``url`` and ``invalid``.
    :return: generator of ``scrapy.Request`` (retries) or ``WowSpiderItem``.
    """
    self.isOK = True
    data = json.loads(response.text)
    self.isToken = True
    meta_data = response.meta.get('meta_data')

    def retry_request():
        # Same GET request again, carrying the original meta_data along.
        return scrapy.Request(
            meta_data.get('url'),
            callback=self.parse,
            method='GET',
            headers=self.custom_settings.get('HEADERS'),
            meta={'meta_data': meta_data},
            errback=self.errback)

    response_code = data.get('ResponseCode')
    if response_code in ["-1", "100"]:
        # Anything other than a token failure is an invalid airport.
        if data.get('Message') != "Session Token authentication failure.":
            print("invalid station:%s" % data.get('Message'))
            return
        self.isToken = False
        yield retry_request()
        return

    # No data was returned: switch proxy and retry.
    if response_code in ["12"]:
        self.isOK = False
        print("proxy invalid")
        yield retry_request()
        return

    # No flights for this request.
    schedules = data.get('SchedulesIj')
    if not schedules:
        print("not flight today")
        self.task.append(meta_data.get("invalid"))
        return

    # A missing 'JourneysIj' key is treated like an empty list.
    for journey in schedules[0].get('JourneysIj') or []:
        # Skip connecting itineraries.
        flight_segments = journey.get('SegmentsIj')
        if len(flight_segments) > 1:
            continue

        # JourneySellKey fields: carrier, number, origin, departure time,
        # destination, arrival time.
        sell_data = re.split(r'~[~|\s]*', journey.get('JourneySellKey'))
        carrier = sell_data[0]
        flightNumber = carrier + sell_data[1]
        depAirport = sell_data[2]
        arrAirport = sell_data[4]
        deptime_tuple = time.strptime(sell_data[3], '%m/%d/%Y %H:%M')
        depTime = time.mktime(deptime_tuple)
        arrtime_tuple = time.strptime(sell_data[5], '%m/%d/%Y %H:%M')
        arrTime = time.mktime(arrtime_tuple)

        # Defaults describe a flight with no fare on sale.
        adultPrice, adultTax, maxSeats, currency, cabin = 0, 0, 0, "", 'X'
        fares = flight_segments[0].get('FaresIj')
        if fares:
            first_fare = fares[0]
            adultPrice = first_fare.get('TotalFare')
            currency = jsonpath(first_fare, '$..CurrencyCode')[0]
            maxSeats = self.custom_settings.get('SEAT')
            cabin = first_fare.get('ClassOfService')
        netFare = adultPrice - adultTax
        isChange = 1
        getTime = time.time()

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = isChange
        item['segments'] = '[]'
        item['getTime'] = getTime
        yield item
def parse(self, response):
    """Parse a JSON availability response and yield one item per direct flight.

    Error handling, in order:

    * the literal rate-limit text clears ``self.isOK`` and re-issues the
      POST request;
    * a body that is not JSON is logged and parsing stops;
    * ``message.code == 400`` clears ``self.isOK`` and re-issues the request;
    * ``message.code == 500`` stops parsing.

    Options with more than one segment are skipped.  Options without fare
    classes are yielded with price 0 and ``maxSeats`` 0.

    :param response: Scrapy response;
        ``response.meta['meta_data']['body']`` is the POST body reused on
        retry.
    :return: generator of ``scrapy.Request`` (retries) or ``WowSpiderItem``.
    """
    self.isOK = True

    def retry_request():
        # Same POST request again, carrying the original meta_data along.
        return scrapy.Request(
            self.start_urls[0],
            method='POST',
            headers=self.custom_settings.get('HEADERS'),
            body=response.meta.get('meta_data').get('body'),
            callback=self.parse,
            meta={'meta_data': response.meta.get('meta_data')},
            errback=self.errback)

    if response.text == 'Service Unavailable, Rate limit reached, No Direct Access.':
        self.isOK = False
        yield retry_request()
        return

    try:
        json_dict = json.loads(response.body)
    except (ValueError, TypeError):
        print(response.text)
        print(response.status)
        traceback.print_exc()
        # Without a parsed body there is nothing to inspect; falling
        # through would hit an undefined json_dict.
        return

    # A missing or non-mapping 'message' is treated as "no error code".
    message = json_dict.get('message')
    code = message.get('code') if isinstance(message, dict) else None
    if code == 400:
        self.isOK = False
        print(response.text)
        print(response.status)
        yield retry_request()
        return
    if code == 500:
        return

    availableOptions = json_dict.get('data').get(
        'originDestinationResponse')[0].get('availableOptions')
    currency = json_dict.get('data').get('currency')
    print('6' * 66)

    # Per the original comment, one response lists seven days of flights.
    for flight in availableOptions:
        # Skip connecting itineraries.
        flight_segments = flight.get('segments')
        if len(flight_segments) > 1:
            print('is change')
            continue

        flight_segment = flight_segments[0]
        carrier = flight_segment.get('carrierCode')
        # NOTE(review): the key is spelled 'filghtDesignator' - presumably
        # the API's own spelling; confirm against a live response.
        flightNumber = flight_segment.get('filghtDesignator')
        deptime_tuple = time.strptime(
            flight_segment.get('departureDateTime').get('local'),
            '%Y-%m-%dT%H:%M:%S')
        depTime = time.mktime(deptime_tuple)
        arrtime_tuple = time.strptime(
            flight_segment.get('arrivalDateTime').get('local'),
            '%Y-%m-%dT%H:%M:%S')
        arrTime = time.mktime(arrtime_tuple)
        depAirport = flight.get('originAirportCode')
        arrAirport = flight.get('destinationAirportCode')

        availableFare = flight.get('availableFareClasses')
        if not availableFare:
            maxSeats = 0
            adultPrice = 0
        else:
            first_fare = availableFare[0]
            maxSeats = first_fare.get('availableSeats')
            # -1 is replaced with 9 seats.
            if maxSeats == -1:
                maxSeats = 9
            # NOTE(review): the 1.03 factor is unexplained in the source -
            # presumably a surcharge; confirm.
            adultPrice = first_fare.get('price') * 1.03
        adultTax = 0
        netFare = adultPrice - adultTax
        isChange = 1
        cabin = 'X'
        getTime = time.time()

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = isChange
        item['segments'] = '[]'
        item['getTime'] = getTime
        yield item
def parse(self, response):
    """Parse a flight-selection HTML page and yield one item per direct flight.

    If the page has an ``h2`` under ``#selectMainBody`` its markup is
    printed and parsing stops.  Each ``tr#market1`` row is one flight; a
    row without a ``data-ismacjourney`` attribute ends parsing with a
    "no flight" message.  Rows whose journey-info cell holds more than two
    ``div`` elements are skipped as connecting itineraries.

    :param response: Scrapy response containing the HTML page.
    :return: generator of ``WowSpiderItem`` objects.
    """
    self.isOK = True

    notice = response.xpath('//*[@id="selectMainBody"]/h2')
    if notice:
        print(notice.extract()[0])
        return

    # Raw string: "\D" inside a normal string literal is an invalid escape.
    non_digits = re.compile(r'\D+')

    for flight in response.xpath('//tr[@id="market1"]'):
        # No flight on this day.
        if not flight.xpath('./@data-ismacjourney'):
            print('No flight to day')
            return

        # Skip connecting itineraries.  The path must be relative (".//"):
        # an absolute "//" on a row selector searches the whole document,
        # so every row would see the divs of all rows.
        change = flight.xpath(
            './/td[@class="direction JourneyInfo"]/div').extract()
        if len(change) > 2:
            continue

        flightNumber = flight.xpath(
            './/div[@class="code"]/text()').extract()[0]
        # The carrier is the leading non-digit part of the flight number.
        carrier = non_digits.search(flightNumber).group()

        dep_dt_str = flight.xpath('./@data-departuretime').extract()[0]
        dep_tupletime = time.strptime(dep_dt_str, '%Y-%m-%dT%H:%M:00')
        depTime = time.mktime(dep_tupletime)
        arr_dt_str = flight.xpath('./@data-arrivaltime').extract()[0]
        arr_tupletime = time.strptime(arr_dt_str, '%Y-%m-%dT%H:%M:00')
        arrTime = time.mktime(arr_tupletime)
        depAirport = flight.xpath('./@data-departure-code').extract()[0]
        arrAirport = flight.xpath('./@data-arrival-code').extract()[0]
        isChange = 1

        # Price text is "<amount> <unit>" with ',' as the decimal mark.
        price = flight.xpath('.//span[@style="font-size: 12"]/text()'
                             ).extract()[0].split(' ')
        info = flight.xpath('.//input/@value').extract()[0]
        adultPrice = float(price[0].replace(' ', '').replace(',', '.'))
        adultTax = 0
        netFare = adultPrice - adultTax
        currency_unit = price[1]
        currency = self.custom_settings.get('CURRENCY_CACHE').get(
            currency_unit, currency_unit)
        # No seat count has been found on the page; use the requested one.
        maxSeats = self.custom_settings.get('SEAT')
        cabin = 'X'
        getTime = time.time()

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = isChange
        item['segments'] = '[]'
        item['getTime'] = getTime
        item['info'] = info
        yield item
def parse(self, response):
    """Parse a page with an embedded JSON config and yield direct flights.

    If the page has a ``<title>`` text it is treated as an anti-bot page:
    ``'Distil Validate'`` clears ``self.isJS``; ``'Distil Captcha'`` clears
    both ``self.isOK`` and ``self.isJS``.  The first script ``src`` found in
    the page is stored in ``custom_settings['JS_RANDOM_POSTFIX']`` and the
    POST request is re-issued.

    Otherwise the JSON between ``config :`` and ``, pageEngine :`` is
    loaded.  If it holds no ``Availability`` node the request's ``invalid``
    entry is appended to ``self.task``.  Flights with more than one segment
    are skipped.  For each flight the cheapest matching recommendation
    supplies price, tax, seats and booking class.

    :param response: Scrapy response;
        ``response.meta['meta_data']`` is read for ``payload`` and
        ``invalid``.
    :return: generator of ``scrapy.Request`` (retry) or ``WowSpiderItem``.
    """
    self.isOK = True
    self.isJS = True

    error = response.xpath('//title/text()')
    if error:
        # NOTE(review): nesting reconstructed from a source without
        # indentation - the retry is assumed to run for any titled page;
        # confirm against the original file.
        title = error[0].extract()
        if title == 'Distil Validate':
            self.isJS = False
            print(title)
        if title == 'Distil Captcha':
            self.isOK = False
            self.isJS = False
            print(title)
        js = re.compile('src="(.*?)" .*?><').search(response.text).group(1)
        self.custom_settings['JS_RANDOM_POSTFIX'] = js
        yield scrapy.Request(
            self.start_urls[0],
            method='POST',
            headers=self.custom_settings.get('HEADERS'),
            body=response.meta.get('meta_data').get('payload'),
            callback=self.parse,
            meta={'meta_data': response.meta.get('meta_data')},
            errback=self.errback)
        return

    config_match = re.compile(
        'config : (.*), pageEngine :', re.S).search(response.text)
    if config_match is None:
        # No embedded config: log the page and stop instead of falling
        # through to an undefined variable.
        print(response.text)
        return
    data = json.loads(config_match.group(1))

    availability = jsonpath(data, '$..Availability')
    if not availability:
        print('No flight to day')
        self.task.append(response.meta.get('meta_data').get('invalid'))
        return

    currency = availability[0].get('currencyBean').get('code')
    isChange = 1
    proposedFlightsGroup = availability[0].get('proposedBounds')[0].get(
        'proposedFlightsGroup')
    for proposedFlight in proposedFlightsGroup:
        # Skip connecting itineraries.
        segments = proposedFlight.get('segments')
        if len(segments) > 1:
            print('is change')
            continue

        segment = segments[0]
        carrier = segment.get('airline').get('code')
        flightNumber = carrier + str(segment.get('flightNumber'))
        dep_tupletime = time.strptime(
            segment.get('beginDate'), '%b %d, %Y %I:%M:00 %p')
        depTime = time.mktime(dep_tupletime)
        arr_tupletime = time.strptime(
            segment.get('endDate'), '%b %d, %Y %I:%M:00 %p')
        arrTime = time.mktime(arr_tupletime)
        depAirport = segment.get('beginLocation').get('locationCode')
        arrAirport = segment.get('endLocation').get('locationCode')

        # This id links the flight to its prices in recommendationList.
        flightId = proposedFlight.get('proposedBoundId')

        # No direct lookup is known, so scan every recommendation and keep
        # the cheapest one that contains this flight.  All fields are taken
        # from the same (cheapest) recommendation.  float('inf') replaces
        # sys.maxint, which does not exist on Python 3.
        best_price = float('inf')
        maxSeats, netFare, adultTax, adultPrice, cabin = 0, 0, 0, 0, 'x'
        for recommendation in availability[0].get('recommendationList'):
            bound = recommendation.get('bounds')[0]
            for flightGroup in bound.get('flightGroupList'):
                if flightId != flightGroup.get('flightId'):
                    continue
                boundAmount = bound.get('boundAmount')
                candidate = float(boundAmount.get('totalAmount'))
                if candidate >= best_price:
                    continue
                best_price = candidate
                adultPrice = candidate
                netFare = float(boundAmount.get('amountWithoutTax'))
                adultTax = float(boundAmount.get('tax'))
                maxSeats = flightGroup.get('numberOfSeatsLeft')
                cabin = flightGroup.get('rbd')

        getTime = time.time()

        item = WowSpiderItem()
        item['flightNumber'] = flightNumber
        item['depTime'] = depTime
        item['arrTime'] = arrTime
        item['fromCity'] = self.portCitys.get(depAirport, depAirport)
        item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
        item['depAirport'] = depAirport
        item['arrAirport'] = arrAirport
        item['currency'] = currency
        item['adultPrice'] = adultPrice
        item['adultTax'] = adultTax
        item['netFare'] = netFare
        item['maxSeats'] = maxSeats
        item['cabin'] = cabin
        item['carrier'] = carrier
        item['isChange'] = isChange
        item['segments'] = '[]'
        item['getTime'] = getTime
        yield item
        print(item)
def parse(self, response):
    """Parse a multi-day JSON availability response and yield direct flights.

    Days are read from ``departureRouteList[0].dailyFlightList``.  For a
    day without flights a copy of ``response.meta['invalid']`` with that
    day's date (``YYYYMMDD``) is appended to ``self.task``.  Connected
    flights are skipped.  Flights without a fare are yielded with price 0,
    ``maxSeats`` 0 and currency ``'TRY'``.

    :param response: Scrapy response with a JSON body.
    :return: generator of ``WowSpiderItem`` objects.
    """
    self.isOK = True
    json_dict = json.loads(response.body)
    try:
        daily_flight_list = json_dict.get('departureRouteList')[0].get(
            'dailyFlightList')
    except (AttributeError, TypeError, IndexError, KeyError):
        # Payload does not have the expected shape; nothing to parse.
        return

    # Per the original comment, one response lists three days of flights.
    # A missing 'dailyFlightList' is treated like an empty list.
    for daily_flight in daily_flight_list or []:
        flightList = daily_flight.get('flightList')
        if not flightList:
            # Copy before setting the date: appending the shared dict would
            # leave every queued entry pointing at the last date written.
            invalid = dict(response.meta.get('invalid'))
            invalid['date'] = daily_flight.get('date').replace('-', '')
            self.task.append(invalid)
            continue

        for flight in flightList:
            # Skip connecting itineraries.
            if flight.get('connectedFlight'):
                continue

            fare_seat = flight.get('fare')
            if not fare_seat:
                # No ticket on sale: everything is zeroed.
                maxSeats = 0
                adultPrice = 0
                currency = 'TRY'
                cabin = 'X'
                net_fare = 0
            else:
                # A seat label seems to appear only when few seats remain;
                # without it 9 seats are assumed.
                seat_label = fare_seat.get('remainingSeatLabel')
                if seat_label:
                    maxSeats = int(seat_label.get('values')[0])
                else:
                    maxSeats = 9
                shown_fare = fare_seat.get('shownFare')
                adultPrice = shown_fare.get('amount')
                currency_symbol = shown_fare.get('currency')
                currency = self.custom_settings.get('CURRENCY_CACHE').get(
                    currency_symbol) or currency_symbol
                cabin = fare_seat.get('reservationClass')
                net_fare = fare_seat.get('totalFareDetailList')[0].get(
                    'subDetailList')[0].get('amount').get('amount')

            # The detail amount is divided by the requested seat count.
            netFare = net_fare / self.custom_settings.get('SEAT')
            adultTax = adultPrice - netFare
            isChange = 1
            carrier = flight.get('airline')
            flightNumber = '%s%s' % (carrier, flight.get('flightNo'))
            deptime = time.strptime(flight.get('departureDateTime'),
                                    '%Y-%m-%dT%H:%M:%S')
            depTime = time.mktime(deptime)
            arrtime = time.strptime(flight.get('arrivalDateTime'),
                                    '%Y-%m-%dT%H:%M:%S')
            arrTime = time.mktime(arrtime)
            depAirport = flight.get('departureLocation').get('portCode')
            arrAirport = flight.get('arrivalLocation').get('portCode')

            # Bundle prices; entries stay 0 when the bundle is not offered.
            price_dict = {
                'ECO': 0,
                'ADVANTAGE': 0,
                'EXTRA': 0,
                'SUPER_ECO': 0
            }
            if adultPrice != 0:
                for bundle in fare_seat.get('bundleList') or []:
                    price_dict[bundle.get('bundleType')] = bundle.get(
                        'shownFare').get('amount')
            segments = [[price_dict.get('ECO'), maxSeats],
                        [price_dict.get('ADVANTAGE'), maxSeats],
                        [price_dict.get('EXTRA'), maxSeats]]
            getTime = time.time()

            item = WowSpiderItem()
            item['flightNumber'] = flightNumber
            item['depTime'] = depTime
            item['arrTime'] = arrTime
            item['fromCity'] = self.portCitys.get(depAirport, depAirport)
            item['toCity'] = self.portCitys.get(arrAirport, arrAirport)
            item['depAirport'] = depAirport
            item['arrAirport'] = arrAirport
            item['currency'] = currency
            item['adultPrice'] = adultPrice
            item['adultTax'] = adultTax
            item['netFare'] = netFare
            item['maxSeats'] = maxSeats
            item['cabin'] = cabin
            item['carrier'] = carrier
            item['isChange'] = isChange
            item['segments'] = json.dumps(segments)
            item['getTime'] = getTime
            yield item