def parse(self, response):
    """Turn a fare-availability JSON response into ``FlightsItem`` objects.

    ``response.meta`` supplies ``_from``, ``_to`` and ``_date``; they are used
    only for the "no flights" log line.  Every node matched by ``$..list.*``
    that carries a truthy ``fareRate`` yields one item priced in KRW.  When
    nothing matches, the method logs either a blocked-IP message (body
    contains ``EE590``) or a "no flights" message and yields nothing.
    """
    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    _date = meta.get('_date')

    result = json.loads(response.text)
    fares = jsonpath(result, '$..list.*')

    if not fares:
        # jsonpath() hands back a falsy value when the expression matches nothing.
        if 'EE590' in response.text:
            self.log("ip 不可用", 40)
        else:
            self.log("%s-%s:%s no flights" % (_from, _to, _date), 20)
        return

    for rec in fares:
        # Entries without a fare rate are not bookable; skip them.
        if not rec.get('fareRate'):
            continue

        # Taxes and surcharges.
        fuelAd = int(jsonpath(rec, '$..fuelAD')[0])
        taxAd = int(jsonpath(rec, '$..taxAD')[0])

        # Airports and travel date.
        depDate = jsonpath(rec, '$..depDate')[0]
        depCity = jsonpath(rec, '$..depCity')[0]
        arrCity = jsonpath(rec, '$..arrCity')[0]

        # Flight details.
        flightNumber = jsonpath(rec, '$..flightNo')[0]
        dep_stamp = '%s%s' % (depDate, jsonpath(rec, '$..depTime')[0])
        arr_stamp = '%s%s' % (depDate, jsonpath(rec, '$..arrTime')[0])
        dep_ts = int(time.mktime(time.strptime(dep_stamp, "%Y%m%d%H%M")))
        arr_ts = int(time.mktime(time.strptime(arr_stamp, "%Y%m%d%H%M")))
        if arr_ts < dep_ts:
            # Only the departure date is read from the record, so an arrival
            # clock time earlier than the departure clock time is taken to
            # mean the flight lands on the following day.
            arr_ts += 24 * 60 * 60

        bookingClass = jsonpath(rec, '$..bookingClass')[0]
        fareNet = int(jsonpath(rec, '$..fareNet')[0])
        availSeat = jsonpath(rec, '$..availSeat')[0]

        item = FlightsItem()
        item.update(dict(
            flightNumber=flightNumber,  # flight number
            depTime=dep_ts,  # departure time (epoch seconds)
            arrTime=arr_ts,  # arrival time (epoch seconds)
            fromCity=self.city_airport.get(depCity, depCity),  # departure city
            toCity=self.city_airport.get(arrCity, arrCity),  # arrival city
            depAirport=depCity,  # departure airport
            arrAirport=arrCity,  # arrival airport
            currency='KRW',  # currency
            adultPrice=fareNet + taxAd + fuelAd,  # adult total price
            adultTax=taxAd + fuelAd,  # taxes
            netFare=fareNet,  # net fare
            maxSeats=availSeat,  # bookable seats
            cabin=bookingClass,  # cabin / booking class
            carrier=flightNumber[:2],  # airline code
            isChange=1,  # 1 = direct, 2 = connecting
            segments="[]",  # per-segment info for connecting flights
            getTime=int(time.time()),  # crawl time
        ))
        yield item
def parse(self, response):
    """Handle the three response kinds of the 'TR' availability crawl.

    * A WAF page: mark the spider as banned and yield nothing.
    * A response whose meta contains ``origin``: fan out one availability
      request per date returned by ``self._get_dates``.
    * Anything else: parse the result rows, yield one ``FlightsItem`` per
      direct flight, and push an "invalid" record when none was yielded.

    The crawler engine is un-paused at the end in every case.
    """

    def _clock_to_epoch(text):
        # The page text carries an AM/PM marker.  "%p" only changes the parsed
        # hour when the hour itself is read with "%I", so try the 12-hour
        # form first and fall back to the 24-hour form for values such as
        # "00:15AM" or "13:05PM" that "%I" rejects.
        try:
            parsed = time.strptime(text, "%I:%M%p (%a, %d %b %Y")
        except ValueError:
            parsed = time.strptime(text, "%H:%M%p (%a, %d %b %Y")
        return int(time.mktime(parsed))

    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    _date = meta.get('_date')
    _num = meta.get('_num')

    if '<title>WAF</title>' in response.text:
        self.log('<title>WAF</title> be banned. retry...', 40)
        self.banned = True
    elif 'origin' in response.meta:
        headers = {
            'accept': "text/html, */*; q=0.01",
            'accept-encoding': "gzip, deflate, br",
            'accept-language': "zh-CN,zh;q=0.9",
            'referer': "https://makeabooking.flyscoot.com/Book/Flight",
            'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
            'x-distil-ajax': "ysbctttfwzatvbzercutdyvxzsasrf",
            'x-requested-with': "XMLHttpRequest",
            'Cache-Control': "no-cache"
        }
        # One request per day; a separate name keeps ``_date`` untouched.
        for day in self._get_dates(_date, int(_num)):
            params = parse.urlencode({
                'AvailabilityAjax.LowFareMarketDate': '0|%s' % day,
                'AvailabilityAjax.Market': '%s|%s' % (_from, _to),
            })
            yield scrapy.Request(
                self.start_urls + params,
                cookies=response.request.cookies,
                headers=headers,
                meta={'_from': _from, '_to': _to, '_date': day},
                callback=self.parse,
                errback=self.errback,
            )
    else:
        item = None
        for row in response.xpath('//*[@id="departure-results"]/div'):
            stop_text = row.xpath(
                'div[@class="flight__stop"]//div[@class="flight-stop"]/span/text()').extract_first()
            # Skip anything that is not a direct flight.
            if stop_text != u'Direct Flight':
                continue

            # The economy ("fly") fare key carries carrier, number, airports and times.
            _input = row.xpath(
                'div[@class="flight__upgrade-box"]//div[@data-fare="fly"]//input/@value').extract_first()
            if _input:
                # Economy fare is preferred when it is on offer.
                fields = [
                    part for part in re.split(
                        r'~|\s+', re.match(r'.*\|(.*)~$', _input).group(1))
                    if part
                ]
                (carrier, number, depAirport, date_from, time_from,
                 arrAirport, date_to, time_to) = fields
                flightNumber = carrier + number
                price = row.xpath(
                    'div[@class="flight__fly"]//span[contains(@class, "price--sale")]/text()').extract()
                cabin = 'E'
                # Departure / arrival times.
                depTime = int(time.mktime(
                    time.strptime(date_from + time_from, "%m/%d/%Y%H:%M")))
                arrTime = int(time.mktime(
                    time.strptime(date_to + time_to, "%m/%d/%Y%H:%M")))
                # Seats left: first character of the notice, 10 when there is none.
                seats = row.xpath('div[@class="flight__fly"]/p/text()').extract_first()
                left_seats = seats[:1] if seats else 10
            else:
                # Business cabin: details come from the pop-over markup.
                depAirport = row.xpath(
                    'div[@class="flight__from"]/ul/li/text()').extract_first()[:3]
                arrAirport = row.xpath(
                    'div[@class="flight__to"]/ul/li/text()').extract_first()[:3]
                data_content = row.xpath(
                    'div[@class="flight__stop"]/div[@role="button"]/@data-content').extract_first()
                data_xml = etree.HTML(data_content)
                heading = data_xml.xpath('//p/text()')[0]
                flightNumber = re.sub(
                    r'\s*', '',
                    re.match(r'Departing Flight:(.*)\(Scoot\)', heading).group(1))
                carrier = flightNumber[:2]
                # Departure / arrival times from the first two list entries.
                entries = data_xml.xpath('//ul/li/text()')
                time_from, time_to = [
                    re.match(r'\w+: (.*?)\).*', entry).group(1)
                    for entry in entries[:2]
                ]
                depTime = _clock_to_epoch(time_from)
                arrTime = _clock_to_epoch(time_to)
                price = row.xpath(
                    'div[@class="flight__scootbiz visible-xs"]//span[contains(@class, "price--sale")]/text()').extract()
                cabin = 'S'
                seats = row.xpath(
                    'div[@class="flight__scootbiz visible-xs"]/p/text()').extract_first()
                left_seats = seats[:1] if seats else 10

            # Currency and amount arrive either in one text node or split in two.
            if len(price) == 1:
                currency = price[0][:3]
                adultPrice = float(price[0][3:].replace(',', ''))
            elif len(price) == 2:
                currency = price[0]
                adultPrice = float(price[1].replace(',', ''))
            else:
                continue

            item = FlightsItem()
            item.update(dict(
                flightNumber=flightNumber,  # flight number
                depTime=depTime,  # departure time
                arrTime=arrTime,  # arrival time
                fromCity=self.city_airport.get(depAirport, depAirport),  # departure city
                toCity=self.city_airport.get(arrAirport, arrAirport),  # arrival city
                depAirport=depAirport,  # departure airport
                arrAirport=arrAirport,  # arrival airport
                currency=currency,  # currency
                adultPrice=adultPrice,  # adult price
                adultTax=0,  # taxes
                netFare=adultPrice,  # net fare
                maxSeats=left_seats,  # bookable seats
                cabin=cabin,  # cabin
                carrier=carrier,  # airline code
                isChange=1,  # 1 = direct, 2 = connecting
                segments="NULL",  # per-segment info for connecting flights
                getTime=int(time.time()),
            ))
            yield item

        # Nothing usable on the page: report the date as invalid.
        if item is None:
            data = {
                'fromCity': _from,
                'toCity': _to,
                'date': re.sub(r'(\d+)/(\d+)/(\d+)', r'\3\1\2', _date),
            }
            push_date(settings.PUSH_DATA_URL, {'carrier': 'TR'}, 'invalid', [data])
            self.log('%s-%s: %s no flights' % (_from, _to, _date), 20)

    self.crawler.engine.unpause()
def parse(self, response):
    """Parse a departure-selection page and schedule the follow-up requests.

    Meta keys read: ``_from``, ``_to``, ``token``, ``_next_date``, ``origin``.

    * ``origin == 1`` (initial page): queue one ``DateTabSelect`` POST per
      selectable day tab, then a ``NextDate`` POST handled by
      ``self.parse_next``.
    * Every page: build one ``FlightsItem`` per direct-flight row.  Rows
      with a net fare of 0 are yielded immediately; priced rows wait for the
      tax amount, taken from ``self.tax_cache`` or requested through
      ``self.parse_tax``.
    * ``origin == 2``: chain the request for the next day while the next
      date is less than 30 days ahead.

    The crawler engine is un-paused at the end.
    """
    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    token = meta.get('token')
    _next_date = meta.get('_next_date')
    origin = meta.get('origin')
    cookies = response.request.cookies

    # Fare key of the lowest fare; needed by the AJAX calls below.
    select_departure = response.xpath('//*[@id="select_departure"]/table/tbody//input[@data-lowest="True"]/@value').extract_first()

    # Attributes of the "next" button, used when asking for the following day.
    btn_next = response.xpath('//*[@id="select_departure"]/a[@class="btn_next"]')
    DepartureDate = btn_next.xpath('@data-date').extract_first()
    JourneyIndex = btn_next.xpath('@data-index').extract_first()
    Incrementer = btn_next.xpath('@data-incrementer').extract_first()

    next_headers = {
        'Accept': "application/json, text/javascript, */*; q=0.01",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Connection': "keep-alive",
        'Content-Type': "application/x-www-form-urlencoded",
        'Host': "booking.hkexpress.com",
        'Origin': "https://booking.hkexpress.com",
        # .get() so a request sent without a Referer does not abort the callback.
        'Referer': response.request.headers.get('Referer'),
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
        'X-Requested-With': "XMLHttpRequest",
        'Cache-Control': "no-cache"
    }

    def _session_refresh(af_token):
        # Build the NextDate POST that moves the server-side session forward.
        next_data = {
            'DepartureDate': DepartureDate,
            'JourneyIndex': JourneyIndex,
            'Incrementer': Incrementer,
            'aftoken': af_token
        }
        return scrapy.Request(self.start_urls + '/Search/NextDate',
                              method='POST',
                              headers=next_headers,
                              cookies=cookies,
                              body=parse.urlencode(next_data),
                              dont_filter=True,
                              priority=1,
                              callback=self.parse_next)

    # Initial page: enqueue the requests for the first week.
    if origin == 1:
        origin_token = response.xpath('//*[@name="__RequestVerificationToken"]/@value').extract_first()
        li_list = response.xpath('//*[@id="select_departure"]/div[@class="selectdate"]/ul/li[@class!="disabled" and @class!="current"]')
        DateTabSelect_url = self.start_urls + '/Search/DateTabSelect'
        for li in li_list:
            data_date = li.xpath('@data-date').extract_first()
            data = {"DatesSelected": [data_date], "SelectedFares": [select_departure], "aftoken": origin_token}
            if li == li_list[-1]:
                # The last tab carries on the day-by-day chain (origin=2).
                yield scrapy.Request(DateTabSelect_url,
                                     method='POST',
                                     cookies=cookies,
                                     headers=response.request.headers,
                                     body=json.dumps(data),
                                     meta={'origin': 2,
                                           '_next_date': DepartureDate,
                                           'token': origin_token,
                                           '_from': _from,
                                           '_to': _to},
                                     priority=1,
                                     dont_filter=True)
            else:
                yield scrapy.Request(DateTabSelect_url,
                                     method='POST',
                                     headers=response.request.headers,
                                     cookies=cookies,
                                     body=json.dumps(data),
                                     meta={'token': origin_token,
                                           '_from': _from,
                                           '_to': _to},
                                     priority=1,
                                     dont_filter=True)
        # Always refresh the session once the tabs are queued (the original
        # used a for/else without any break, which is the same thing).
        yield _session_refresh(origin_token)

    # Common page parsing; the prices on the page exclude tax.
    items = []
    time_array = None
    for tr in response.xpath('//*[@id="select_departure"]/table/tbody/tr'):
        # Departure date and time.
        Departure = tr.xpath('td[@data-title="Departure"]')
        dep_date = Departure.xpath('span[@class="sr-only"]/text()').extract_first()
        time_array = dep_date
        dep_time = Departure.xpath('strong[@class="depart-time"]/text()').extract_first()
        data_std = dep_date + dep_time
        # Departure airport.
        dep_text = Departure.xpath('text()').extract().pop()
        dep_airport = re.search(r'\w+', dep_text).group()

        # Arrival date and time.
        Arrival = tr.xpath('td[@data-title="Arrival"]')
        arr_date = Arrival.xpath('span[@class="sr-only"]/text()').extract_first()
        arr_time = Arrival.xpath('strong[@class="arrive-time"]/text()').extract_first().split()
        data_sta = arr_date + arr_time[0]
        # Arrival airport.
        arr_text = Arrival.xpath('text()').extract().pop()
        arr_airport = re.search(r'\w+', arr_text).group()

        # Flight number.
        Flight = tr.xpath('td[@data-title="Flight"]')
        flight_text = Flight.xpath('strong/text()').extract_first().split()
        carrier, flight_number = flight_text[-2:]
        flight_change = Flight.xpath('text()').extract().pop().strip()
        if flight_change != u"Direct Flight":
            continue

        # Price of the "Fun" fare; a missing radio input is reported as price 0.
        Fun = tr.xpath('td[@data-title="Fun"]')
        if Fun.xpath('label/input'):
            fun_currency, fun_price = Fun.xpath('label/span[@class="table_price"]/text()').extract_first().split()
            fun_price = float(re.sub(r',', '', fun_price))
            seat = self.seat
        else:
            fun_currency, fun_price, seat = '', 0.00, 0

        item = FlightsItem()
        item.update(dict(
            flightNumber=carrier + flight_number,  # flight number
            depTime=int(time.mktime(time.strptime(data_std, "%Y-%m-%d%H:%M"))),  # departure time
            arrTime=int(time.mktime(time.strptime(data_sta, "%Y-%m-%d%H:%M"))),  # arrival time
            fromCity=self.city_airport.get(dep_airport, dep_airport),  # departure city
            toCity=self.city_airport.get(arr_airport, arr_airport),  # arrival city
            depAirport=dep_airport,  # departure airport
            arrAirport=arr_airport,  # arrival airport
            currency=fun_currency,  # currency
            adultPrice=fun_price,  # adult price
            adultTax=0,  # taxes
            netFare=fun_price,  # net fare
            maxSeats=seat,  # bookable seats
            cabin='ECO',  # cabin
            carrier=carrier,  # airline code
            isChange=1,  # 1 = direct, 2 = connecting
            segments="NULL",  # per-segment info for connecting flights
            getTime=int(time.time()),
        ))
        if item['netFare'] == 0:
            yield item
        else:
            items.append(item)

    # Tax handling.
    if select_departure:
        # Use the cached tax for this route when there is one.
        tax_cache = self.tax_cache.get("%s%s" % (_from, _to))
        if tax_cache:
            for item in items:
                item.update(dict(
                    adultPrice=tax_cache + item['netFare'],  # adult price
                    adultTax=tax_cache,  # taxes
                ))
                yield item
        else:
            tax_data = {
                'JourneyFareSellKeys': [select_departure],
                'aftoken': token
            }
            yield scrapy.Request(self.start_urls + '/Search/FareSelect',
                                 method='POST',
                                 cookies=cookies,
                                 meta={'items': items,
                                       '_from': _from,
                                       '_to': _to},
                                 body=json.dumps(tax_data),
                                 callback=self.parse_tax,
                                 dont_filter=True,
                                 priority=1)
    elif items:
        # Priced rows exist but no lowest-fare key to request tax with:
        # report the date as invalid.  NOTE(review): the condition is kept as
        # found; confirm that "items" (rather than "not items") is intended.
        data = {'fromCity': _from,
                'toCity': _to,
                'date': '{:%Y%m%d}'.format(datetime.strptime(time_array, '%Y-%m-%d'))}
        push_date(settings.PUSH_DATA_URL,
                  params={'carrier': self.spider_name},
                  action='invalid',
                  data_array=[data])

    # Request the next day.  ``_next_date`` is None when the previous page had
    # no "next" button; strptime(None) would raise, so the chain stops there.
    if (origin == 2 and _next_date and
            datetime.strptime(_next_date, '%Y-%m-%d') < (datetime.now() + timedelta(30))):
        data = {"DatesSelected": [_next_date], "SelectedFares": [select_departure], "aftoken": token}
        yield scrapy.Request(self.start_urls + '/Search/DateTabSelect',
                             method='POST',
                             cookies=cookies,
                             body=json.dumps(data),
                             meta={'origin': 2,
                                   '_next_date': DepartureDate,
                                   'token': token,
                                   '_from': _from,
                                   '_to': _to},
                             dont_filter=True,
                             priority=1)
        # Refresh the session for the day after.
        yield _session_refresh(token)

    # Resume the engine.
    self.crawler.engine.unpause()
def parse(self, response):
    """Yield an item for every non-stop 'Econo' fare operated by 'WS'.

    Meta keys read: ``FROM``, ``TO``, ``_day``.  The JSON body is expected to
    hold a ``flights`` list whose entries expose ``flightOptions``.  When no
    item at all was produced for the response, the date is pushed as invalid
    and a "no flights" line is logged.
    """
    meta = response.meta
    FROM = meta.get('FROM')
    TO = meta.get('TO')
    _day = meta.get('_day')
    from_city = self.city_airport.get(FROM, FROM)
    to_city = self.city_airport.get(TO, TO)

    response_dict = json.loads(response.body)
    matched = jsonpath.jsonpath(response_dict, '$.flights')
    # jsonpath returns a falsy value when "$.flights" is absent; treat that
    # exactly like an empty flight list instead of crashing on ``[0]``.
    records = (matched[0] if matched else None) or []

    # Tracks whether *any* record produced an item, so one empty record
    # cannot trigger an "invalid" push after another one yielded flights.
    found = False
    for rec in records:
        departAirportCode = jsonpath.jsonpath(rec, '$..departAirportCode')[0]
        arrivalAirportCode = jsonpath.jsonpath(rec, '$..arrivalAirportCode')[0]
        currency = jsonpath.jsonpath(rec, '$..currency')[0]
        options = jsonpath.jsonpath(rec, '$..flightOptions')[0]

        for option in options:
            try:
                stops = jsonpath.jsonpath(option, '$..flightSummaryStops')[0]
                if stops != "NONSTOP":
                    continue

                flightDetails = jsonpath.jsonpath(option, '$..flightDetails')
                priceDetails = jsonpath.jsonpath(option, '$..priceDetails')
                flightNumber = jsonpath.jsonpath(flightDetails, '$..flightNumber')[0]
                operatingAirline = jsonpath.jsonpath(flightDetails, '$..operatingAirline')[0]
                departureDateRaw = jsonpath.jsonpath(flightDetails, '$..departureDateRaw')[0]
                arrivalDateRaw = jsonpath.jsonpath(flightDetails, '$..arrivalDateRaw')[0]

                for price in priceDetails[0]:
                    fareType = jsonpath.jsonpath(price, '$..fareType')[0]
                    totalFareAmount = float(
                        jsonpath.jsonpath(price, '$..totalFareAmount')[0])
                    totalTaxAmount = float(
                        jsonpath.jsonpath(price, '$..totalTaxAmount')[0])
                    try:
                        seatsAvailable = int(
                            jsonpath.jsonpath(price, '$..seatsAvailable')[0])
                    except (TypeError, ValueError, IndexError):
                        # Seat count missing or not numeric: fall back to 6.
                        seatsAvailable = 6

                    if fareType == 'Econo' and operatingAirline == 'WS':
                        item = FlightsItem()
                        item.update(dict(
                            flightNumber="WS%s" % flightNumber,  # flight number
                            # The last four characters of the raw timestamps
                            # are cut off before parsing.
                            depTime=int(time.mktime(time.strptime(
                                departureDateRaw[:-4], "%Y-%m-%dT%H:%M:%S"))),  # departure time
                            arrTime=int(time.mktime(time.strptime(
                                arrivalDateRaw[:-4], "%Y-%m-%dT%H:%M:%S"))),  # arrival time
                            fromCity=from_city,  # departure city
                            toCity=to_city,  # arrival city
                            depAirport=departAirportCode,  # departure airport
                            arrAirport=arrivalAirportCode,  # arrival airport
                            currency=currency,  # currency
                            adultPrice=totalFareAmount,  # adult price
                            adultTax=totalTaxAmount,  # taxes
                            netFare=totalFareAmount - totalTaxAmount,  # net fare
                            maxSeats=seatsAvailable,  # bookable seats
                            cabin='ECO',  # cabin
                            carrier='WS',  # airline code
                            isChange=1,  # 1 = direct, 2 = connecting
                            segments="NULL",  # per-segment info for connecting flights
                            getTime=int(time.time()),
                        ))
                        found = True
                        yield item
            except Exception as e:
                # One malformed option must not stop the remaining ones.
                self.log(e, 40)

    if not found:
        # Mark the date as invalid.
        _day = _day.replace('-', '')
        data = {'fromCity': FROM, 'toCity': TO, 'date': _day}
        push_date(settings.PUSH_DATA_URL,
                  params={'carrier': self.spider_name},
                  action='invalid',
                  data_array=[data])
        self.log('%s-%s: %s no flights' % (FROM, TO, _day), level=20)
def parse(self, response):
    """Yield the cheapest fare of every single-segment leg in the response.

    For each ``LegOption`` whose ``SegmentOption`` is not a list, the fare
    with the lowest ``adultPrice`` (discount total plus surcharges) is
    selected; it is yielded unless its cabin is ``'SkyBoss'``.

    ``self.is_ok`` is set to True once the body has been decoded as JSON and
    to False when anything in the parsing raises; in that case the body and
    the traceback are logged.
    """
    meta = response.meta
    logging.debug('proxy: %s', meta.get('proxy'))

    try:
        response_dict = json.loads(response.body)
        self.is_ok = True

        results = jsonpath.jsonpath(response_dict, '$..OutboundOptions.Option')
        LegOption = jsonpath.jsonpath(results, '$..LegOption')
        DepartureDate = jsonpath.jsonpath(results, '$..DepartureDate')
        if not (DepartureDate and DepartureDate[0]):
            return

        for result in LegOption or []:
            SegmentOptions = jsonpath.jsonpath(result, '$..SegmentOptions')[0]
            # A list of segment options means more than one segment: skip.
            if isinstance(SegmentOptions.get('SegmentOption'), list):
                continue

            Surcharge = jsonpath.jsonpath(result, '$.Surcharges.Surcharge')
            surcharge_totals = jsonpath.jsonpath(Surcharge, '$..Total')
            Total = sum(surcharge_totals) if surcharge_totals else 0

            Flight = jsonpath.jsonpath(SegmentOptions, '$..Flight')
            flightNumber = jsonpath.jsonpath(Flight, '$..Number')[0]
            depTime = jsonpath.jsonpath(Flight, '$..ETDLocal')[0]
            arrTime = jsonpath.jsonpath(Flight, '$..ETALocal')[0]
            DepartureAirport = jsonpath.jsonpath(
                Flight, '$..DepartureAirport.Code')[0]
            ArrivalAirport = jsonpath.jsonpath(
                Flight, '$..ArrivalAirport.Code')[0]
            FareOption = jsonpath.jsonpath(result, '$..FareOption.*')

            from_city = self.city_airport.get(DepartureAirport, DepartureAirport)
            to_city = self.city_airport.get(ArrivalAirport, ArrivalAirport)
            dep_ts = int(time.mktime(
                time.strptime(depTime, "%Y-%m-%dT%H:%M:%S")))
            arr_ts = int(time.mktime(
                time.strptime(arrTime, "%Y-%m-%dT%H:%M:%S")))

            # Keep only the lowest-priced fare of this leg.
            cheapest = None
            for rec in FareOption or []:
                DiscountFare = jsonpath.jsonpath(rec, '$..DiscountFare')[0]
                DiscountFareTaxes = jsonpath.jsonpath(
                    rec, '$..DiscountFareTaxes')[0]
                DiscountFareTotal = jsonpath.jsonpath(
                    rec, '$..DiscountFareTotal')[0]
                Abbreviation = jsonpath.jsonpath(
                    rec, '$..Currency.Abbreviation')[0]
                SeatsAvailable = jsonpath.jsonpath(rec, '$..SeatsAvailable')[0]
                FareCategory = jsonpath.jsonpath(rec, '$..FareCategory')[0]

                # A fresh item per fare: nothing already selected can be
                # overwritten by a later fare.
                candidate = FlightsItem()
                candidate.update(dict(
                    flightNumber=flightNumber,  # flight number
                    depTime=dep_ts,  # departure time
                    arrTime=arr_ts,  # arrival time
                    fromCity=from_city,  # departure city
                    toCity=to_city,  # arrival city
                    depAirport=DepartureAirport,  # departure airport
                    arrAirport=ArrivalAirport,  # arrival airport
                    currency=Abbreviation,  # currency
                    adultPrice=DiscountFareTotal + Total,  # adult price
                    adultTax=DiscountFareTaxes + Total,  # taxes
                    netFare=DiscountFare,  # net fare
                    maxSeats=SeatsAvailable,  # bookable seats
                    cabin=FareCategory,  # cabin
                    carrier=flightNumber[:2],  # airline code
                    isChange=1,  # 1 = direct, 2 = connecting
                    segments="[]",  # per-segment info for connecting flights
                    getTime=int(time.time()),
                ))
                if cheapest is None or candidate['adultPrice'] < cheapest['adultPrice']:
                    cheapest = candidate

            # A leg without fares has nothing to report; a leg whose cheapest
            # fare is SkyBoss is deliberately not reported either.
            if cheapest is not None and cheapest['cabin'] != 'SkyBoss':
                yield cheapest
    except Exception:
        self.is_ok = False
        logging.error(response.body)
        # logging.exception appends the active traceback; the previous call
        # passed it as a %-argument to a message without a placeholder, so
        # the record could not be formatted and the traceback was lost.
        logging.exception('error ddddd')
def parse(self, response):
    """Fan out per-date searches and parse the fare rows of the current page.

    When ``response.meta`` carries ``available_dates``, one search request is
    yielded per date, re-using the session segment taken from
    ``response.url``.  Independently of that, every fare row of the page with
    an outbound cell and a non-connecting fare key yields a ``FlightsItem``.
    """
    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    available_dates = meta.get('available_dates')

    if available_dates:
        session = re.match(r'https://.*?/(.*?)/.*', response.url).group(1)
        for _date in available_dates:
            params = {
                's': True,
                'o1': _from,
                'd1': _to,
                'dd1': _date,
                'ADT': self.search_seat,
                'mon': True,
                'bpc': False,
            }
            total_url = self.start_urls.format(
                s=session + '/') + parse.urlencode(params)
            yield scrapy.Request(total_url,
                                 meta={
                                     '_from': _from,
                                     '_to': _to,
                                     '_date': _date
                                 })

    # Parse the page.
    form_container = response.xpath(
        '//div[@id="js_availability_container"]/form')
    for fare_row in form_container.xpath('div[contains(@class, "fare-row")]'):
        div_outbound = fare_row.xpath('div[@data-class-index="0"]/div/div')
        # Sold out: nothing to read.
        if not div_outbound:
            continue

        input_value = div_outbound.xpath(
            'div[@class="fare-price-and-currency"]/input/@value').extract()
        # The fare key is read from the first value and the price text from
        # the third; skip rows that do not carry both.
        if len(input_value) < 3:
            continue
        fare_key = input_value[0]

        # '^' in the key marks a connecting itinerary: skip it.
        if '^' in fare_key:
            continue

        # Drop the blank fields of the '~'-separated flight description.
        flight_values = [
            value for value in fare_key.split('|')[1].split('~')
            if re.sub(r'\s+', '', value)
        ]
        try:
            (carrier, flight_number, dep_airport, dep_date, arr_airport,
             arr_date) = flight_values
        except ValueError as e:
            # Materialised list, so the log shows the offending fields.
            self.log('unexpected fare key layout: %r (%s)' % (flight_values, e), 30)
            continue

        fromCity = self.city_airport.get(dep_airport, dep_airport)
        toCity = self.city_airport.get(arr_airport, arr_airport)

        # Leading non-digits are the currency marker, the rest is the amount.
        currency, price = re.match(r'(^\D*)?(.*)', input_value[2]).groups()
        price = float(price.replace(',', ''))
        currency = self.currency_cache.get(currency, currency)

        item = FlightsItem()
        item.update(dict(
            flightNumber=carrier + flight_number.strip(),  # flight number
            depTime=int(time.mktime(
                time.strptime(dep_date, "%m/%d/%Y %H:%M"))),  # departure time
            arrTime=int(time.mktime(
                time.strptime(arr_date, "%m/%d/%Y %H:%M"))),  # arrival time
            fromCity=fromCity,  # departure city
            toCity=toCity,  # arrival city
            depAirport=dep_airport,  # departure airport
            arrAirport=arr_airport,  # arrival airport
            currency=currency,  # currency
            adultPrice=price,  # adult price
            adultTax=0,  # taxes
            netFare=price,  # net fare
            maxSeats=self.search_seat,  # bookable seats
            cabin='E',  # cabin
            carrier=carrier,  # airline code
            isChange=1,  # 1 = direct, 2 = connecting
            segments="NULL",  # per-segment info for connecting flights
            getTime=int(time.time()),
        ))
        yield item
def parse(self, response):
    """Yield the lowest-priced fare of every non-transit outbound route.

    Meta keys read: ``FROM``, ``TO``, ``_day``.  Six fare buckets are
    compared per route; buckets that are missing or have no ``fareValue``
    are ignored.  When no bucket is priced the route is still reported with
    the data of ``standardLowFare`` and a price of 0.  ``segments`` carries
    ``[price, seats]`` for the ``standardLowFarePlus`` and ``standardFlex``
    bundles.  If no route produced an item the date is pushed as invalid.
    """
    fare_keys = (
        # Standard cabin.
        'standardIdFare', 'standardLowFare', 'standardLowFarePlus',
        'standardFlex',
        # Premium cabin.
        'premiumLowFare', 'premiumFlex',
    )
    bundle_keys = ('standardLowFarePlus', 'standardFlex')

    meta = response.meta
    FROM = meta.get('FROM')
    TO = meta.get('TO')
    _day = meta.get('_day')

    response_json = json.loads(response.text)
    # Either container may be absent or null; treat both as "no routes".
    availabilityResults = response_json.get('availabilityResults') or {}
    currency = availabilityResults.get('currency')
    routeListOutbound = availabilityResults.get('routeListOutbound') or []

    item = None
    for rec in routeListOutbound:
        # Direct flights only.
        if rec.get('isTransit'):
            continue

        # Flight number.
        flightCode = rec.get('flightList')[0].get('flightCode')

        # Cities and airports.
        depAirport = rec.get('origin').get('code')
        arrAirport = rec.get('destination').get('code')
        fromCity = self.city_airport.get(depAirport, depAirport)
        toCity = self.city_airport.get(arrAirport, arrAirport)

        # Times, e.g. "2018-04-02T06:15:00".
        departureTime = rec.get('departureTime')
        arrivalTime = rec.get('arrivalTime')

        # Bundle prices; an unpriced or missing bundle is reported as [0, 0].
        segments = []
        for key in bundle_keys:
            bundle = rec.get(key) or {}
            bundle_price = bundle.get('fareValue')
            if bundle_price:
                segments.append(
                    [round(bundle_price, 2), bundle.get('seatsAvailable')])
            else:
                segments.append([0, 0])

        # Lowest priced bucket.  reversed() keeps the tie-break of the former
        # "sort descending, pop last" approach (the later bucket wins).
        priced = [
            fare for fare in (rec.get(key) for key in fare_keys)
            if fare and fare.get('fareValue')
        ]
        if priced:
            fare_data = min(reversed(priced),
                            key=lambda fare: fare.get('fareValue'))
        else:
            fare_data = rec.get('standardLowFare') or {}

        # Lowest price, cabin, seats.
        fareValue = float(fare_data.get('fareValue') or 0)
        cabin = fare_data.get('bookingClass', 'E')
        seatsAvailable = fare_data.get('seatsAvailable')

        item = FlightsItem()
        item.update(dict(
            flightNumber=flightCode,  # flight number
            depTime=int(time.mktime(
                time.strptime(departureTime, "%Y-%m-%dT%H:%M:%S"))),  # departure time
            arrTime=int(time.mktime(
                time.strptime(arrivalTime, "%Y-%m-%dT%H:%M:%S"))),  # arrival time
            fromCity=fromCity,  # departure city
            toCity=toCity,  # arrival city
            depAirport=depAirport,  # departure airport
            arrAirport=arrAirport,  # arrival airport
            currency=currency,  # currency
            adultPrice=fareValue,  # adult price
            adultTax=0,  # taxes
            netFare=fareValue,  # net fare
            maxSeats=seatsAvailable,  # bookable seats
            cabin=cabin,  # cabin
            carrier=flightCode[:2],  # airline code
            isChange=1,  # 1 = direct, 2 = connecting
            segments=json.dumps(segments),  # bundle prices (see docstring)
            getTime=int(time.time()),
        ))
        yield item

    if item is None:
        # Mark the date as invalid.
        _day = _day.replace('-', '')
        data = {'fromCity': FROM, 'toCity': TO, 'date': _day}
        push_date(self.settings.get('PUSH_DATA_URL'),
                  params={'carrier': self.spider_name},
                  action='invalid',
                  data_array=[data])
        self.log('%s-%s: %s no flights' % (FROM, TO, _day), level=20)
def parse(self, response):
    """Parse the departure table, or retry when a captcha page came back.

    Meta keys read: ``_from``, ``_to``, ``_date``, ``proxy``.

    * Captcha page ("Are you human?"): set ``self.banned`` and re-issue the
      same request.
    * Otherwise: yield one ``FlightsItem`` per table row that has exactly one
      ``th/div`` node; when no row qualified, push the date as invalid.
    """
    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    _date = meta.get('_date')

    # Captcha check.  ``response.text`` is used because ``response.body`` is
    # bytes on Python 3 and cannot be searched with a str needle.
    if 'Are you human?' in response.text:
        self.banned = True
        self.log('\t\t be banned, retry...', 20)
        # The URL has already been seen; without dont_filter the duplicate
        # filter would silently drop the retry.
        yield response.request.replace(dont_filter=True)
        return

    # Parse the page.
    self.log('available proxy: %s' % meta.get('proxy'), 20)
    self.banned = False
    from_city = self.city_airport.get(_from, _from)
    to_city = self.city_airport.get(_to, _to)

    tr_set = response.xpath(
        '//*[@id="depart-table"]/tbody/tr[@class!="flight-legend"]')
    rows = [row for row in tr_set if len(row.xpath('th/div')) == 1]

    item = None
    for t in rows:
        flight_number = re.sub(
            r'\s*', '',
            t.xpath('th//span[contains(@class, "flight-number")]/text()')
            .extract_first())

        # A <sup> marker means the arrival falls on the following day.
        sup = t.xpath(
            'td[contains(@class, "visible-sm visible-xs")]/div//sup/text()'
        ).extract_first()
        clock_texts = [
            re.sub(r'\s', '', text) for text in t.xpath(
                'td[contains(@class, "visible-sm visible-xs")]/div/div/text()'
            ).extract()
        ]
        if not sup:
            dep_time, arr_time, _ = clock_texts
            arr_date = _date
        else:
            dep_time, arr_time, _, _ = clock_texts
            arr_date = self._add_one(_date)

        dep_airport, arr_airport = t.xpath(
            'td[@class="avail-table-vert text-center"]/div/div/text()'
        ).extract()

        # Price details: the first label is the fare used for the item.
        label_l = t.xpath(
            'td[contains(@class, "fare-bundle-radio-container")]/div/label')
        label = label_l[0]
        base_fare = label.xpath('@data-basefare').extract_first()
        tax = label.xpath('@data-webadminfee').extract_first()
        cabin = label.xpath('@data-fareclass').extract_first()
        price_text = re.sub(
            r'\s|,', '', label.xpath('text()').extract_first() or '')
        currency, total_fare = price_text[:3], price_text[3:]
        if not total_fare:
            total_fare = 0.00
            base_fare = 0.00
            tax = 0.00
            currency = ''

        # Optional second and third bundles; 0 when a label is missing.
        segments = {'F_B': 0, 'F_B_M': 0}
        try:
            segments['F_B'] = re.sub(
                r'\s|,', '', label_l[1].xpath('text()').extract_first())
            segments['F_B_M'] = re.sub(
                r'\s|,', '', label_l[2].xpath('text()').extract_first())
        except (IndexError, TypeError) as e:
            # IndexError: label absent; TypeError: label without text.
            self.log('bundle price missing for %s: %s' % (flight_number, e), 10)

        item = FlightsItem()
        item.update(dict(
            flightNumber=flight_number,  # flight number
            # Clock values end with a literal "H", hence the format.
            depTime=int(time.mktime(
                time.strptime(_date + dep_time, "%Y-%m-%d%H%MH"))),  # departure time
            arrTime=int(time.mktime(
                time.strptime(arr_date + arr_time, "%Y-%m-%d%H%MH"))),  # arrival time
            fromCity=from_city,  # departure city
            toCity=to_city,  # arrival city
            depAirport=dep_airport,  # departure airport
            arrAirport=arr_airport,  # arrival airport
            currency=currency,  # currency
            adultPrice=float(total_fare),  # adult price
            adultTax=float(tax or 0),  # taxes
            netFare=float(base_fare or 0),  # net fare
            maxSeats=self.search_seat,  # bookable seats
            cabin=cabin,  # cabin
            carrier=flight_number[:2],  # airline code
            isChange=1,  # 1 = direct, 2 = connecting
            segments=json.dumps(segments),  # bundle prices
            getTime=int(time.time()),
        ))
        yield item

    # No data: mark the fare as invalid.
    if item is None:
        data = {'fromCity': _from, 'toCity': _to, 'date': _date}
        push_date(settings.PUSH_DATA_URL,
                  params={'carrier': self.spider_name},
                  action='invalid',
                  data_array=[data])
        self.log('[%s] %s-%s no flights' % (_date, _from, _to), 20)
def parse(self, response):
    """Yield the cheapest bookable fare of each single-segment trip.

    Meta keys read: ``_from``, ``_to``, ``_date``.  A 500 response or a body
    without ``tripInfo`` entries pushes the date as invalid.  Otherwise the
    trips are walked in order; for each single-segment trip the cheapest
    fare with more than two seats is yielded, falling back to the cheapest
    fare overall.  Processing stops at the first multi-segment trip.
    """
    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    _date = meta.get('_date')

    def _mark_invalid():
        data = {'fromCity': _from, 'toCity': _to, 'date': _date}
        push_date(settings.PUSH_DATA_URL,
                  params={'carrier': self.spider_name},
                  action='invalid',
                  data_array=[data])

    if response.status == 500:
        self.log((_from, _to, _date), 20)
        self.log(response.text, 20)
        _mark_invalid()
        return

    response_dict = json.loads(response.text)
    tripInfo = jsonpath(response_dict, '$..tripInfo.*')
    if not tripInfo:
        _mark_invalid()
        return

    for rec in tripInfo:
        segmentInfo_list = rec['segmentInfo']
        if len(segmentInfo_list) != 1:
            # Connecting itinerary: stop here, as the original did.
            self.log('%s-%s:%s is_change' % (_from, _to, _date), 20)
            break

        segmentInfo = segmentInfo_list[0]
        carrierCode = jsonpath(
            segmentInfo, '$.flightIdentifierInfo.carrierCode')[0]
        flightNumber = jsonpath(
            segmentInfo, '$.flightIdentifierInfo.flightNumber')[0]

        dep_airport = jsonpath(segmentInfo, '$.departureInfo.airportCode')[0]
        departureDateTime = jsonpath(segmentInfo, '$.departureDateTime')[0]
        # Drop the parenthesised part of the timestamp before parsing.
        dep_date = ''.join(
            re.match(r'(.*)\(.*\)(.*)', departureDateTime).groups())

        arr_airport = jsonpath(segmentInfo, '$.arrivalInfo.airportCode')[0]
        arrivalDateTime = jsonpath(segmentInfo, '$.arrivalDateTime')[0]
        arr_date = ''.join(
            re.match(r'(.*)\(.*\)(.*)', arrivalDateTime).groups())

        dep_ts = int(time.mktime(time.strptime(dep_date, '%Y-%m-%d %H:%M')))
        arr_ts = int(time.mktime(time.strptime(arr_date, '%Y-%m-%d %H:%M')))

        # Cities.
        fromCity = self.city_airport.get(dep_airport, dep_airport)
        toCity = self.city_airport.get(arr_airport, arr_airport)

        # One candidate item per booking class that carries data.
        items = []
        for rec_price in segmentInfo['segmentAvailability']:
            bookingClass = rec_price['bookingClass']
            if not bookingClass:
                continue
            displayFareAmount = rec_price['displayFareAmount']
            taxAmount = rec_price['taxAmount']

            item = FlightsItem()
            item.update(dict(
                flightNumber='%s%s' % (carrierCode, flightNumber),  # flight number
                depTime=dep_ts,  # departure time
                arrTime=arr_ts,  # arrival time
                fromCity=fromCity,  # departure city
                toCity=toCity,  # arrival city
                depAirport=dep_airport,  # departure airport
                arrAirport=arr_airport,  # arrival airport
                currency=rec_price['displayFareCurrencyCode'],  # currency
                adultPrice=displayFareAmount + taxAmount,  # adult price
                adultTax=taxAmount,  # taxes
                netFare=displayFareAmount,  # net fare
                maxSeats=rec_price['seatAvailablity'],  # bookable seats
                cabin=bookingClass,  # cabin
                carrier=carrierCode,  # airline code
                isChange=1,  # 1 = direct, 2 = connecting
                segments="NULL",  # per-segment info for connecting flights
                getTime=int(time.time()),
            ))
            items.append(item)

        if not items:
            # No bookable class on this flight; min() of nothing would raise.
            continue

        # Prefer fares with at least three seats.  A real list is needed:
        # a filter object is always truthy on Python 3, which made the
        # fallback branch unreachable and min() fail on an empty iterator.
        gt_2_items = [x for x in items if x['maxSeats'] > 2]
        yield min(gt_2_items or items, key=lambda x: x['adultPrice'])
def parse(self, response):
    """Yield one item per available flight, priced at its cheapest fare.

    Meta keys read: ``FROM``, ``TO``, ``_day`` (an iterable of dates).  For
    every entry of ``AvailableFlights`` the cheapest non-zero fare (price
    plus ``self.get_tax``) is reported; ``segments`` holds the price and seat
    count of the 'Flexi' fare, ``[0, 0]`` when there is none.

    Any exception raised while parsing is logged, and every date in ``_day``
    is then pushed as invalid for the route.
    """
    # Route and dates, used for the invalidation push.
    from_port = response.meta.get('FROM')
    to_port = response.meta.get('TO')
    FROM = self.city_airport.get(from_port, from_port)
    TO = self.city_airport.get(to_port, to_port)
    list_day = response.meta.get('_day')

    response_dict = json.loads(response.body)
    try:
        AvailableFlights = response_dict.get('AvailableFlights')
        DisplayCurrencyCode = response_dict.get('DisplayCurrencyCode')
        currency = self.currency_cache.get(DisplayCurrencyCode,
                                           DisplayCurrencyCode)

        for flight in AvailableFlights:
            price_pack = [[0, 0]]
            # (adult_price, seats, net_fare, tax) of the cheapest fare so far.
            # None replaces the former sys.maxint sentinel, which does not
            # exist on Python 3.
            best = None
            for fare in flight.get('FlightFares'):
                matched = jsonpath.jsonpath(fare, '$..Price')
                if not matched:
                    continue
                net = matched[0]
                with_debit = jsonpath.jsonpath(
                    fare, '$..PriceWithDebitCard')[0]
                fare_tax = self.get_tax(net, with_debit)
                fare_seats = fare.get('LowestFareSeatsAvailable')
                total = net + fare_tax

                if fare.get('FareType') == 'Flexi':
                    price_pack[0] = [total, fare_seats]
                if total != 0 and (best is None or total < best[0]):
                    best = (total, fare_seats, net, fare_tax)

            # No priced fare at all: report the flight with zeros.
            adult_price, seats, Price, tax = best or (0, 0, 0, 0)

            dep_port = flight.get('DepartureIata')
            arr_port = flight.get('ArrivalIata')

            # A new item per flight; re-using one instance would make every
            # yielded item alias the data of the last flight.
            item = FlightsItem()
            item['flightNumber'] = "U2%s" % flight.get('FlightNumber')  # flight number
            item['depTime'] = int(time.mktime(time.strptime(
                flight.get('LocalDepartureTime'), "%Y-%m-%dT%H:%M:%S")))  # departure time
            item['arrTime'] = int(time.mktime(time.strptime(
                flight.get('LocalArrivalTime'), "%Y-%m-%dT%H:%M:%S")))  # arrival time
            item['fromCity'] = self.city_airport.get(dep_port, dep_port)  # departure city
            item['toCity'] = self.city_airport.get(arr_port, arr_port)  # arrival city
            item['depAirport'] = dep_port  # departure airport
            item['arrAirport'] = arr_port  # arrival airport
            item['currency'] = currency  # currency
            item['adultPrice'] = adult_price  # adult price
            item['adultTax'] = tax  # taxes
            item['netFare'] = Price  # net fare
            item['maxSeats'] = seats  # bookable seats
            item['cabin'] = "Y"  # cabin
            item['carrier'] = "U2"  # airline code
            item['isChange'] = 1  # 1 = direct, 2 = connecting
            item['segments'] = json.dumps(price_pack)  # Flexi price / seats
            item['getTime'] = int(time.time())
            yield item
    except Exception as e:
        self.log(e, level=40)
        # Mark every requested date as invalid.
        data_array = [
            {
                'fromCity': FROM,
                'toCity': TO,
                'date': re.sub(r'(\d+)-(\d+)-(\d+)', r'\1\2\3', _day),
            }
            for _day in list_day
        ]
        push_date(settings.PUSH_DATA_URL,
                  params={'carrier': 'U2'},
                  action='invalid',
                  data_array=data_array)
        self.log('%s-%s: %s no flights' % (FROM, TO, list_day), level=20)
def parse(self, response):
    """Parse a fare-matrix page, yield the cheapest fare per flight number.

    Every non-multiflight row contributes one item per bookable cabin button;
    the items are then grouped by flight number and only the cheapest of each
    group is yielded.  When the request meta contains ``origin``, a follow-up
    request is yielded for every other day offered in the day selector.

    The crawler engine is unpaused when parsing ends, including when parsing
    raises, so a malformed page cannot leave the engine paused.

    :param response: HTML response of the fare matrix.
    :return: generator of ``FlightsItem`` and ``scrapy.Request`` objects.
    """
    meta = response.meta
    self.is_ok = True
    try:
        # Parse items
        rows = response.xpath(
            '//*[@id="frm-matrix"]/div[2]/div[contains(@class, "book_bundle_row") and not(contains(@class, "has-multiflight"))]'
        )
        items = []
        flight_date = None
        for div in rows:
            row_header = div.xpath('div[@class="book_bundle_row--header"]')
            # The date is taken from the hidden heading and used for the
            # timestamps below, e.g.
            # u'Sunday 08/04/2018, London (LHR) ab 12:05, Zurich (ZRH) an 14:45. ...'
            h3 = row_header.xpath(
                'div[@class="is-visuallyhidden"]/h3/text()').extract_first()
            flight_date = re.match(r'\w+\s+(.*?),.*', h3).group(1)

            flightentry = row_header.xpath(
                'div[@class="book_bundle_row--flightinfo"]/div[@class="book-bundle-flightentry"]'
            )
            # Airports and times
            flightentry_time = flightentry.xpath(
                'div[@class="book-bundle-flightentry--time"]')
            # Departure airport and time
            departure = flightentry_time.xpath(
                'div[@class="book-bundle-flightentry--departure"]')
            depAirport = departure.xpath('text()').extract_first().strip()
            dep_time = departure.xpath('strong/text()').extract_first().strip()
            dep_data = flight_date + dep_time
            # Arrival airport and time
            arrival = flightentry_time.xpath(
                'div[@class="book-bundle-flightentry--arrival"]')
            arrAirport = arrival.xpath('text()').extract_first().strip()
            arr_time = arrival.xpath('strong/text()').extract_first().strip()
            sub = arrival.xpath('strong/sub/text()').extract_first()
            arr_date = flight_date + arr_time
            if sub:
                # A <sub> marker carries the day offset of the arrival.
                arr_date = self.add_date(arr_date, int(sub))

            # Cities
            fromCity = self.city_airport.get(depAirport, depAirport)
            toCity = self.city_airport.get(arrAirport, arrAirport)

            flightentry_info = flightentry.xpath(
                'div[@class="book-bundle-flightentry--info"]/div[@class="book-bundle-flightentry--metainfo"]'
            )
            # Flight number
            flightentry_number = flightentry_info.xpath(
                'a[@class="book-bundle-flightentry--number"]/text()'
            ).extract_first()
            flightNumber = re.sub(r'\s*', '', flightentry_number)
            # NOTE: filtering on the operator text (codeshare flights) is
            # currently not applied.

            buttons = row_header.xpath(
                'div[@class="book_bundle_row--buttons"]')
            for li in buttons.xpath('div/ul/li'):
                li_button = li.xpath('button')
                if not li_button:
                    # Skip sold-out cabins
                    continue
                span = li_button.xpath('span/text()').extract()
                cabin, currency, price, _ = span
                currency = re.match(r'from\s(.*)', currency).group(1)
                if currency == u'¥':
                    currency = u'CNY'
                price = re.sub(r'\W', '', price)

                # Remaining seats
                div_text = li.xpath('div/text()').extract_first()
                if div_text:
                    left_seats = re.match(r'.*(\d).*', div_text,
                                          flags=re.DOTALL).group(1)
                else:
                    left_seats = 9

                item = FlightsItem()
                item.update(
                    dict(
                        flightNumber=flightNumber,  # flight number
                        depTime=int(
                            time.mktime(
                                time.strptime(dep_data,
                                              "%d/%m/%Y%H:%M"))),  # departure time
                        arrTime=int(
                            time.mktime(
                                time.strptime(arr_date,
                                              "%d/%m/%Y%H:%M"))),  # arrival time
                        fromCity=fromCity,  # departure city
                        toCity=toCity,  # arrival city
                        depAirport=depAirport,  # departure airport
                        arrAirport=arrAirport,  # arrival airport
                        currency=currency[-3:],  # currency
                        adultPrice=float(price),  # adult ticket price
                        adultTax=0,  # tax
                        netFare=float(price),  # net fare
                        maxSeats=left_seats,  # bookable seats
                        cabin=cabin,  # cabin
                        carrier=flightNumber[:2],  # airline
                        isChange=1,  # 1 = direct, 2 = connecting
                        segments="NULL",  # per-leg details for connections
                        getTime=int(time.time()),
                    ))
                # Collected for the price comparison below
                items.append(item)

        # Group by flight number and return the lowest price of each group
        items = sorted(items, key=lambda x: x['flightNumber'])
        for _, group in groupby(items, key=lambda x: x['flightNumber']):
            yield min(group, key=lambda x: x['adultPrice'])

        # Follow the other days of the day selector
        if 'origin' in meta and flight_date is not None:
            a_list = response.xpath(
                '//*[@id="matrixDaySelection"]/ul/li/a[@data-has-module="yes"]/@href'
            ).extract()
            # Drop the link of the day that has just been parsed
            a_list_s = [
                link for link in a_list
                if parse_qs(link.split('?')[1])['selectedDate'][0] != flight_date
            ]
            for url_path in a_list_s:
                total_url = self.start_urls + url_path
                yield scrapy.Request(total_url,
                                     cookies=response.request.cookies,
                                     callback=self.parse,
                                     dont_filter=True)
    finally:
        # Resume the engine, even if parsing failed part-way through.
        self.crawler.engine.unpause()
def parse(self, response):
    """Parse the departure journey list and yield one item per direct flight.

    Flight data is read from each entry's ``data-sellkey`` attribute; keys
    containing ``^`` are skipped as connecting flights.  An entry without a
    price label is reported as sold out (price 0, 0 seats, empty segments).
    For priced entries ``segments`` holds ``[price, seats]`` pairs for the
    ``SMART`` and ``BIZclass`` tariffs, never lower than the preceding price.

    :param response: HTML response of the search result page.
    :return: generator of ``FlightsItem``.
    """
    journeys = response.xpath(
        '//*[@id="tripDeparture"]/div[@class="tripJourneys"]/div[contains(@class, "tripJourneyDate")]/div[contains(@class, "tripJourneyDateFlight")]'
    )
    for journey in journeys:
        link = journey.xpath('a[contains(@class, "list-box")]')
        sell_key = link.xpath('@data-sellkey').extract_first()
        # Skip connecting flights
        if '^' in sell_key:
            continue

        fields = [part for part in re.split(r'[~|\s]+', sell_key) if part]
        (carrier, number, depAirport, date_from, time_from, arrAirport,
         date_to, time_to) = fields

        # No price label means the flight is sold out
        price_str = link.xpath(
            'div[@class="flight-wrapper"]//div[contains(@class, "flight-price-wrapper")]/span[contains(@class,"price")]/span[@class="value"]/text()'
        ).extract_first()
        if not price_str:
            currency = ''
            price = 0
            maxSeats = 0
            price_dif = None
        else:
            currency, amount = re.match(r'(\D.*?)(\d.*)', price_str).groups()
            price = float(amount.replace(',', ''))
            currency = currency.strip()
            currency = self.currency_cache.get(currency, currency)
            maxSeats = 5

            tariff_names = ['SMART', 'BIZclass']
            price_dif = [[0, 0], [0, 0]]
            tariffs = journey.xpath(
                'div[@class="tariffList"]/div[contains(@class, "tripFare") and not(contains(@class, "sold-out"))]'
            )
            # Running floor: a tariff is never reported below the previous one
            floor = price
            for tariff in tariffs:
                name = tariff.xpath('./@data-tariff').extract()[0]
                if name not in tariff_names:
                    continue
                raw = tariff.xpath('./@data-price').extract()[0]
                # Price of this service level
                tariff_price = float(
                    re.findall(r'.*?(\d.*)', raw)[0].replace(',', ''))
                floor = max(tariff_price, floor)
                price_dif[tariff_names.index(name)] = [floor, maxSeats]

        dep_stamp = time.mktime(
            time.strptime(date_from + time_from, "%m/%d/%Y%H:%M"))
        arr_stamp = time.mktime(
            time.strptime(date_to + time_to, "%m/%d/%Y%H:%M"))

        item = FlightsItem()
        item.update(
            dict(
                flightNumber=carrier + number,  # flight number
                depTime=int(dep_stamp),  # departure time
                arrTime=int(arr_stamp),  # arrival time
                fromCity=self.city_airport.get(depAirport, depAirport),  # departure city
                toCity=self.city_airport.get(arrAirport, arrAirport),  # arrival city
                depAirport=depAirport,  # departure airport
                arrAirport=arrAirport,  # arrival airport
                currency=currency,  # currency
                adultPrice=price,  # adult ticket price
                adultTax=0,  # tax
                netFare=price,  # net fare
                maxSeats=maxSeats,  # bookable seats
                cabin='BASIC',  # cabin
                carrier=carrier,  # airline
                isChange=1,  # 1 = direct, 2 = connecting
                segments=json.dumps(price_dif) if price_dif else '[]',  # tariff prices
                getTime=int(time.time()),
            ))
        yield item
def parse(self, response):
    """Parse the availability table and yield one item per fare cell.

    Only rows whose detail cell contains exactly one leg are processed.  For
    each matching fare cell the flight data is read from the ``<input>``
    attributes (``value``, ``data-json``, ``data-productclass``,
    ``data-cur``).  Remaining seats default to 6 when no seat text is shown.

    :param response: HTML response of the availability page; ``meta['proxy']``
        is logged.
    :return: generator of ``FlightsItem``.
    """
    self.log(response.meta['proxy'])

    table_rows = response.xpath(
        '//*[@id="availabilityForm"]//table[@class="table avail-table"]/tbody/tr[contains(@class, "-row")]'
    )
    for row in table_rows:
        # Filter out connecting routes
        legs = row.xpath(
            'td[contains(@class, "avail-table-vert")]//td[contains(@id, "icon_")]//td[@class="avail-table-detail"]/div/div'
        )
        if len(legs) != 1:
            continue

        # Select the fare cells
        for cell in row.xpath('td[contains(@class, "depart LF")]'):
            seat_text = cell.xpath(
                'div//div[@class="avail-table-seats-remaining"]/text()'
            ).extract_first()
            left_seats = re.search(r'\d', seat_text).group() if seat_text else 6

            input_tag = cell.xpath('div//input')
            value = input_tag.xpath('@value').extract_first()
            # First match is the departure time, second the arrival time
            stamps = re.findall(r'~\w{3}~(\d{2}/\d{2}/\d{4} \d{2}:\d{2})',
                                value)
            data_json = input_tag.xpath('@data-json').extract_first()
            record = json.loads(data_json)[0]

            carrier = record.get('brand')
            flightNumber = record.get('dimension16')
            depAirport = record.get('dimension2')
            arrAirport = record.get('dimension3')
            adultPrice = record.get('price')

            product = input_tag.xpath('@data-productclass').extract_first()
            cur = input_tag.xpath('@data-cur').extract_first()

            item = FlightsItem()
            item.update(
                dict(
                    flightNumber=flightNumber,  # flight number
                    depTime=int(
                        time.mktime(
                            time.strptime(stamps[0],
                                          "%m/%d/%Y %H:%M"))),  # departure time
                    arrTime=int(
                        time.mktime(
                            time.strptime(stamps[1],
                                          "%m/%d/%Y %H:%M"))),  # arrival time
                    fromCity=self.city_airport.get(depAirport, depAirport),  # departure city
                    toCity=self.city_airport.get(arrAirport, arrAirport),  # arrival city
                    depAirport=depAirport,  # departure airport
                    arrAirport=arrAirport,  # arrival airport
                    currency=cur,  # currency
                    adultPrice=adultPrice,  # adult ticket price
                    adultTax=0,  # tax
                    netFare=adultPrice,  # net fare
                    maxSeats=left_seats,  # bookable seats
                    cabin=product,  # cabin
                    carrier=carrier,  # airline
                    isChange=1,  # 1 = direct, 2 = connecting
                    segments="NULL",  # per-leg details for connections
                    getTime=int(time.mktime(datetime.now().timetuple())),
                ))
            yield item
def parse(self, response):
    """Parse a low-fare search JSON response, yield the cheapest fare per flight.

    When the payload has no ``AirLowFareSearchRS`` key the route/date is
    pushed as invalid and nothing is yielded.  Otherwise single-segment
    flights departing on ``meta['_date']`` are matched to their price points,
    one item is built per matching price, and the cheapest item of every
    flight number is yielded.

    :param response: JSON response; meta is read for ``_from``, ``_to``,
        ``_date``.
    :return: generator of ``FlightsItem``.
    """
    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    _date = meta.get('_date')

    results = json.loads(response.text).get('AirLowFareSearchRS')
    if results is None:
        # Mark the route/date as invalid
        invalid_entry = {
            'fromCity': _from,
            'toCity': _to,
            'date': re.sub(r'(\d+)-(\d+)-(\d+)', r'\1\2\3', _date),
        }
        push_date(settings.PUSH_DATA_URL,
                  params={'carrier': self.spider_name},
                  action='invalid',
                  data_array=[invalid_entry])
        self.log('%s-%s: %s no flights' % (_from, _to, _date), level=20)
        return

    collected = []
    # Flight information
    all_flights = results['FlightInformationSummary']['FlightInformation']
    for flight_info in all_flights:
        segment_list = flight_info['Flight'][0]['FlightSegment']
        if _date not in segment_list[0]['@DepartureDate']:
            continue
        if len(segment_list) != 1:
            continue

        info_rec = segment_list[0]
        flight_id = info_rec['@ID']
        flightNumber = info_rec['@FlightNumber']
        dep_data = info_rec['@DepartureDate'][:19]
        arr_date = info_rec['@ArrivalDate'][:19]
        depAirport = info_rec['@OriginCode']
        arrAirport = info_rec['@DestinationCode']
        carrier = info_rec['@MarketingAirline']
        fromCity = self.city_airport.get(depAirport, depAirport)
        toCity = self.city_airport.get(arrAirport, arrAirport)

        # Seat information
        price_points = results['FlightItineraryPricePoints'][
            'FlightItineraryPricePoint']
        for seat_info in price_points:
            # Reference to the price point of this seat entry
            price_ref = seat_info['PricePointRef']
            attributes = seat_info['FlightInformationAttributes'][0]
            if flight_id != attributes['@FlightSegmentRef']:
                continue
            cabin = attributes['@BookingClass']
            maxSeats = attributes['@SeatsAvailable']

            # Price information
            for price_info in results['PricePointSummary']['PricePoint']:
                if price_ref != price_info['@ID']:
                    continue
                fare = price_info['BasedOnPTCPricing']['Fare']
                netFare = float(fare['@BaseFareAmount'])
                base_currency = fare['@BaseFareCurrency']
                total_amount = float(fare['@TotalFareAmount'])

                item = FlightsItem()
                item.update(
                    dict(
                        flightNumber=carrier + flightNumber,  # flight number
                        depTime=int(
                            time.mktime(
                                time.strptime(
                                    dep_data,
                                    "%Y-%m-%dT%H:%M:%S"))),  # departure time
                        arrTime=int(
                            time.mktime(
                                time.strptime(
                                    arr_date,
                                    "%Y-%m-%dT%H:%M:%S"))),  # arrival time
                        fromCity=fromCity,  # departure city
                        toCity=toCity,  # arrival city
                        depAirport=depAirport,  # departure airport
                        arrAirport=arrAirport,  # arrival airport
                        currency=base_currency,  # currency
                        adultPrice=total_amount,  # adult ticket price
                        adultTax=total_amount - netFare,  # tax
                        netFare=netFare,  # net fare
                        maxSeats=maxSeats,  # bookable seats
                        cabin=cabin,  # cabin
                        carrier=carrier,  # airline
                        isChange=1,  # 1 = direct, 2 = connecting
                        segments="NULL",  # per-leg details for connections
                        getTime=int(time.time()),
                    ))
                # Collected for the price comparison below
                collected.append(item)

    # Group by flight number and return the lowest price of each group
    collected = sorted(collected, key=lambda x: x['flightNumber'])
    for _, group in groupby(collected, key=lambda x: x['flightNumber']):
        yield min(group, key=lambda x: x['adultPrice'])
def parse(self, response):
    """Parse a search result page and yield one item per nonstop option.

    When ``meta['date_array']`` is set, a POST search request is first yielded
    for every date in it.  The page itself is then parsed: each ``Nonstop``
    option becomes an item priced at its lowest ``data-p`` value; options
    whose currency symbol is not in ``self.currency_cache`` are skipped.  If
    no item was produced, the route/date is pushed as invalid.

    :param response: HTML response; meta is read for ``dep_code``,
        ``arr_code``, ``_date`` and ``date_array``.
    :return: generator of ``scrapy.Request`` and ``FlightsItem`` objects.
    """
    meta = response.meta
    dep_code = meta.get('dep_code')
    arr_code = meta.get('arr_code')
    _date = meta.get('_date')
    date_array = meta.get('date_array')

    if date_array:
        # Form fields shared by every search; the date is appended per request
        form_base = {
            'SaveFields.SelDepOptId': '-1',
            'SaveFields.RetShldrSel': 'False',
            'SearchFields.IsAwardBooking': 'false',
            'SearchFields.PriceType': 'Lowest',
            'SearchFields.UpgradeOption': 'none',
            'ClientStateCode': 'HA',
            'SaveFields.SelRetOptId': '-1',
            'SaveFields.DepShldrSel': 'False',
            'SearchFields.NumberOfTravelers': self.search_seat,
            'SearchFields.SearchType': 'OneWay',
            'SearchFields.IsCalendar': 'false',
            'SearchFields.DepartureCity': self.cities.get(dep_code),
            'SearchFields.ArrivalCity': self.cities.get(arr_code),
            'SourcePage': 'Search',
        }
        for search_date in date_array:
            form = dict(form_base)
            form['SearchFields.DepartureDate'] = search_date
            yield scrapy.Request(method='POST',
                                 url=self.start_urls,
                                 body=parse.urlencode(form),
                                 meta={
                                     'dep_code': dep_code,
                                     'arr_code': arr_code,
                                     '_date': search_date
                                 },
                                 cookies=response.request.cookies,
                                 callback=self.parse,
                                 errback=self.errback)

    flight_day = response.xpath(
        '//div[@class="shoulderSelected"]/input/@value').extract_first()
    nonstop_options = response.xpath(
        '//*[@id="result-0"]/ul/li[div[@class="optionDetail"]/div[@class="clear"]/a/text()="Nonstop"]'
    )

    item = None
    for option in nonstop_options:
        # Flight number
        header = option.xpath('div[@class="optionHeader"]')
        flight_number = header.xpath(
            'div[@class="optionHeaderFltNum"]/text()').re(r'\d+')[0]

        # Airports and times
        detail = option.xpath('div[@class="optionDetail"]')
        dep = detail.xpath('div[@class="optionDeparts"]')
        dep_airport = dep.xpath(
            'div[@class="optionCityCode"]/text()').extract_first()
        dep_time = dep.xpath(
            'div[@class="optionTime"]/div[@class="b"]/text()'
        ).extract_first()
        dep_date = flight_day + dep_time

        arr = detail.xpath('div[@class="left"]')
        arr_airport = arr.xpath(
            'div[@class="optionCityCode"]/text()').extract_first()
        arr_time = arr.xpath(
            'div[@class="optionTime"]/div[@class="b"]/text()'
        ).extract_first()
        day_offset = arr.xpath(
            'div[@class="optionTime"]/div[@class="arrivalDaysDifferent"]/text()'
        ).re(r'\d')
        if day_offset:
            arr_day = self._add_date(flight_day, int(day_offset[0]))
        else:
            arr_day = flight_day
        arr_date = arr_day + arr_time

        # Price
        price_values = option.xpath('div[@data-c="l"]/@data-p').extract()
        price = float(min(price_values, key=float))

        # Currency
        symbol = option.xpath(
            'div[@data-c="l" and @data-p]/div[@class="fareprice"]/text()'
        ).re(r'\D+')[0]
        currency = self.currency_cache.get(symbol)
        if currency is None:
            continue

        if not (dep_code == dep_airport and arr_code == arr_airport):
            print('ft: %s-%s' % (dep_code, arr_code))
            print('da: %s-%s' % (dep_airport, arr_airport))

        item = FlightsItem()
        item.update(
            dict(
                flightNumber='AS' + flight_number.strip(),  # flight number
                depTime=int(
                    time.mktime(time.strptime(
                        dep_date, "%m/%d/%Y%I:%M %p"))),  # departure time
                arrTime=int(
                    time.mktime(time.strptime(
                        arr_date, "%m/%d/%Y%I:%M %p"))),  # arrival time
                fromCity=self.city_airport.get(dep_airport, dep_airport),  # departure city
                toCity=self.city_airport.get(arr_airport, arr_airport),  # arrival city
                depAirport=dep_airport,  # departure airport
                arrAirport=arr_airport,  # arrival airport
                currency=currency,  # currency
                adultPrice=price,  # adult ticket price
                adultTax=0,  # tax
                netFare=price,  # net fare
                maxSeats=self.search_seat,  # bookable seats
                cabin='E',  # cabin
                carrier='AS',  # airline
                isChange=1,  # 1 = direct, 2 = connecting
                segments="NULL",  # per-leg details for connections
                getTime=int(time.time()),
            ))
        yield item

    # Mark the route/date as invalid when nothing was produced
    if item is None:
        invalid_entry = {
            'depAirport': dep_code,
            'arrAirport': arr_code,
            'date': datetime.strptime(_date, "%m/%d/%Y").strftime('%Y%m%d')
        }
        push_date(self.settings.get('PUSH_DATA_URL'),
                  params={'carrier': self.spider_name},
                  action='invalid',
                  data_array=[invalid_entry])
        self.log('[%s] %s-%s no flight.' % (_date, dep_code, arr_code), 20)
def parse(self, response):
    """Parse a trips JSON response and yield one item per direct journey.

    For every journey with ``stops == 0`` the cheapest fare (fare amount plus
    the route's admin fee from ``self.admin_fees``) is yielded.  A journey
    without fares is reported as sold out (all prices and seats 0).  A journey
    whose route has no configured admin fee is logged at error level and
    skipped, because its total price cannot be computed.

    Responses with status 404 or 500 are logged and produce nothing, as does
    a payload without ``data`` or without trips.

    :param response: JSON response; meta is read for ``_from``, ``_to``,
        ``_date`` (used only in log messages).
    :return: generator of ``FlightsItem``.
    """
    meta = response.meta
    _from = meta.get('_from')
    _to = meta.get('_to')
    _date = meta.get('_date')

    if response.status == 404:
        # data=null
        self.log(
            '404, No flights on this day: %s-%s[%s]' % (_from, _to, _date),
            20)
        return
    if response.status == 500:
        self.log(response.text, 40)
        return

    # "data" may be null in the payload; treat that like "no trips".
    data_map = json.loads(response.text).get('data') or {}
    trips = data_map.get('trips')
    if not trips:
        # No flights on this day
        return

    # Currency
    currencyCode = data_map.get('currencyCode')
    faresAvailable = data_map.get('faresAvailable')

    for trip in trips:
        for journey in trip['journeysAvailable']:
            if journey['stops'] != 0:
                continue
            segment = journey['segments'][0]
            designator = segment['designator']
            # Airports and times
            dep_airport = designator['origin']
            arr_airport = designator['destination']
            dep_date = designator['departure']
            arr_date = designator['arrival']
            # Flight number and carrier
            identifier = segment['identifier']
            flight_number = identifier['identifier']
            carrierCode = identifier['carrierCode']

            # Fields shared by every item of this journey
            common = dict(
                flightNumber=carrierCode + flight_number,  # flight number
                depTime=int(
                    time.mktime(
                        time.strptime(dep_date,
                                      "%Y-%m-%dT%H:%M:%S"))),  # departure time
                arrTime=int(
                    time.mktime(
                        time.strptime(arr_date,
                                      "%Y-%m-%dT%H:%M:%S"))),  # arrival time
                fromCity=self.city_airport.get(dep_airport, dep_airport),  # departure city
                toCity=self.city_airport.get(arr_airport, arr_airport),  # arrival city
                depAirport=dep_airport,  # departure airport
                arrAirport=arr_airport,  # arrival airport
                currency=currencyCode,  # currency
                carrier=carrierCode,  # airline
                isChange=1,  # 1 = direct, 2 = connecting
            )

            fares = journey['fares']
            if not fares:
                # Sold out: report the flight without price or seats
                item = FlightsItem()
                item.update(common)
                item.update(
                    dict(
                        adultPrice=0,  # adult ticket price
                        adultTax=0,  # tax
                        netFare=0,  # net fare
                        maxSeats=0,  # bookable seats
                        cabin='',  # cabin
                        segments="[]",
                        getTime=int(time.time()),
                    ))
                yield item
                continue

            # Admin fee of the route; it is the same for every fare
            admin_fees = self.admin_fees.get(dep_airport + arr_airport)
            if admin_fees is None:
                # Without the fee the total cannot be computed (adding None
                # would raise), so the journey is skipped after logging.
                self.log('new flight line, no found admin fees', 40)
                continue
            admin_fees = float(admin_fees)

            items = []
            for fare_key, fare_value in fares.items():
                passengerFares = faresAvailable[fare_key]['passengerFares'][0]
                # Total fare amount
                fareAmount = float(passengerFares['fareAmount'])
                item = FlightsItem()
                item.update(common)
                item.update(
                    dict(
                        adultPrice=fareAmount + admin_fees,  # adult ticket price
                        adultTax=admin_fees,  # tax
                        netFare=fareAmount,  # net fare
                        maxSeats=fare_value['availableCount'],  # bookable seats
                        cabin=fare_value['classOfService'],  # cabin
                        segments="NULL",
                        getTime=int(time.time()),
                    ))
                items.append(item)
            yield min(items, key=lambda item: item['adultPrice'])
def parse(self, response):
    """Handle both the calendar page and the flight-search JSON responses.

    * With ``origin`` in the request meta, the response is the HTML calendar
      page: cookies are rebuilt from the ``Set-Cookie`` headers and one POST
      search request is yielded for every calendar day that is at most 31
      days ahead.
    * Otherwise the response is a search result in JSON.  Unless ``end`` is in
      the meta, one further request (flagged with ``end``) is yielded per
      flight id found in the embedded HTML; then the first product of the
      payload is turned into an item.

    :param response: calendar HTML response or search JSON response.
    :return: generator of ``scrapy.Request`` and ``FlightsItem`` objects.
    """
    meta = response.meta
    self.is_ok = True

    if 'origin' in meta:
        scid = response.xpath('//a[@class="logout__name"]/@href').re_first(
            r'.*scid=(.*)')
        dsid = response.xpath(
            '//div[@class="flight-results__wrapper clearfix"]/@data-dsid'
        ).extract_first()

        cookies = []
        for raw_cookie in response.headers.getlist('Set-Cookie'):
            name, value = re.match(r'(.*?)=(.*?);', raw_cookie).groups()
            cookies.append({
                u'domain': u'.jet2.com',
                u'secure': False,
                u'value': unicode(value),
                u'expiry': None,
                u'path': u'/',
                u'httpOnly': False,
                u'name': unicode(name)
            })

        flight_search_url = 'https://www.jet2.com/api/search/flightsearchresults/update?scid=' + scid
        headers = {
            'accept-encoding': "gzip, deflate, br",
            'accept-language': "zh-CN,zh;q=0.9",
            'adrum': "isAjax:true",
            'content-type': "application/json; charset=UTF-8",
            'origin': "https://www.jet2.com",
            'referer': response.url,
            'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
            'x-requested-with': "XMLHttpRequest",
            'Cache-Control': "no-cache"
        }

        calendar_days = response.xpath(
            '//tbody[@class="calendar__body"]/tr/td[@class="calendar__day "]'
        )
        for day_cell in calendar_days:
            flight_id = day_cell.xpath(
                '@data-cheapest-flight-id').extract_first()
            data_date = day_cell.xpath('@data-date').extract_first()
            # Only days within the next 31 days are requested
            too_far = datetime.strptime(
                data_date, "%Y-%m-%d") > datetime.today() + timedelta(31)
            if too_far:
                continue
            body = {
                "isCalendarSelection": True,
                "flightId": flight_id,
                "date": data_date,
                "isOutbound": True,
                "isFull": False,
                "datasource": dsid
            }
            yield scrapy.Request(flight_search_url,
                                 method='POST',
                                 headers=headers,
                                 cookies=cookies,
                                 body=json.dumps(body),
                                 errback=self.errback,
                                 callback=self.parse)
        return

    json_data = json.loads(response.text)
    # Stop requesting on the last page to avoid an endless loop
    if 'end' not in response.meta:
        fragment = HtmlResponse(
            '', body=json_data['Data']['Html'].encode('utf-8'))
        flight_ids = fragment.xpath(
            '//div[@class="times-summary__item "]/@data-flight-id'
        ).extract()
        request_body = json.loads(response.request.body)
        # Keep requesting when the current page lists several flights
        request_body.pop('date')
        for flight_id in flight_ids:
            request_body['flightId'] = flight_id
            request_body['isCalendarSelection'] = False
            yield scrapy.Request(response.url,
                                 method='POST',
                                 headers=response.request.headers,
                                 cookies=response.request.cookies,
                                 body=json.dumps(request_body),
                                 meta={'end': 1},
                                 callback=self.parse,
                                 errback=self.errback)

    # Parse the JSON data
    first_product = json_data['Gtm']['ecommerce']['click']['products'][0]
    flight_number = first_product['dimension6']
    _, dep_date, _, arr_date = first_product['dimension57'].split('_')
    currency = first_product['dimension17']
    dep_airport = first_product['dimension4']
    arr_airport = first_product['dimension9']
    price = first_product['price']

    item = FlightsItem()
    item.update(
        dict(
            flightNumber=flight_number,  # flight number
            depTime=int(
                time.mktime(
                    time.strptime(dep_date,
                                  "%Y-%m-%dT%H:%M:%S"))),  # departure time
            arrTime=int(
                time.mktime(
                    time.strptime(arr_date,
                                  "%Y-%m-%dT%H:%M:%S"))),  # arrival time
            fromCity=self.city_airport.get(dep_airport, dep_airport),  # departure city
            toCity=self.city_airport.get(arr_airport, arr_airport),  # arrival city
            depAirport=dep_airport,  # departure airport
            arrAirport=arr_airport,  # arrival airport
            currency=currency,  # currency
            adultPrice=price,  # adult ticket price
            adultTax=0,  # tax
            netFare=price,  # net fare
            maxSeats=5,  # bookable seats
            cabin='E',  # cabin
            carrier=flight_number[:2],  # airline
            isChange=1,  # 1 = direct, 2 = connecting
            segments="NULL",  # per-leg details for connections
            getTime=int(time.time()),
        ))
    yield item
def parse(self, response):
    """Handle the multi-day overview and the single-day result responses.

    * With ``origin`` in the request meta, the JSON carries the multi-day
      availability HTML: one POST request to ``self.select_url`` is yielded
      for every day marked as having availability.
    * Otherwise the JSON carries the single-day result HTML: every flight
      button is turned into an item.

    :param response: JSON response embedding an HTML fragment.
    :return: generator of ``scrapy.Request`` or ``FlightsItem`` objects.
    """
    res = json.loads(response.text)

    if 'origin' in response.meta:
        overview = HtmlResponse(
            url=self.start_urls,
            body=res['multiDayAvailabilityOutbound'].encode('utf-8'))
        token = overview.xpath(
            '//div[@class="animation-container"]//input[@name="__RequestVerificationToken"]/@value'
        ).extract_first()
        available_days = overview.xpath(
            '//*[@class="HV-gc bulletless days"]/li/div[@class="day day-with-availability"]'
        )
        for day in available_days:
            date_date = day.xpath('@data-date').extract_first()
            body = {
                'selectSingleDayAvailability.JourneyType': 'OutboundFlight',
                'selectSingleDayAvailability.Date.DateToParse': date_date[:10],
                'selectSingleDayAvailability.AutoSelect': False,
                '__RequestVerificationToken': token
            }
            yield scrapy.Request(
                self.select_url,
                method='POST',
                body=parse.urlencode(body),
                headers=response.request.headers,
                cookies=response.request.cookies,
            )
        return

    day_page = HtmlResponse(url='',
                            body=res['SingleDayOutbound'].encode('utf-8'))
    for button in day_page.xpath('//button[@class="flight-result-button"]'):
        # Airports
        button_value = button.xpath('@value').extract_first()
        dep_airport, arr_airport = re.findall(r'~(\w{3})~',
                                              button_value)[:2]

        # Times: the date comes from @datetime, the clock time from the text
        times = button.xpath('div[@class="times"]')
        departure = times.xpath(
            'time[@class="departure"]/@datetime').extract_first()
        departure_time = times.xpath(
            'time[@class="departure"]/text()').extract_first().strip()
        dep_date = "%s %s:00" % (departure[:10], departure_time)
        arrival = times.xpath(
            'time[@class="arrival"]/@datetime').extract_first()
        arrival_time = times.xpath(
            'time[@class="arrival"]/text()').extract_first().strip()
        arr_date = "%s %s:00" % (arrival[:10], arrival_time)

        # Flight number
        number_texts = button.xpath('div[@class="details"]').xpath(
            'ul/li[@class="flight-number"]/text()').extract()
        flight_number = number_texts[1].strip()

        # Price
        price_div = button.xpath('div[@class="actions"]').xpath(
            'div[contains(@class, "price")]')
        symbol = price_div.xpath(
            'span[@class="currency"]/text()').extract_first().strip()
        currency = self.currency_cache.get(symbol, symbol)
        price = price_div.xpath('text()[2]').extract_first().strip()

        item = FlightsItem()
        item.update(
            dict(
                flightNumber=flight_number,  # flight number
                depTime=int(
                    time.mktime(
                        time.strptime(dep_date,
                                      "%d/%m/%Y %H:%M:%S"))),  # departure time
                arrTime=int(
                    time.mktime(
                        time.strptime(arr_date,
                                      "%d/%m/%Y %H:%M:%S"))),  # arrival time
                fromCity=self.city_airport.get(dep_airport, dep_airport),  # departure city
                toCity=self.city_airport.get(arr_airport, arr_airport),  # arrival city
                depAirport=dep_airport,  # departure airport
                arrAirport=arr_airport,  # arrival airport
                currency=currency,  # currency
                adultPrice=float(price),  # adult ticket price
                adultTax=0,  # tax
                netFare=float(price),  # net fare
                maxSeats=3,  # bookable seats
                cabin='E',  # cabin
                carrier=flight_number[:2],  # airline
                isChange=1,  # 1 = direct, 2 = connecting
                segments="NULL",  # per-leg details for connections
                getTime=int(time.time()),
            ))
        yield item