コード例 #1
0
ファイル: bx_kr.py プロジェクト: Biking0/spider_project
    def parse(self, response):
        """Turn the JSON fare list of a response into FlightsItem objects.

        The response text is decoded as JSON and every element matched by the
        jsonpath expression ``$..list.*`` is treated as one fare record.
        Records without a truthy ``fareRate`` are skipped; every other record
        yields one item priced in KRW.

        When the expression matches nothing, no item is produced and a single
        log line is written instead: level 40 if the body contains ``EE590``,
        level 20 ("no flights") otherwise.  Pushing an ``invalid`` record for
        the route is not performed by this method.
        """
        def first(node, expr):
            # Value of the first jsonpath match for ``expr`` below ``node``.
            return jsonpath(node, expr)[0]

        def epoch(stamp):
            # "%Y%m%d%H%M" string -> integer seconds via time.mktime.
            return int(time.mktime(time.strptime(stamp, "%Y%m%d%H%M")))

        request_meta = response.meta
        _from = request_meta.get('_from')
        _to = request_meta.get('_to')
        _date = request_meta.get('_date')

        payload = json.loads(response.text)
        fares = jsonpath(payload, '$..list.*')

        if not fares:
            if 'EE590' in response.text:
                self.log("ip 不可用", 40)
            else:
                self.log("%s-%s:%s no flights" % (_from, _to, _date), 20)
            return

        for fare in fares:
            if not fare.get('fareRate'):
                continue

            # Surcharges
            fuel_charge = int(first(fare, '$..fuelAD'))
            tax_charge = int(first(fare, '$..taxAD'))

            # Airports and date
            dep_date = first(fare, '$..depDate')
            dep_code = first(fare, '$..depCity')
            arr_code = first(fare, '$..arrCity')

            # Flight details
            flight_no = first(fare, '$..flightNo')

            # NOTE(review): the arrival stamp reuses the departure date, so a
            # flight landing after midnight would get an arrival earlier than
            # its departure -- confirm whether the feed can contain such rows.
            dep_stamp = '%s%s' % (dep_date, first(fare, '$..depTime'))
            arr_stamp = '%s%s' % (dep_date, first(fare, '$..arrTime'))

            # Airport code -> city, falling back to the code itself
            origin_city = self.city_airport.get(dep_code, dep_code)
            destination_city = self.city_airport.get(arr_code, arr_code)

            booking_class = first(fare, '$..bookingClass')
            net_fare = int(first(fare, '$..fareNet'))
            seats_left = first(fare, '$..availSeat')

            surcharges = tax_charge + fuel_charge

            flight = FlightsItem()
            flight.update(dict(
                flightNumber=flight_no,  # flight number
                depTime=epoch(dep_stamp),  # departure time
                arrTime=epoch(arr_stamp),  # arrival time
                fromCity=origin_city,  # departure city
                toCity=destination_city,  # arrival city
                depAirport=dep_code,  # departure airport
                arrAirport=arr_code,  # arrival airport
                currency='KRW',  # currency
                adultPrice=net_fare + surcharges,  # adult fare incl. surcharges
                adultTax=surcharges,  # tax + fuel
                netFare=net_fare,  # net fare
                maxSeats=seats_left,  # bookable seats
                cabin=booking_class,  # cabin
                carrier=flight_no[:2],  # airline = first two characters
                isChange=1,  # transfer flag: 1 = direct, 2 = connecting
                segments="[]",  # per-segment info for connecting itineraries
                getTime=int(time.mktime(datetime.now().timetuple())),
            ))
            yield flight
コード例 #2
0
ファイル: tr.py プロジェクト: Biking0/spider_project
    def parse(self, response):
        """Handle the three kinds of response this callback receives.

        * A body containing ``<title>WAF</title>`` is treated as a ban: it is
          logged at level 40 and ``self.banned`` is set.  No retry request is
          yielded from here.
        * A response whose meta contains ``origin`` fans out one request per
          date returned by ``self._get_dates(_date, int(_num))``; each request
          comes back to this method without ``origin`` in its meta.
        * Any other response is parsed as a results page.  Rows whose stop
          label is not ``Direct Flight`` are skipped; every remaining row with
          a recognisable price yields one FlightsItem.  If nothing was
          yielded, the route/date is pushed as ``invalid`` through
          ``push_date`` and a "no flights" line is logged at level 20.

        ``self.crawler.engine.unpause()`` is called at the end in every case.
        """
        def popover_timestamp(text):
            # strptime only applies "%p" (AM/PM) when the hour is parsed with
            # "%I"; with "%H" the marker is consumed but ignored, which turned
            # e.g. "1:30PM" into 01:30.  Try the 12-hour form first and fall
            # back to "%H" for hour values "%I" rejects (0, 13-23).
            try:
                parsed = time.strptime(text, "%I:%M%p (%a, %d %b %Y")
            except ValueError:
                parsed = time.strptime(text, "%H:%M%p (%a, %d %b %Y")
            return int(time.mktime(parsed))

        meta = response.meta
        _from = meta.get('_from')
        _to = meta.get('_to')
        _date = meta.get('_date')
        _num = meta.get('_num')

        if '<title>WAF</title>' in response.text:
            self.log('<title>WAF</title> be banned. retry...', 40)
            self.banned = True
        elif 'origin' in response.meta:
            headers = {
                'accept': "text/html, */*; q=0.01",
                'accept-encoding': "gzip, deflate, br",
                'accept-language': "zh-CN,zh;q=0.9",
                'referer': "https://makeabooking.flyscoot.com/Book/Flight",
                'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
                'x-distil-ajax': "ysbctttfwzatvbzercutdyvxzsasrf",
                'x-requested-with': "XMLHttpRequest",
                'Cache-Control': "no-cache"
            }

            # A separate loop name keeps the ``_date`` read from meta intact.
            for day in self._get_dates(_date, int(_num)):
                params = parse.urlencode({
                    'AvailabilityAjax.LowFareMarketDate': '0|%s' % day,
                    'AvailabilityAjax.Market': '%s|%s' % (_from, _to),
                })

                total_url = self.start_urls + params
                yield scrapy.Request(total_url,
                                     cookies=response.request.cookies,
                                     headers=headers,
                                     meta={'_from': _from, '_to': _to, '_date': day},
                                     callback=self.parse,
                                     errback=self.errback
                                     )
        else:
            results = response.xpath('//*[@id="departure-results"]/div')

            found = False
            for departure_results in results:
                span = departure_results.xpath(
                    'div[@class="flight__stop"]//div[@class="flight-stop"]/span/text()').extract_first()
                # Skip anything that is not a direct flight
                if span != u'Direct Flight':
                    continue

                # Carrier, flight number, airports and times of the "fly" fare
                _input = departure_results.xpath(
                    'div[@class="flight__upgrade-box"]//div[@data-fare="fly"]//input/@value').extract_first()

                # Economy fare takes precedence when its input is present
                if _input:
                    # The text between the last "|" and the trailing "~" is
                    # split on "~"/whitespace; empty fragments are dropped.
                    fields = [part for part in
                              re.split(r'~|\s+', re.match(r'.*\|(.*)~$', _input).group(1))
                              if part]
                    carrier, number, depAirport, date_from, time_from, arrAirport, date_to, time_to = fields
                    flightNumber = carrier + number
                    price = departure_results.xpath(
                        'div[@class="flight__fly"]//span[contains(@class, "price--sale")]/text()').extract()
                    cabin = 'E'

                    # Departure / arrival time
                    depTime = int(time.mktime(time.strptime(date_from + time_from, "%m/%d/%Y%H:%M")))
                    arrTime = int(time.mktime(time.strptime(date_to + time_to, "%m/%d/%Y%H:%M")))

                    # Seats
                    # NOTE(review): this is a one-character string when the
                    # text exists and the int 10 otherwise -- mixed types are
                    # kept because consumers of maxSeats are not visible here.
                    seats = departure_results.xpath('div[@class="flight__fly"]/p/text()').extract_first()
                    left_seats = seats[:1] if seats else 10

                # Business cabin
                else:
                    # Airports: first three characters of the list-item text
                    depAirport = departure_results.xpath('div[@class="flight__from"]/ul/li/text()').extract_first()[:3]
                    arrAirport = departure_results.xpath('div[@class="flight__to"]/ul/li/text()').extract_first()[:3]

                    # Carrier and flight number come from the HTML fragment
                    # stored in the button's data-content attribute.
                    data_content = departure_results.xpath(
                        'div[@class="flight__stop"]/div[@role="button"]/@data-content').extract_first()
                    data_xml = etree.HTML(data_content)
                    p = data_xml.xpath('//p/text()')[0]
                    flightNumber = re.sub(r'\s*', '', re.match(r'Departing Flight:(.*)\(Scoot\)', p).group(1))
                    carrier = flightNumber[:2]

                    # Departure / arrival time
                    li = data_xml.xpath('//ul/li/text()')
                    time_from, time_to = [re.match(r'\w+: (.*?)\).*', text).group(1)
                                          for text in li[:2]]
                    depTime = popover_timestamp(time_from)
                    arrTime = popover_timestamp(time_to)

                    price = departure_results.xpath(
                        'div[@class="flight__scootbiz visible-xs"]//span[contains(@class, "price--sale")]/text()').extract()
                    cabin = 'S'

                    # Seats (same mixed str/int convention as the economy branch)
                    seats = departure_results.xpath('div[@class="flight__scootbiz visible-xs"]/p/text()').extract_first()
                    left_seats = seats[:1] if seats else 10

                # Currency and price
                if len(price) == 1:
                    # Single text node: three-character currency prefix + amount
                    currency = price[0][:3]
                    adultPrice = float(price[0][3:].replace(',', ''))

                elif len(price) == 2:
                    # Currency and amount arrive as two separate text nodes
                    currency = price[0]
                    adultPrice = float(price[1].replace(',', ''))

                else:
                    continue

                from_city = self.city_airport.get(depAirport, depAirport)
                to_city = self.city_airport.get(arrAirport, arrAirport)

                item = FlightsItem()
                item.update(dict(
                    flightNumber=flightNumber,
                    depTime=depTime,  # departure time
                    arrTime=arrTime,  # arrival time
                    fromCity=from_city,  # departure city
                    toCity=to_city,  # arrival city
                    depAirport=depAirport,  # departure airport
                    arrAirport=arrAirport,  # arrival airport
                    currency=currency,  # currency
                    adultPrice=adultPrice,  # adult fare
                    adultTax=0,  # tax
                    netFare=adultPrice,  # net fare
                    maxSeats=left_seats,  # bookable seats
                    cabin=cabin,  # cabin
                    carrier=carrier,  # airline
                    isChange=1,  # transfer flag: 1 = direct, 2 = connecting
                    segments="NULL",  # per-segment info for connecting itineraries
                    getTime=int(time.time()),
                ))
                found = True
                yield item

            # Nothing yielded: mark the route/date as invalid
            if not found:
                data = {
                    'fromCity': _from,
                    'toCity': _to,
                    # "m/d/Y"-shaped digit groups are reordered to "Ymd"
                    'date': re.sub(r'(\d+)/(\d+)/(\d+)', r'\3\1\2', _date),
                }
                push_date(settings.PUSH_DATA_URL, {'carrier': 'TR'}, 'invalid', [data])
                self.log('%s-%s: %s no flights' % (_from, _to, _date), 20)

        self.crawler.engine.unpause()
コード例 #3
0
    def parse(self, response):
        """Parse a departure-selection page, yielding items and follow-up requests.

        What is yielded depends on ``response.meta``:

        * ``origin == 1``: one POST to ``/Search/DateTabSelect`` per date tab
          that is neither ``disabled`` nor ``current``, followed by one POST
          to ``/Search/NextDate`` whose callback is ``self.parse_next``.
        * ``origin == 2`` and ``_next_date`` earlier than now + 30 days: a
          POST for the next day's tab plus another ``/Search/NextDate``.

        Independently of ``origin``, every table row labelled
        "Direct Flight" is turned into a FlightsItem.  Items whose
        ``netFare`` is 0 are yielded straight away; the others are collected
        and, when a lowest-fare input value was found, either completed from
        ``self.tax_cache`` or handed to ``self.parse_tax`` through a
        ``/Search/FareSelect`` request.

        ``self.crawler.engine.unpause()`` is called at the very end.
        """
        meta = response.meta
        _from = meta.get('_from')
        _to = meta.get('_to')
        token = meta.get('token')
        _next_date = meta.get('_next_date')

        # Value of the fare input flagged data-lowest="True"; sent along with
        # the AJAX requests below (None when no such input exists).
        select_departure = response.xpath('//*[@id="select_departure"]/table/tbody//input[@data-lowest="True"]/@value').extract_first()

        # Used when requesting the next day's data
        btn_next = response.xpath('//*[@id="select_departure"]/a[@class="btn_next"]')
        DepartureDate = btn_next.xpath('@data-date').extract_first()
        JourneyIndex = btn_next.xpath('@data-index').extract_first()
        Incrementer = btn_next.xpath('@data-incrementer').extract_first()
        next_headers = {
            'Accept': "application/json, text/javascript, */*; q=0.01",
            'Accept-Encoding': "gzip, deflate, br",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Connection': "keep-alive",
            'Content-Type': "application/x-www-form-urlencoded",
            'Host': "booking.hkexpress.com",
            'Origin': "https://booking.hkexpress.com",
            'Referer': response.request.headers['Referer'],
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
            'X-Requested-With': "XMLHttpRequest",
            'Cache-Control': "no-cache"
        }

        # Parse the initial page and enqueue the requests for the first week
        if 'origin' in response.meta and response.meta['origin'] == 1:
            origin_token = response.xpath('//*[@name="__RequestVerificationToken"]/@value').extract_first()
            li_list = response.xpath('//*[@id="select_departure"]/div[@class="selectdate"]/ul/li[@class!="disabled" and @class!="current"]')
            for li in li_list:
                data_date = li.xpath('@data-date').extract_first()
                data = {"DatesSelected": [data_date],
                        "SelectedFares": [select_departure],
                        "aftoken": origin_token}
                DateTabSelect_url = self.start_urls + '/Search/DateTabSelect'
                # Only the request for the last tab carries origin=2 and
                # _next_date, i.e. only its response continues the day-by-day
                # chain further down in this method.
                # NOTE(review): no callback is passed to these requests;
                # presumably they come back to this parse method -- confirm.
                if li == li_list[-1]:
                    yield scrapy.Request(DateTabSelect_url, method='POST',
                                         cookies=response.request.cookies,
                                         headers=response.request.headers,
                                         body=json.dumps(data),
                                         meta={'origin': 2, '_next_date': DepartureDate, 'token': origin_token,
                                               '_from': _from, '_to': _to,
                                               },
                                         priority=1,
                                         dont_filter=True)
                else:
                    yield scrapy.Request(DateTabSelect_url, method='POST',
                                         headers=response.request.headers,
                                         cookies=response.request.cookies,
                                         body=json.dumps(data),
                                         meta={'token': origin_token,
                                               '_from': _from, '_to': _to,
                                               },
                                         priority=1,
                                         dont_filter=True)
            # This ``else`` belongs to the ``for`` loop above; the loop has no
            # ``break``, so the branch always runs once the loop is finished.
            else:
                # Update the session
                next_data = {
                    'DepartureDate': DepartureDate,
                    'JourneyIndex': JourneyIndex,
                    'Incrementer': Incrementer,
                    'aftoken': origin_token
                }
                next_date_url = self.start_urls + '/Search/NextDate'
                yield scrapy.Request(next_date_url, method='POST',
                                     headers=next_headers,
                                     cookies=response.request.cookies,
                                     body=parse.urlencode(next_data),
                                     dont_filter=True,
                                     priority=1,
                                     callback=self.parse_next
                                     )

        # Parse the common page; prices here exclude tax
        tr_list = response.xpath('//*[@id="select_departure"]/table/tbody/tr')
        items = []
        # Ends up holding the departure date text of the last row processed
        time_array = None
        for tr in tr_list:
            # Departure time
            Departure = tr.xpath('td[@data-title="Departure"]')
            dep_date = Departure.xpath('span[@class="sr-only"]/text()').extract_first()
            time_array = dep_date
            dep_time = Departure.xpath('strong[@class="depart-time"]/text()').extract_first()
            data_std = dep_date+dep_time
            # Departure airport: first word of the cell's last text node
            dep_text = Departure.xpath('text()').extract().pop()
            dep_airport = re.search(r'\w+', dep_text).group()
            # Arrival time
            Arrival = tr.xpath('td[@data-title="Arrival"]')
            arr_date = Arrival.xpath('span[@class="sr-only"]/text()').extract_first()
            arr_time = Arrival.xpath('strong[@class="arrive-time"]/text()').extract_first().split()
            data_sta = arr_date+arr_time[0]
            # Arrival airport: first word of the cell's last text node
            arr_text = Arrival.xpath('text()').extract().pop()
            arr_airport = re.search(r'\w+', arr_text).group()
            # Flight number: last two whitespace-separated tokens
            Flight = tr.xpath('td[@data-title="Flight"]')
            flight_text = Flight.xpath('strong/text()').extract_first().split()
            carrier, flight_number = flight_text[-2:]

            flight_change = Flight.xpath('text()').extract().pop().strip()
            if flight_change == u"Direct Flight":
                # Price
                Fun = tr.xpath('td[@data-title="Fun"]')
                _input = Fun.xpath('label/input')
                if _input:
                    # value = _input.xpath('@value').extract_first()
                    fun_currency, fun_price = Fun.xpath('label/span[@class="table_price"]/text()').extract_first().split()
                    fun_price = float(re.sub(r',', '', fun_price))
                    seat = self.seat
                else:
                    # No selectable "Fun" fare: emit an empty price and 0 seats
                    fun_currency, fun_price, seat = '', 0.00, 0
                # Fun_plus = tr.xpath('td[@data-title="Fun+"]')
                # UBiz = tr.xpath('td[@data-title="UBiz"]')

                fromCity = self.city_airport.get(dep_airport, dep_airport)
                toCity = self.city_airport.get(arr_airport, arr_airport)

                item = FlightsItem()
                item.update(dict(
                    flightNumber=carrier+flight_number,  # flight number
                    depTime=int(time.mktime(time.strptime(data_std, "%Y-%m-%d%H:%M"))),  # departure time
                    arrTime=int(time.mktime(time.strptime(data_sta, "%Y-%m-%d%H:%M"))),  # arrival time
                    fromCity=fromCity,  # departure city
                    toCity=toCity,  # arrival city
                    depAirport=dep_airport,  # departure airport
                    arrAirport=arr_airport,  # arrival airport
                    currency=fun_currency,  # currency
                    adultPrice=fun_price,  # adult fare
                    adultTax=0,  # tax
                    netFare=fun_price,  # net fare
                    maxSeats=seat,  # bookable seats
                    cabin='ECO',  # cabin
                    carrier=carrier,  # airline
                    isChange=1,  # transfer flag: 1 = direct, 2 = connecting
                    segments="NULL",  # per-segment info for connecting itineraries
                    getTime=int(time.time()),
                ))

                # Zero-fare items need no tax and go out immediately; priced
                # items wait in ``items`` until the tax is known.
                if item['netFare'] == 0:
                    yield item
                else:
                    items.append(item)

        # Request the tax
        if select_departure:
            # First check whether the tax for this route is already cached
            tax_cache = self.tax_cache.get("%s%s" % (_from, _to))
            if tax_cache:
                for item in items:
                    item.update(dict(
                        adultPrice=tax_cache + item['netFare'],  # adult fare
                        adultTax=tax_cache,  # tax
                    ))
                    yield item
            else:
                # Hand the pending items to parse_tax via the request meta
                tax_url = self.start_urls + '/Search/FareSelect'
                tax_data = {
                    'JourneyFareSellKeys': [select_departure],
                    'aftoken': token
                }
                yield scrapy.Request(tax_url, method='POST',
                                     cookies=response.request.cookies,
                                     meta={'items': items,
                                           '_from': _from, '_to': _to,
                                           },
                                     body=json.dumps(tax_data),
                                     callback=self.parse_tax,
                                     dont_filter=True,
                                     priority=1)
        elif items:
            # Mark as invalid.  Reached when priced rows exist but no
            # lowest-fare input was found; the collected items are not
            # yielded on this path.
            # NOTE(review): confirm that dropping priced items is intended.
            data = {'fromCity': _from, 'toCity': _to,
                    'date': '{:%Y%m%d}'.format(datetime.strptime(time_array, '%Y-%m-%d'))}
            push_date(settings.PUSH_DATA_URL,
                      params={'carrier': self.spider_name},
                      action='invalid', data_array=[data])
            # for item in items: yield item

        # Request the next day's data
        # NOTE(review): _next_date is taken from meta and would make strptime
        # raise if it were None -- confirm the next button is always present.
        if 'origin' in response.meta and response.meta['origin'] == 2 and \
                datetime.strptime(_next_date, '%Y-%m-%d') < (datetime.now() + timedelta(30)):
            data = {"DatesSelected": [_next_date],
                    "SelectedFares": [select_departure],
                    "aftoken": token}
            DateTabSelect_url = self.start_urls + '/Search/DateTabSelect'
            yield scrapy.Request(DateTabSelect_url, method='POST',
                                 cookies=response.request.cookies,
                                 body=json.dumps(data),
                                 meta={'origin': 2, '_next_date': DepartureDate, 'token': token,
                                       '_from': _from, '_to': _to,
                                       },
                                 dont_filter=True,
                                 priority=1)

            # Update the session
            next_data = {
                'DepartureDate': DepartureDate,
                'JourneyIndex': JourneyIndex,
                'Incrementer': Incrementer,
                'aftoken': token
            }
            next_date_url = self.start_urls + '/Search/NextDate'
            yield scrapy.Request(next_date_url, method='POST',
                                 headers=next_headers,
                                 cookies=response.request.cookies,
                                 body=parse.urlencode(next_data),
                                 dont_filter=True,
                                 priority=1,
                                 callback=self.parse_next
                                 )

        # Resume the engine
        self.crawler.engine.unpause()
コード例 #4
0
ファイル: ws.py プロジェクト: Biking0/spider_project
    def parse(self, response):
        """Yield a FlightsItem for every non-stop, WS-operated ``Econo`` fare.

        The response body is decoded as JSON; each entry of its ``flights``
        list supplies the airport codes, the currency and a list of
        ``flightOptions``.  An option is used only when its
        ``flightSummaryStops`` equals ``"NONSTOP"``; within it, every price
        entry whose ``fareType`` is ``Econo`` (and whose flight is operated
        by ``WS``) produces one item.  Any exception raised while handling a
        single option is logged at level 40 and that option is abandoned.

        If no item was yielded for the whole response, the route/day is
        pushed as ``invalid`` through ``push_date`` and a "no flights" line
        is logged at level 20.

        Raises:
            Whatever ``json.loads`` or the top-level jsonpath lookups raise
            when the body does not have the expected structure.
        """
        meta = response.meta
        FROM = meta.get('FROM')
        TO = meta.get('TO')
        _day = meta.get('_day')

        from_city = self.city_airport.get(FROM, FROM)
        to_city = self.city_airport.get(TO, TO)

        response_dict = json.loads(response.body)

        record = jsonpath.jsonpath(response_dict, '$.flights')[0]

        # Tracks whether anything was yielded across ALL records.  The
        # previous code reset its marker for every record, so a trailing
        # record without fares triggered the "invalid" push even though
        # items had already been produced.
        found = False
        for rec in record:
            departAirportCode = jsonpath.jsonpath(rec,
                                                  '$..departAirportCode')[0]
            arrivalAirportCode = jsonpath.jsonpath(rec,
                                                   '$..arrivalAirportCode')[0]
            currency = jsonpath.jsonpath(rec, '$..currency')[0]

            results = jsonpath.jsonpath(rec, '$..flightOptions')[0]

            for r in results:
                try:
                    flightSummaryStops = jsonpath.jsonpath(
                        r, '$..flightSummaryStops')[0]
                    if flightSummaryStops != "NONSTOP":
                        continue

                    flightDetails = jsonpath.jsonpath(r, '$..flightDetails')
                    priceDetails = jsonpath.jsonpath(r, '$..priceDetails')

                    flightNumber = jsonpath.jsonpath(
                        flightDetails, '$..flightNumber')[0]
                    operatingAirline = jsonpath.jsonpath(
                        flightDetails, '$..operatingAirline')[0]
                    departureDateRaw = jsonpath.jsonpath(
                        flightDetails, '$..departureDateRaw')[0]
                    arrivalDateRaw = jsonpath.jsonpath(
                        flightDetails, '$..arrivalDateRaw')[0]

                    for price in priceDetails[0]:
                        fareType = jsonpath.jsonpath(price, '$..fareType')[0]
                        totalFareAmount = float(
                            jsonpath.jsonpath(price, '$..totalFareAmount')[0])
                        totalTaxAmount = float(
                            jsonpath.jsonpath(price, '$..totalTaxAmount')[0])
                        try:
                            seatsAvailable = int(
                                jsonpath.jsonpath(price,
                                                  '$..seatsAvailable')[0])
                        except (TypeError, ValueError, IndexError):
                            # Missing or non-numeric seat count: use the
                            # fallback value of 6.
                            seatsAvailable = 6

                        if fareType != 'Econo' or operatingAirline != 'WS':
                            continue

                        item = FlightsItem()
                        item.update(dict(
                            flightNumber="WS%s" % flightNumber,  # flight number
                            # The last four characters of the raw stamps are
                            # cut off before parsing.
                            depTime=int(time.mktime(time.strptime(
                                departureDateRaw[:-4],
                                "%Y-%m-%dT%H:%M:%S"))),  # departure time
                            arrTime=int(time.mktime(time.strptime(
                                arrivalDateRaw[:-4],
                                "%Y-%m-%dT%H:%M:%S"))),  # arrival time
                            fromCity=from_city,  # departure city
                            toCity=to_city,  # arrival city
                            depAirport=departAirportCode,  # departure airport
                            arrAirport=arrivalAirportCode,  # arrival airport
                            currency=currency,  # currency
                            adultPrice=totalFareAmount,  # adult fare
                            adultTax=totalTaxAmount,  # tax
                            netFare=totalFareAmount - totalTaxAmount,  # net fare
                            maxSeats=seatsAvailable,  # bookable seats
                            cabin='ECO',  # cabin
                            carrier='WS',  # airline
                            isChange=1,  # transfer flag: 1 = direct, 2 = connecting
                            segments="NULL",  # per-segment info for connecting itineraries
                            getTime=int(time.mktime(
                                datetime.now().timetuple())),
                        ))
                        found = True
                        yield item
                except Exception as e:
                    self.log(e, 40)

        if not found:
            # Mark the route/day as invalid
            _day = _day.replace('-', '')
            data = {'fromCity': FROM, 'toCity': TO, 'date': _day}
            push_date(settings.PUSH_DATA_URL,
                      params={'carrier': self.spider_name},
                      action='invalid',
                      data_array=[data])
            self.log('%s-%s: %s no flights' % (FROM, TO, _day), level=20)
コード例 #5
0
    def parse(self, response):
        """Yield the cheapest fare of every single-segment leg option.

        The response body is decoded as JSON and searched with jsonpath for
        ``$..OutboundOptions.Option``.  Leg options whose ``SegmentOption``
        is a list are skipped.  For each remaining leg, all entries matched
        by ``$..FareOption.*`` are priced (discount fare total plus the sum
        of the leg's surcharge totals) and only the one with the lowest
        ``adultPrice`` is kept; it is yielded unless its cabin is
        ``SkyBoss``.

        ``self.is_ok`` is set to True once the body has been decoded and to
        False when any exception occurs afterwards; in that case the body
        and the traceback are logged and the generator ends.
        """
        meta = response.meta
        logging.debug('proxy: %s' % meta.get('proxy'))
        # ``except Exception`` (rather than a bare ``except``) lets
        # GeneratorExit pass through, so closing the generator at a ``yield``
        # is no longer recorded as a parse failure.
        try:
            response_dict = json.loads(response.body)
            self.is_ok = True
            results = jsonpath.jsonpath(response_dict,
                                        '$..OutboundOptions.Option')
            LegOption = jsonpath.jsonpath(results, '$..LegOption')
            DepartureDate = jsonpath.jsonpath(results, '$..DepartureDate')
            if DepartureDate and DepartureDate[0]:
                for result in LegOption:
                    SegmentOptions = jsonpath.jsonpath(result,
                                                       '$..SegmentOptions')[0]

                    # A list here means several segments: skip the leg
                    segmentOption = SegmentOptions.get('SegmentOption')
                    if isinstance(segmentOption, list):
                        continue

                    Surcharge = jsonpath.jsonpath(result,
                                                  '$.Surcharges.Surcharge')
                    Total = jsonpath.jsonpath(Surcharge, '$..Total')
                    Total = sum(Total) if Total else 0

                    Flight = jsonpath.jsonpath(SegmentOptions, '$..Flight')
                    flightNumber = jsonpath.jsonpath(Flight, '$..Number')[0]
                    depTime = jsonpath.jsonpath(Flight, '$..ETDLocal')[0]
                    arrTime = jsonpath.jsonpath(Flight, '$..ETALocal')[0]
                    DepartureAirport = jsonpath.jsonpath(
                        Flight, '$..DepartureAirport.Code')[0]
                    ArrivalAirport = jsonpath.jsonpath(
                        Flight, '$..ArrivalAirport.Code')[0]

                    FareOption = jsonpath.jsonpath(result, '$..FareOption.*')

                    from_city = self.city_airport.get(DepartureAirport,
                                                      DepartureAirport)
                    to_city = self.city_airport.get(ArrivalAirport,
                                                    ArrivalAirport)

                    # Lowest-priced item seen so far for this leg
                    cheapest = None
                    for rec in FareOption:
                        DiscountFare = jsonpath.jsonpath(
                            rec, '$..DiscountFare')[0]
                        DiscountFareTaxes = jsonpath.jsonpath(
                            rec, '$..DiscountFareTaxes')[0]
                        DiscountFareTotal = jsonpath.jsonpath(
                            rec, '$..DiscountFareTotal')[0]
                        Abbreviation = jsonpath.jsonpath(
                            rec, '$..Currency.Abbreviation')[0]
                        SeatsAvailable = jsonpath.jsonpath(
                            rec, '$..SeatsAvailable')[0]
                        FareCategory = jsonpath.jsonpath(
                            rec, '$..FareCategory')[0]

                        # A fresh item per fare replaces the single shared
                        # item that used to be mutated and copied.
                        item = FlightsItem()
                        item.update(dict(
                            flightNumber=flightNumber,  # flight number
                            depTime=int(time.mktime(time.strptime(
                                depTime, "%Y-%m-%dT%H:%M:%S"))),  # departure time
                            arrTime=int(time.mktime(time.strptime(
                                arrTime, "%Y-%m-%dT%H:%M:%S"))),  # arrival time
                            fromCity=from_city,  # departure city
                            toCity=to_city,  # arrival city
                            depAirport=DepartureAirport,  # departure airport
                            arrAirport=ArrivalAirport,  # arrival airport
                            currency=Abbreviation,  # currency
                            adultPrice=DiscountFareTotal + Total,  # adult fare
                            adultTax=DiscountFareTaxes + Total,  # tax
                            netFare=DiscountFare,  # net fare
                            maxSeats=SeatsAvailable,  # bookable seats
                            cabin=FareCategory,  # cabin
                            carrier=flightNumber[:2],  # airline
                            isChange=1,  # transfer flag: 1 = direct, 2 = connecting
                            segments="[]",  # per-segment info for connecting itineraries
                            getTime=int(time.mktime(
                                datetime.now().timetuple())),
                        ))
                        if (cheapest is None
                                or item['adultPrice'] < cheapest['adultPrice']):
                            cheapest = item

                    # Nothing is yielded for a leg without fares or whose
                    # cheapest fare is SkyBoss (previously ``None`` was
                    # yielded for the latter).
                    if cheapest is not None and cheapest['cabin'] != 'SkyBoss':
                        yield cheapest
        except Exception:
            self.is_ok = False
            logging.error(response.body)
            # The message needs a placeholder for the traceback argument;
            # without it logging fails to format the record and the
            # traceback is lost.
            logging.error('error ddddd %s', traceback.format_exc())
コード例 #6
0
    def parse(self, response):
        """Schedule per-date searches and parse non-stop fares from the page.

        If ``response.meta`` carries ``available_dates``, one search request
        is yielded for each date, reusing the session path segment extracted
        from ``response.url``. Independently of that, every ``fare-row`` of
        the availability form is parsed and yielded as a ``FlightsItem``.
        Sold-out rows, connecting journeys and rows whose hidden fare inputs
        are missing or malformed are skipped.

        :param response: page response; ``meta`` supplies ``_from``, ``_to``
            and optionally ``available_dates``.
        """
        meta = response.meta
        _from = meta.get('_from')
        _to = meta.get('_to')
        available_dates = meta.get('available_dates')

        if available_dates:
            # The first path segment of the URL is reused as the session id.
            session = re.match(r'https://.*?/(.*?)/.*', response.url).group(1)
            for _date in available_dates:
                params = {
                    's': True,
                    'o1': _from,
                    'd1': _to,
                    'dd1': _date,
                    'ADT': self.search_seat,
                    'mon': True,
                    'bpc': False,
                }
                total_url = self.start_urls.format(
                    s=session + '/') + parse.urlencode(params)
                yield scrapy.Request(total_url,
                                     meta={
                                         '_from': _from,
                                         '_to': _to,
                                         '_date': _date
                                     })

        # Parse the availability table of the current page.
        form_container = response.xpath(
            '//div[@id="js_availability_container"]/form')
        div_fare_row = form_container.xpath(
            'div[contains(@class, "fare-row")]')
        for fare_row in div_fare_row:
            div_outbound = fare_row.xpath('div[@data-class-index="0"]/div/div')
            # Sold out: skip.
            if not div_outbound:
                continue

            input_value = div_outbound.xpath(
                'div[@class="fare-price-and-currency"]/input/@value').extract(
                )
            # The sell key is read from index 0 and the price from index 2;
            # a row lacking them is skipped instead of aborting the page.
            if len(input_value) < 3:
                continue
            # A '^' in the sell key marks a connecting journey: skip.
            if '^' in input_value[0]:
                continue

            try:
                raw_values = input_value[0].split('|')[1].split('~')
                # Drop whitespace-only fields. A real list (not a lazy
                # filter object) keeps this identical on Python 2 and 3.
                flight_values = [
                    value for value in raw_values
                    if re.sub(r'\s*', '', value)
                ]
                (carrier, flight_number, dep_airport, dep_date, arr_airport,
                 arr_date) = flight_values
            except (IndexError, ValueError) as e:
                print(e)
                print(input_value[0])
                continue

            fromCity = self.city_airport.get(dep_airport, dep_airport)
            toCity = self.city_airport.get(arr_airport, arr_airport)
            # Leading non-digits are the currency marker, the rest the amount.
            currency, price = re.match(r'(^\D*)?(.*)',
                                       input_value[2]).groups()
            price = re.sub(r',', '', price)
            currency = self.currency_cache.get(currency, currency)

            item = FlightsItem()
            item.update(
                dict(
                    flightNumber=carrier + flight_number.strip(),  # flight number
                    depTime=int(
                        time.mktime(
                            time.strptime(dep_date,
                                          "%m/%d/%Y %H:%M"))),  # departure time
                    arrTime=int(
                        time.mktime(
                            time.strptime(arr_date,
                                          "%m/%d/%Y %H:%M"))),  # arrival time
                    fromCity=fromCity,  # departure city
                    toCity=toCity,  # arrival city
                    depAirport=dep_airport,  # departure airport
                    arrAirport=arr_airport,  # arrival airport
                    currency=currency,  # currency
                    adultPrice=float(price),  # adult fare
                    adultTax=0,  # tax
                    netFare=float(price),  # net fare
                    maxSeats=self.search_seat,  # bookable seats
                    cabin='E',  # cabin
                    carrier=carrier,  # airline
                    isChange=1,  # 1 = direct, 2 = connecting
                    segments="NULL",  # per-leg details for connecting flights
                    getTime=int(time.time()),
                ))

            yield item
コード例 #7
0
ファイル: dy.py プロジェクト: Biking0/spider_project
    def parse(self, response):
        """Parse the availability JSON and yield one item per non-stop route.

        For every outbound route that is not a transit route, the cheapest
        fare bucket with a non-zero ``fareValue`` is selected and emitted as
        a ``FlightsItem``; the ``standardLowFarePlus`` and ``standardFlex``
        buckets are additionally serialised into ``segments`` as
        ``[price, seats]`` pairs. When no item was produced, the
        route/date is reported as invalid through ``push_date``.

        Missing ``availabilityResults``, ``routeListOutbound`` or individual
        fare buckets are treated as "no data" rather than raising.

        :param response: JSON response; ``meta`` supplies ``FROM``, ``TO``
            and ``_day``.
        """
        meta = response.meta
        FROM = meta.get('FROM')
        TO = meta.get('TO')
        _day = meta.get('_day')

        response_json = json.loads(response.text)
        # An absent result section falls through to the invalidation below.
        availabilityResults = response_json.get('availabilityResults') or {}
        currency = availabilityResults.get('currency')
        routeListOutbound = availabilityResults.get('routeListOutbound') or []

        # Standard-cabin buckets first, then the premium ones.
        fare_keys = ('standardIdFare', 'standardLowFare',
                     'standardLowFarePlus', 'standardFlex',
                     'premiumLowFare', 'premiumFlex')
        # Buckets reported as bundles in ``segments``.
        bundle_keys = ('standardLowFarePlus', 'standardFlex')

        item = None
        for rec in routeListOutbound:
            # Only direct routes are handled.
            if rec.get('isTransit'):
                continue

            # Flight number
            flightList = rec.get('flightList')[0]
            flightCode = flightList.get('flightCode')

            # City / airport
            origin = rec.get('origin')
            destination = rec.get('destination')
            depAirport = origin.get('code')
            arrAirport = destination.get('code')
            fromCity = self.city_airport.get(depAirport, depAirport)
            toCity = self.city_airport.get(arrAirport, arrAirport)

            # Times
            departureTime = rec.get('departureTime')
            arrivalTime = rec.get('arrivalTime')

            # Bundles: a missing or zero-priced bucket is reported as [0, 0].
            segments = []
            for key in bundle_keys:
                bundle = rec.get(key) or {}
                bundle_price = bundle.get('fareValue')
                if not bundle_price:
                    segments.append([0, 0])
                else:
                    segments.append(
                        [round(bundle_price, 2), bundle.get('seatsAvailable')])

            # Keep only buckets that exist and carry a price.
            priced = [
                fare for fare in (rec.get(key) for key in fare_keys)
                if fare and fare.get('fareValue')
            ]
            if priced:
                # Descending sort, take the last element: the cheapest bucket
                # (ties resolve to the later bucket in ``fare_keys`` order).
                fare_data = sorted(priced,
                                   key=lambda x: x.get('fareValue'),
                                   reverse=True)[-1]
            else:
                fare_data = rec.get('standardLowFare') or {}

            # Lowest price, cabin, seats
            fareValue = fare_data.get('fareValue') or 0
            cabin = fare_data.get('bookingClass', 'E')
            seatsAvailable = fare_data.get('seatsAvailable')

            item = FlightsItem()
            item.update(dict(
                flightNumber=flightCode,  # flight number
                # e.g. "2018-04-02T06:15:00"
                depTime=int(time.mktime(time.strptime(
                    departureTime, "%Y-%m-%dT%H:%M:%S"))),  # departure time
                arrTime=int(time.mktime(time.strptime(
                    arrivalTime, "%Y-%m-%dT%H:%M:%S"))),  # arrival time
                fromCity=fromCity,  # departure city
                toCity=toCity,  # arrival city
                depAirport=depAirport,  # departure airport
                arrAirport=arrAirport,  # arrival airport
                currency=currency,  # currency
                adultPrice=float(fareValue),  # adult fare
                adultTax=0,  # tax
                netFare=float(fareValue),  # net fare
                maxSeats=seatsAvailable,  # bookable seats
                cabin=cabin,  # cabin
                carrier=flightCode[:2],  # airline
                isChange=1,  # 1 = direct, 2 = connecting
                segments=json.dumps(segments),  # bundle [price, seats] pairs
                getTime=int(time.time()),
            ))
            yield item

        if item is None:
            # Nothing was produced: mark the route/date as invalid.
            _day = _day.replace('-', '')
            data = {'fromCity': FROM, 'toCity': TO,
                    'date': _day}
            push_date(self.settings.get('PUSH_DATA_URL'),
                      params={'carrier': self.spider_name},
                      action='invalid', data_array=[data])
            self.log('%s-%s: %s no flights' % (FROM, TO, _day), level=20)
コード例 #8
0
    def parse(self, response):
        """Parse the departure table and yield one item per single-leg row.

        If the body contains the captcha marker, the spider is flagged as
        banned and the originating request is yielded again. Otherwise each
        row of ``#depart-table`` that has exactly one ``th/div`` is turned
        into a ``FlightsItem``. The first fare label provides the base fare,
        fee, fare class and total; the second and third labels, when present,
        are stored in ``segments`` under ``F_B`` and ``F_B_M``. If no item
        was produced, the route/date is reported as invalid via
        ``push_date``.

        :param response: page response; ``meta`` supplies ``_from``, ``_to``,
            ``_date`` and ``proxy``.
        """
        meta = response.meta
        _from = meta.get('_from')
        _to = meta.get('_to')
        _date = meta.get('_date')

        # Captcha wall. ``response.body`` is bytes, so the marker is a bytes
        # literal: this works on Python 2 and 3 alike.
        if b'Are you human?' in response.body:
            self.banned = True
            self.log('\t\t be banned, retry...', 20)
            yield response.request
            return

        # Regular page.
        self.log('available proxy: %s' % response.meta['proxy'], 20)

        self.banned = False
        from_city = self.city_airport.get(_from, _from)
        to_city = self.city_airport.get(_to, _to)

        tr_set = response.xpath(
            '//*[@id="depart-table"]/tbody/tr[@class!="flight-legend"]')
        tr = [row for row in tr_set if len(row.xpath('th/div')) == 1]

        item = None
        for t in tr:
            flight_number = re.sub(
                r'\s*', '',
                t.xpath(
                    'th//span[contains(@class, "flight-number")]/text()').
                extract_first())

            # A <sup> marker means the arrival falls on the next day.
            sup = t.xpath(
                'td[contains(@class, "visible-sm visible-xs")]/div//sup/text()'
            ).extract_first()
            time_cells = [
                re.sub(r'\s', '', text) for text in t.xpath(
                    'td[contains(@class, "visible-sm visible-xs")]/div/div/text()'
                ).extract()
            ]
            if not sup:
                dep_time, arr_time, _ = time_cells
                arr_date = _date
            else:
                # One extra text node is present in this case.
                dep_time, arr_time, _, _ = time_cells
                arr_date = self._add_one(_date)

            dep_airport, arr_airport = t.xpath(
                'td[@class="avail-table-vert text-center"]/div/div/text()'
            ).extract()

            # Fare details
            label_l = t.xpath(
                'td[contains(@class, "fare-bundle-radio-container")]/div/label'
            )
            label = label_l[0]
            base_fare = label.xpath('@data-basefare').extract_first()
            tax = label.xpath('@data-webadminfee').extract_first()
            cabin = label.xpath('@data-fareclass').extract_first()
            fare_text = re.sub(r'\s|,', '',
                               label.xpath('text()').extract_first())
            # First three characters are the currency, the rest the amount.
            currency, total_fare = fare_text[:3], fare_text[3:]
            if not total_fare:
                total_fare = 0.00
                base_fare = 0.00
                tax = 0.00
                currency = ''

            # Optional bundle labels: stop at the first one that is absent,
            # leaving the remaining entries at 0.
            segments = {'F_B': 0, 'F_B_M': 0}
            for key, index in (('F_B', 1), ('F_B_M', 2)):
                if index >= len(label_l):
                    break
                bundle_text = label_l[index].xpath('text()').extract_first()
                if bundle_text is None:
                    break
                segments[key] = re.sub(r'\s|,', '', bundle_text)

            item = FlightsItem()
            item.update(
                dict(
                    flightNumber=flight_number,  # flight number
                    depTime=int(
                        time.mktime(
                            time.strptime(_date + dep_time,
                                          "%Y-%m-%d%H%MH"))),  # departure time
                    arrTime=int(
                        time.mktime(
                            time.strptime(arr_date + arr_time,
                                          "%Y-%m-%d%H%MH"))),  # arrival time
                    fromCity=from_city,  # departure city
                    toCity=to_city,  # arrival city
                    depAirport=dep_airport,  # departure airport
                    arrAirport=arr_airport,  # arrival airport
                    currency=currency,  # currency
                    adultPrice=float(total_fare),  # adult fare
                    adultTax=float(tax),  # tax
                    netFare=float(base_fare),  # net fare
                    maxSeats=self.search_seat,  # bookable seats
                    cabin=cabin,  # cabin
                    carrier=flight_number[:2],  # airline
                    isChange=1,  # 1 = direct, 2 = connecting
                    segments=json.dumps(segments),  # bundle prices
                    getTime=int(time.time()),
                ))

            yield item

        # No data: mark the fare as invalid.
        if not item:
            data = {'fromCity': _from, 'toCity': _to, 'date': _date}
            push_date(settings.PUSH_DATA_URL,
                      params={'carrier': self.spider_name},
                      action='invalid',
                      data_array=[data])
            self.log('[%s] %s-%s no flights' % (_date, _from, _to), 20)
コード例 #9
0
    def parse(self, response):
        """Parse trip JSON and yield the cheapest fare of each direct trip.

        A 500 response, or a payload without ``tripInfo`` entries, marks the
        route/date as invalid via ``push_date``. Otherwise, for each trip
        with exactly one segment, one ``FlightsItem`` is built per booking
        class and the cheapest one is yielded, preferring classes with more
        than two seats. Trips with no usable booking class yield nothing.
        The first multi-segment trip encountered stops the loop.

        :param response: JSON response; ``meta`` supplies ``_from``, ``_to``
            and ``_date``.
        """
        meta = response.meta
        _from = meta.get('_from')
        _to = meta.get('_to')
        _date = meta.get('_date')

        # Payload sent to mark this route/date as invalid.
        invalid_data = [{'fromCity': _from, 'toCity': _to, 'date': _date}]

        if response.status == 500:
            self.log((_from, _to, _date), 20)
            self.log(response.text, 20)
            push_date(settings.PUSH_DATA_URL,
                      params={'carrier': self.spider_name},
                      action='invalid',
                      data_array=invalid_data)
            return

        response_dict = json.loads(response.text)
        tripInfo = jsonpath(response_dict, '$..tripInfo.*')
        if not tripInfo:
            push_date(settings.PUSH_DATA_URL,
                      params={'carrier': self.spider_name},
                      action='invalid',
                      data_array=invalid_data)
            return

        for rec in tripInfo:
            segmentInfo_list = rec['segmentInfo']
            if len(segmentInfo_list) != 1:
                # Connecting trip: stop processing the remaining trips.
                print(_from, _to, _date)
                print('is_change')
                break

            segmentInfo = segmentInfo_list[0]
            carrierCode = jsonpath(
                segmentInfo, '$.flightIdentifierInfo.carrierCode')[0]
            flightNumber = jsonpath(
                segmentInfo, '$.flightIdentifierInfo.flightNumber')[0]

            dep_airport = jsonpath(segmentInfo,
                                   '$.departureInfo.airportCode')[0]
            departureDateTime = jsonpath(segmentInfo,
                                         '$.departureDateTime')[0]
            # Drop the parenthesised part so the value matches the
            # '%Y-%m-%d %H:%M' format used below.
            dep_date = ''.join(
                re.match(r'(.*)\(.*\)(.*)', departureDateTime).groups())

            arr_airport = jsonpath(segmentInfo,
                                   '$.arrivalInfo.airportCode')[0]
            arrivalDateTime = jsonpath(segmentInfo, '$.arrivalDateTime')[0]
            arr_date = ''.join(
                re.match(r'(.*)\(.*\)(.*)', arrivalDateTime).groups())

            # Cities
            fromCity = self.city_airport.get(dep_airport, dep_airport)
            toCity = self.city_airport.get(arr_airport, arr_airport)

            # One candidate item per booking class.
            items = []
            for rec_price in segmentInfo['segmentAvailability']:
                bookingClass = rec_price['bookingClass']
                # Skip classes without data.
                if not bookingClass:
                    continue
                seatAvailablity = rec_price['seatAvailablity']
                displayFareAmount = rec_price['displayFareAmount']
                taxAmount = rec_price['taxAmount']
                displayFareCurrencyCode = rec_price[
                    'displayFareCurrencyCode']

                item = FlightsItem()
                item.update(
                    dict(
                        flightNumber='%s%s' %
                        (carrierCode, flightNumber),  # flight number
                        depTime=int(
                            time.mktime(
                                time.strptime(
                                    dep_date,
                                    '%Y-%m-%d %H:%M'))),  # departure time
                        arrTime=int(
                            time.mktime(
                                time.strptime(
                                    arr_date,
                                    '%Y-%m-%d %H:%M'))),  # arrival time
                        fromCity=fromCity,  # departure city
                        toCity=toCity,  # arrival city
                        depAirport=dep_airport,  # departure airport
                        arrAirport=arr_airport,  # arrival airport
                        currency=displayFareCurrencyCode,  # currency
                        adultPrice=displayFareAmount +
                        taxAmount,  # adult fare
                        adultTax=taxAmount,  # tax
                        netFare=displayFareAmount,  # net fare
                        maxSeats=seatAvailablity,  # bookable seats
                        cabin=bookingClass,  # cabin
                        carrier=carrierCode,  # airline
                        isChange=1,  # 1 = direct, 2 = connecting
                        segments="NULL",  # per-leg details
                        getTime=int(time.time()),
                    ))
                items.append(item)

            # min() raises on an empty sequence, so a trip with no usable
            # booking class is skipped.
            if not items:
                continue

            # Prefer fares with more than two seats. A real list is needed:
            # on Python 3 a filter object is always truthy, which would send
            # an empty selection to min().
            gt_2_items = [x for x in items if x['maxSeats'] > 2]
            yield min(gt_2_items or items, key=lambda x: x['adultPrice'])
コード例 #10
0
    def parse(self, response):
        """Parse available flights and yield the cheapest fare of each one.

        For every entry of ``AvailableFlights`` the fares in ``FlightFares``
        are scanned: the tax is obtained from ``self.get_tax`` and the
        lowest non-zero price-plus-tax wins. The ``Flexi`` fare, if present,
        is stored in ``segments`` as ``[[price, seats]]``. A flight with no
        priced fare is yielded with price, tax and seats set to 0.

        Any exception raised while parsing is logged and every day in
        ``meta['_day']`` is reported as invalid via ``push_date``.

        :param response: JSON response; ``meta`` supplies ``FROM``, ``TO``
            and ``_day`` (an iterable of ``YYYY-MM-DD`` strings).
        """
        # Only needed for the invalidation payload below.
        from_port = response.meta.get('FROM')
        to_port = response.meta.get('TO')
        FROM = self.city_airport.get(from_port, from_port)
        TO = self.city_airport.get(to_port, to_port)
        list_day = response.meta.get('_day')

        response_dict = json.loads(response.body)
        try:
            AvailableFlights = response_dict.get('AvailableFlights')
            DisplayCurrencyCode = response_dict.get('DisplayCurrencyCode')

            for i in AvailableFlights:
                price_pack = [[0, 0]]
                # None means "no priced fare seen yet" (replaces the
                # Python-2-only sys.maxint sentinel).
                adult_price = None
                seats = Price = tax = 0
                for price_item in i.get('FlightFares'):
                    i_Price = jsonpath.jsonpath(price_item, '$..Price')
                    if not i_Price:
                        continue
                    i_Price = i_Price[0]
                    PriceWithDebitCard = jsonpath.jsonpath(
                        price_item, '$..PriceWithDebitCard')[0]
                    i_tax = self.get_tax(i_Price, PriceWithDebitCard)
                    i_seat = price_item.get('LowestFareSeatsAvailable')

                    i_adult_price = i_Price + i_tax
                    if price_item.get('FareType') == 'Flexi':
                        price_pack[0] = [i_adult_price, i_seat]
                    if i_adult_price != 0 and (adult_price is None
                                               or i_adult_price < adult_price):
                        adult_price, seats, Price, tax = (i_adult_price,
                                                          i_seat, i_Price,
                                                          i_tax)
                if adult_price is None:
                    adult_price = 0

                depTime = i.get('LocalDepartureTime')
                arrTime = i.get('LocalArrivalTime')

                dep_port = i.get('DepartureIata')
                arr_port = i.get('ArrivalIata')

                from_city = self.city_airport.get(dep_port, dep_port)
                to_city = self.city_airport.get(arr_port, arr_port)

                # A fresh item per flight: reusing one instance would make
                # every yielded item alias the same mutable object.
                item = FlightsItem()
                item['flightNumber'] = "U2%s" % i.get('FlightNumber')  # flight number
                item['depTime'] = int(
                    time.mktime(
                        time.strptime(depTime,
                                      "%Y-%m-%dT%H:%M:%S")))  # departure time
                item['arrTime'] = int(
                    time.mktime(
                        time.strptime(arrTime,
                                      "%Y-%m-%dT%H:%M:%S")))  # arrival time
                item['fromCity'] = from_city  # departure city
                item['toCity'] = to_city  # arrival city
                item['depAirport'] = dep_port  # departure airport
                item['arrAirport'] = arr_port  # arrival airport
                item['currency'] = self.currency_cache.get(
                    DisplayCurrencyCode, DisplayCurrencyCode)  # currency
                item['adultPrice'] = adult_price  # adult fare
                item['adultTax'] = tax  # tax
                item['netFare'] = Price  # net fare
                item['maxSeats'] = seats  # bookable seats
                item['cabin'] = "Y"  # cabin
                item['carrier'] = "U2"  # airline
                item['isChange'] = 1  # 1 = direct, 2 = connecting
                item['segments'] = json.dumps(price_pack)  # Flexi [price, seats]
                item['getTime'] = int(
                    time.mktime(datetime.now().timetuple()))
                yield item
        except Exception as e:
            self.log(e, level=40)

            # Mark every requested day as invalid.
            params = {'carrier': 'U2'}
            data_array = [
                {
                    'fromCity': FROM,
                    'toCity': TO,
                    # 'YYYY-MM-DD' -> 'YYYYMMDD'
                    'date': re.sub(r'(\d+)-(\d+)-(\d+)', r'\1\2\3', _day),
                }
                for _day in list_day
            ]

            push_date(settings.PUSH_DATA_URL,
                      params=params,
                      action='invalid',
                      data_array=data_array)
            self.log('%s-%s: %s no flights' % (FROM, TO, list_day), level=20)
コード例 #11
0
ファイル: lx.py プロジェクト: Biking0/spider_project
    def parse(self, response):
        """Parse the fare matrix, yield cheapest fares and follow day links.

        Every non-multiflight ``book_bundle_row`` is read for its date,
        airports, times and flight number; one ``FlightsItem`` is built per
        bookable cabin button. The items are grouped by flight number and
        the cheapest of each group is yielded. When ``response.meta``
        contains ``origin`` and at least one row was seen, the day-selection
        links for other dates are requested with the same cookies.

        The engine is unpaused when the generator finishes, including when
        parsing raises, so a malformed page cannot leave the crawl paused.

        :param response: page response of the fare matrix.
        """
        meta = response.meta
        self.is_ok = True

        try:
            # Rows of the matrix, excluding multi-flight (connecting) rows.
            div_list = response.xpath(
                '//*[@id="frm-matrix"]/div[2]/div[contains(@class, "book_bundle_row") and not(contains(@class, "has-multiflight"))]'
            )

            items = list()
            flight_date = None
            for div in div_list:
                row_header = div.xpath('div[@class="book_bundle_row--header"]')
                # The hidden heading carries the date used for timestamps, e.g.
                # u'Sunday 08/04/2018, London (LHR) ab 12:05, Zurich (ZRH) an 14:45.  Economy from GBP 228.  Business from not available. Operated by SWISS GLOBAL AIR LINES. '
                h3 = row_header.xpath(
                    'div[@class="is-visuallyhidden"]/h3/text()').extract_first()
                flight_date = re.match(r'\w+\s+(.*?),.*', h3).group(1)

                flightentry = row_header.xpath(
                    'div[@class="book_bundle_row--flightinfo"]/div[@class="book-bundle-flightentry"]'
                )

                # Airports and times
                flightentry_time = flightentry.xpath(
                    'div[@class="book-bundle-flightentry--time"]')
                # Departure airport and time
                departure = flightentry_time.xpath(
                    'div[@class="book-bundle-flightentry--departure"]')
                depAirport = departure.xpath('text()').extract_first().strip()
                dep_time = departure.xpath(
                    'strong/text()').extract_first().strip()
                dep_data = flight_date + dep_time
                # Arrival airport and time
                arrival = flightentry_time.xpath(
                    'div[@class="book-bundle-flightentry--arrival"]')
                arrAirport = arrival.xpath('text()').extract_first().strip()
                arr_time = arrival.xpath(
                    'strong/text()').extract_first().strip()
                # A <sub> next to the arrival time is a day offset.
                sub = arrival.xpath('strong/sub/text()').extract_first()
                arr_date = flight_date + arr_time
                if sub:
                    arr_date = self.add_date(arr_date, int(sub))

                # Cities
                fromCity = self.city_airport.get(depAirport, depAirport)
                toCity = self.city_airport.get(arrAirport, arrAirport)

                flightentry_info = flightentry.xpath(
                    'div[@class="book-bundle-flightentry--info"]/div[@class="book-bundle-flightentry--metainfo"]'
                )
                # Flight number
                flightentry_number = flightentry_info.xpath(
                    'a[@class="book-bundle-flightentry--number"]/text()'
                ).extract_first()
                flightNumber = re.sub(r'\s*', '', flightentry_number)

                # NOTE: codeshare flights are not filtered out here.

                buttons = row_header.xpath(
                    'div[@class="book_bundle_row--buttons"]')
                li_list = buttons.xpath('div/ul/li')
                for li in li_list:
                    li_button = li.xpath('button')
                    # No button: the cabin is sold out, skip it.
                    if not li_button:
                        continue
                    span = li_button.xpath('span/text()').extract()
                    cabin, currency, price, _ = span
                    currency = re.match(r'from\s(.*)', currency).group(1)
                    if currency == u'¥':
                        currency = u'CNY'
                    price = re.sub(r'\W', '', price)

                    # Remaining seats; defaults to 9 when the page gives no
                    # number. The greedy prefix makes group(1) the last digit
                    # in the text.
                    # NOTE(review): a parsed value is a str, the default is
                    # an int -- confirm what consumers expect.
                    div_text = li.xpath('div/text()').extract_first()
                    seats_match = re.match(
                        r'.*(\d).*', div_text,
                        flags=re.DOTALL) if div_text else None
                    left_seats = seats_match.group(1) if seats_match else 9

                    item = FlightsItem()
                    item.update(
                        dict(
                            flightNumber=flightNumber,  # flight number
                            depTime=int(
                                time.mktime(
                                    time.strptime(
                                        dep_data,
                                        "%d/%m/%Y%H:%M"))),  # departure time
                            arrTime=int(
                                time.mktime(
                                    time.strptime(
                                        arr_date,
                                        "%d/%m/%Y%H:%M"))),  # arrival time
                            fromCity=fromCity,  # departure city
                            toCity=toCity,  # arrival city
                            depAirport=depAirport,  # departure airport
                            arrAirport=arrAirport,  # arrival airport
                            currency=currency[-3:],  # currency
                            adultPrice=float(price),  # adult fare
                            adultTax=0,  # tax
                            netFare=float(price),  # net fare
                            maxSeats=left_seats,  # bookable seats
                            cabin=cabin,  # cabin
                            carrier=flightNumber[:2],  # airline
                            isChange=1,  # 1 = direct, 2 = connecting
                            segments="NULL",  # per-leg details
                            getTime=int(time.time()),
                        ))
                    # Collected for the price comparison below.
                    items.append(item)

            # Group by flight number and yield the cheapest of each group
            # (groupby needs its input sorted by the same key).
            items = sorted(items, key=lambda x: x['flightNumber'])
            items_group = groupby(items, key=lambda x: x['flightNumber'])
            for k, g in items_group:
                yield min(g, key=lambda x: x['adultPrice'])

            # Follow the day-selection links.
            if 'origin' in meta and flight_date is not None:
                a_list = response.xpath(
                    '//*[@id="matrixDaySelection"]/ul/li/a[@data-has-module="yes"]/@href'
                ).extract()
                # Drop the link for the date that was just parsed.
                a_list_s = [
                    link for link in a_list if
                    parse_qs(link.split('?')[1])['selectedDate'][0] != flight_date
                ]
                for url_path in a_list_s:
                    total_url = self.start_urls + url_path
                    yield scrapy.Request(total_url,
                                         cookies=response.request.cookies,
                                         callback=self.parse,
                                         dont_filter=True)
        finally:
            # Resume the engine even if parsing failed.
            self.crawler.engine.unpause()
コード例 #12
0
    def parse(self, response):
        """Yield one ``FlightsItem`` per direct flight in the departure list.

        Each journey's ``data-sellkey`` attribute is split into carrier,
        number, airports, dates and times; keys containing ``^`` (connecting
        flights) are skipped. When a price is shown, the ``SMART`` and
        ``BIZclass`` tariffs that are not sold out are stored in ``segments``
        as ``[price, seats]`` pairs, each clamped so it is never lower than
        the preceding price. Without a price the flight is emitted with
        zero price and seats and an empty ``segments`` list.
        """
        meta = response.meta
        FROM = meta.get('FROM')
        TO = meta.get('TO')
        _day = meta.get('_day')

        journeys = response.xpath(
            '//*[@id="tripDeparture"]/div[@class="tripJourneys"]/div[contains(@class, "tripJourneyDate")]/div[contains(@class, "tripJourneyDateFlight")]'
        )
        for journey in journeys:
            anchor = journey.xpath('a[contains(@class, "list-box")]')
            sell_key = anchor.xpath('@data-sellkey').extract_first()
            # Connecting flights are not handled.
            if '^' in sell_key:
                continue

            fields = [
                token for token in re.split(r'[~|\s]+', sell_key) if token
            ]
            (carrier, number, depAirport, date_from, time_from, arrAirport,
             date_to, time_to) = fields

            # A missing price element means the flight is sold out.
            price_str = anchor.xpath(
                'div[@class="flight-wrapper"]//div[contains(@class, "flight-price-wrapper")]/span[contains(@class,"price")]/span[@class="value"]/text()'
            ).extract_first()
            if not price_str:
                currency = ''
                price = 0
                maxSeats = 0
                price_dif = None
            else:
                symbol, amount = re.match(r'(\D.*?)(\d.*)',
                                          price_str).groups()
                price = float(re.sub(r',', '', amount))
                symbol = symbol.strip()
                currency = self.currency_cache.get(symbol, symbol)
                maxSeats = 5
                price_dif = [[0, 0], [0, 0]]
                tariff_names = ['SMART', 'BIZclass']
                open_tariffs = journey.xpath(
                    'div[@class="tariffList"]/div[contains(@class, "tripFare") and not(contains(@class, "sold-out"))]'
                )
                # Running floor: a tariff is never reported below the base
                # price or below a previously seen tariff.
                floor = price
                for tariff in open_tariffs:
                    name = tariff.xpath('./@data-tariff').extract()[0]
                    if name not in tariff_names:
                        continue
                    raw_price = tariff.xpath('./@data-price').extract()[0]
                    # Amount of this tariff, without thousands separators.
                    tariff_price = float(
                        re.findall(r'.*?(\d.*)',
                                   raw_price)[0].replace(',', ''))
                    if tariff_price > floor:
                        floor = tariff_price
                    price_dif[tariff_names.index(name)] = [floor, maxSeats]

            fromCity = self.city_airport.get(depAirport, depAirport)
            toCity = self.city_airport.get(arrAirport, arrAirport)

            dep_stamp = int(
                time.mktime(
                    time.strptime(date_from + time_from, "%m/%d/%Y%H:%M")))
            arr_stamp = int(
                time.mktime(
                    time.strptime(date_to + time_to, "%m/%d/%Y%H:%M")))
            bundles = json.dumps(price_dif) if price_dif else '[]'

            item = FlightsItem()
            item.update(
                dict(
                    flightNumber=carrier + number,  # flight number
                    depTime=dep_stamp,  # departure time
                    arrTime=arr_stamp,  # arrival time
                    fromCity=fromCity,  # departure city
                    toCity=toCity,  # arrival city
                    depAirport=depAirport,  # departure airport
                    arrAirport=arrAirport,  # arrival airport
                    currency=currency,  # currency
                    adultPrice=price,  # adult fare
                    adultTax=0,  # tax
                    netFare=price,  # net fare
                    maxSeats=maxSeats,  # bookable seats
                    cabin='BASIC',  # cabin
                    carrier=carrier,  # airline
                    isChange=1,  # 1 = direct, 2 = connecting
                    segments=bundles,  # tariff [price, seats] pairs
                    getTime=int(time.time()),
                ))
            yield item
コード例 #13
0
    def parse(self, response):
        """Yield a ``FlightsItem`` for every fare cell of each nonstop row.

        Rows are taken from the ``availabilityForm`` table; a row is treated
        as nonstop when its detail cell contains exactly one leg ``div``.
        Every ``depart LF`` cell of such a row carries an ``<input>`` whose
        ``value`` attribute embeds the departure and arrival timestamps and
        whose ``data-json`` attribute describes the flight.

        Cells that lack the input data or the two timestamps are skipped, so
        one malformed cell cannot abort the whole generator.

        :param response: the availability page (its meta must hold ``proxy``).
        :return: generator of ``FlightsItem``.
        """
        self.log(response.meta['proxy'])

        rows = response.xpath(
            '//*[@id="availabilityForm"]//table[@class="table avail-table"]/tbody/tr[contains(@class, "-row")]'
        )
        # Drop connecting itineraries: keep rows that show exactly one leg.
        nonstop_rows = [
            row for row in rows
            if len(row.xpath(
                'td[contains(@class, "avail-table-vert")]//td[contains(@id, "icon_")]//td[@class="avail-table-detail"]/div/div'
            )) == 1
        ]
        for row in nonstop_rows:
            # One cell per fare product offered on this flight.
            for td in row.xpath('td[contains(@class, "depart LF")]'):
                seats_text = td.xpath(
                    'div//div[@class="avail-table-seats-remaining"]/text()'
                ).extract_first()
                # Read the whole number (not just its first digit) and keep
                # the value an int, like the fallback of 6 used when the
                # page shows no seats-remaining label.
                seats_match = (re.search(r'\d+', seats_text)
                               if seats_text else None)
                left_seats = int(seats_match.group()) if seats_match else 6

                input_tag = td.xpath('div//input')
                value = input_tag.xpath('@value').extract_first()
                data_json = input_tag.xpath('@data-json').extract_first()
                if not value or not data_json:
                    self.log('fare cell without booking input, skipped')
                    continue

                # First match is the departure time, second the arrival time.
                m = re.findall(r'~\w{3}~(\d{2}/\d{2}/\d{4} \d{2}:\d{2})',
                               value)
                if len(m) < 2:
                    self.log('fare cell without departure/arrival time, '
                             'skipped')
                    continue

                result = json.loads(data_json)[0]
                carrier = result.get('brand')
                flightNumber = result.get('dimension16')
                depAirport = result.get('dimension2')
                arrAirport = result.get('dimension3')
                adultPrice = result.get('price')

                # Map airport codes to city codes, defaulting to the airport.
                from_city = self.city_airport.get(depAirport, depAirport)
                to_city = self.city_airport.get(arrAirport, arrAirport)

                product = input_tag.xpath('@data-productclass').extract_first()
                cur = input_tag.xpath('@data-cur').extract_first()

                item = FlightsItem()
                item.update(
                    dict(
                        flightNumber=flightNumber,  # flight number
                        depTime=int(
                            time.mktime(
                                time.strptime(
                                    m[0],
                                    "%m/%d/%Y %H:%M"))),  # departure time
                        arrTime=int(
                            time.mktime(
                                time.strptime(
                                    m[1],
                                    "%m/%d/%Y %H:%M"))),  # arrival time
                        fromCity=from_city,  # departure city
                        toCity=to_city,  # arrival city
                        depAirport=depAirport,  # departure airport
                        arrAirport=arrAirport,  # arrival airport
                        currency=cur,  # currency
                        adultPrice=adultPrice,  # adult price
                        adultTax=0,  # tax
                        netFare=adultPrice,  # net fare
                        maxSeats=left_seats,  # bookable seats
                        cabin=product,  # cabin / fare product
                        carrier=carrier,  # airline
                        isChange=1,  # 1 = direct, 2 = connecting
                        segments="NULL",  # per-leg details for connections
                        getTime=int(time.time()),
                    ))

                yield item
コード例 #14
0
    def parse(self, response):
        """Yield the cheapest fare per flight number from a low-fare search.

        The JSON body is expected to hold an ``AirLowFareSearchRS`` object.
        When it is absent the route/date is reported as invalid through
        ``push_date`` and nothing is yielded.  Otherwise every single-segment
        flight departing on the requested date is joined with its price
        points, and for each flight number only the item with the lowest
        ``adultPrice`` is emitted.
        """
        meta = response.meta
        _from = meta.get('_from')
        _to = meta.get('_to')
        _date = meta.get('_date')

        results = json.loads(response.text).get('AirLowFareSearchRS')

        # No search result: mark this route/date as invalid and stop.
        if results is None:
            params = {'carrier': self.spider_name}
            invalid_entry = {
                'fromCity': _from,
                'toCity': _to,
                'date': re.sub(r'(\d+)-(\d+)-(\d+)', r'\1\2\3', _date),
            }
            push_date(settings.PUSH_DATA_URL,
                      params=params,
                      action='invalid',
                      data_array=[invalid_entry])
            self.log('%s-%s: %s no flights' % (_from, _to, _date), level=20)
            return

        def to_epoch(stamp):
            # Local-time timestamp string -> integer epoch seconds.
            return int(time.mktime(time.strptime(stamp, "%Y-%m-%dT%H:%M:%S")))

        candidates = []
        for flight_info in results['FlightInformationSummary'][
                'FlightInformation']:
            flight_segments = flight_info['Flight'][0]['FlightSegment']
            # Only flights leaving on the requested date ...
            if _date not in flight_segments[0]['@DepartureDate']:
                continue
            # ... and consisting of a single segment (direct flights).
            if len(flight_segments) != 1:
                continue

            segment = flight_segments[0]
            flight_id = segment['@ID']
            flightNumber = segment['@FlightNumber']
            dep_stamp = segment['@DepartureDate'][:19]
            arr_stamp = segment['@ArrivalDate'][:19]
            depAirport = segment['@OriginCode']
            arrAirport = segment['@DestinationCode']
            carrier = segment['@MarketingAirline']
            fromCity = self.city_airport.get(depAirport, depAirport)
            toCity = self.city_airport.get(arrAirport, arrAirport)

            # Seat information: find price points attached to this segment.
            for seat_info in results['FlightItineraryPricePoints'][
                    'FlightItineraryPricePoint']:
                price_ref = seat_info['PricePointRef']
                attributes = seat_info['FlightInformationAttributes'][0]
                if attributes['@FlightSegmentRef'] != flight_id:
                    continue
                cabin = attributes['@BookingClass']
                maxSeats = attributes['@SeatsAvailable']

                # Price information: resolve the referenced price point.
                for price_info in results['PricePointSummary']['PricePoint']:
                    if price_info['@ID'] != price_ref:
                        continue
                    fare = price_info['BasedOnPTCPricing']['Fare']
                    netFare = float(fare['@BaseFareAmount'])
                    fare_currency = fare['@BaseFareCurrency']
                    total_amount = float(fare['@TotalFareAmount'])

                    item = FlightsItem()
                    item.update({
                        'flightNumber': carrier + flightNumber,
                        'depTime': to_epoch(dep_stamp),
                        'arrTime': to_epoch(arr_stamp),
                        'fromCity': fromCity,
                        'toCity': toCity,
                        'depAirport': depAirport,
                        'arrAirport': arrAirport,
                        'currency': fare_currency,
                        'adultPrice': total_amount,
                        # tax = total fare minus base fare
                        'adultTax': total_amount - netFare,
                        'netFare': netFare,
                        'maxSeats': maxSeats,
                        'cabin': cabin,
                        'carrier': carrier,
                        'isChange': 1,  # 1 = direct, 2 = connecting
                        'segments': "NULL",
                        'getTime': int(time.time()),
                    })
                    candidates.append(item)

        # groupby only merges adjacent entries, so order by flight number
        # first, then emit the lowest-priced item of every group.
        candidates.sort(key=lambda x: x['flightNumber'])
        for _, same_flight in groupby(candidates,
                                      key=lambda x: x['flightNumber']):
            yield min(same_flight, key=lambda x: x['adultPrice'])
コード例 #15
0
    def parse(self, response):
        """Fan out per-date searches and yield an item per nonstop option.

        When the response meta carries ``date_array`` a POST search request
        is scheduled for every date in it (handled by this same callback).
        The current response is then scanned for options labelled
        ``Nonstop``; each one becomes a ``FlightsItem`` priced at the lowest
        ``data-p`` value found in its ``data-c="l"`` cells.

        Options without any fare data, or whose currency symbol is not in
        ``self.currency_cache``, are skipped instead of raising.  If no item
        was produced the route/date is reported as invalid via ``push_date``.
        """
        meta = response.meta
        dep_code = meta.get('dep_code')
        arr_code = meta.get('arr_code')
        _date = meta.get('_date')
        date_array = meta.get('date_array')

        if date_array:
            # The form takes city strings, e.g.
            # 'San Francisco, CA (SFO-San Francisco Intl.)'.
            _from = self.cities.get(dep_code)
            _to = self.cities.get(arr_code)
            for _date_s in date_array:
                body = {
                    'SaveFields.SelDepOptId': '-1',
                    'SaveFields.RetShldrSel': 'False',
                    'SearchFields.IsAwardBooking': 'false',
                    'SearchFields.PriceType': 'Lowest',
                    'SearchFields.UpgradeOption': 'none',
                    'ClientStateCode': 'HA',
                    'SaveFields.SelRetOptId': '-1',
                    'SaveFields.DepShldrSel': 'False',
                    'SearchFields.NumberOfTravelers': self.search_seat,
                    'SearchFields.SearchType': 'OneWay',
                    'SearchFields.IsCalendar': 'false',
                    'SearchFields.DepartureCity': _from,
                    'SearchFields.ArrivalCity': _to,
                    'SourcePage': 'Search',
                    'SearchFields.DepartureDate': _date_s,
                }
                yield scrapy.Request(method='POST',
                                     url=self.start_urls,
                                     body=parse.urlencode(body),
                                     meta={
                                         'dep_code': dep_code,
                                         'arr_code': arr_code,
                                         '_date': _date_s
                                     },
                                     cookies=response.request.cookies,
                                     callback=self.parse,
                                     errback=self.errback)

        def to_epoch(stamp):
            # Date string immediately followed by a 12-hour clock time.
            return int(time.mktime(time.strptime(stamp, "%m/%d/%Y%I:%M %p")))

        flight_day = response.xpath(
            '//div[@class="shoulderSelected"]/input/@value').extract_first()
        optionList = response.xpath(
            '//*[@id="result-0"]/ul/li[div[@class="optionDetail"]/div[@class="clear"]/a/text()="Nonstop"]'
        )
        item = None
        for li in optionList:
            # Flight number
            optionHeader = li.xpath('div[@class="optionHeader"]')
            flight_number = optionHeader.xpath(
                'div[@class="optionHeaderFltNum"]/text()').re(r'\d+')[0]
            # Airports and times
            optionDetail = li.xpath('div[@class="optionDetail"]')
            dep = optionDetail.xpath('div[@class="optionDeparts"]')
            dep_airport = dep.xpath(
                'div[@class="optionCityCode"]/text()').extract_first()
            dep_time = dep.xpath(
                'div[@class="optionTime"]/div[@class="b"]/text()'
            ).extract_first()
            dep_date = flight_day + dep_time
            arr = optionDetail.xpath('div[@class="left"]')
            arr_airport = arr.xpath(
                'div[@class="optionCityCode"]/text()').extract_first()
            arr_time = arr.xpath(
                'div[@class="optionTime"]/div[@class="b"]/text()'
            ).extract_first()
            # A day-offset marker means the flight lands on a later date.
            arrivalDaysDifferent = arr.xpath(
                'div[@class="optionTime"]/div[@class="arrivalDaysDifferent"]/text()'
            ).re(r'\d')
            if arrivalDaysDifferent:
                num = int(arrivalDaysDifferent[0])
                arr_date = self._add_date(flight_day, num) + arr_time
            else:
                arr_date = flight_day + arr_time

            # Price and currency symbol; min() and [0] would raise on an
            # option that exposes no fare, so skip such options instead.
            data_c_l_p = li.xpath('div[@data-c="l"]/@data-p').extract()
            currency_symbols = li.xpath(
                'div[@data-c="l" and @data-p]/div[@class="fareprice"]/text()'
            ).re(r'\D+')
            if not data_c_l_p or not currency_symbols:
                self.log(
                    '[%s] %s-%s option without fare, skipped.' %
                    (_date, dep_code, arr_code), 30)
                continue
            price = float(min(data_c_l_p, key=float))
            currency = self.currency_cache.get(currency_symbols[0])
            if currency is None:
                continue

            fromCity = self.city_airport.get(dep_airport, dep_airport)
            toCity = self.city_airport.get(arr_airport, arr_airport)

            # Report results whose airports differ from the requested route.
            if dep_code != dep_airport or arr_code != arr_airport:
                self.log('ft: %s-%s' % (dep_code, arr_code), 30)
                self.log('da: %s-%s' % (dep_airport, arr_airport), 30)
            item = FlightsItem()
            item.update(
                dict(
                    flightNumber='AS' + flight_number.strip(),  # flight number
                    depTime=to_epoch(dep_date),  # departure time
                    arrTime=to_epoch(arr_date),  # arrival time
                    fromCity=fromCity,  # departure city
                    toCity=toCity,  # arrival city
                    depAirport=dep_airport,  # departure airport
                    arrAirport=arr_airport,  # arrival airport
                    currency=currency,  # currency
                    adultPrice=price,  # adult price
                    adultTax=0,  # tax
                    netFare=price,  # net fare
                    maxSeats=self.search_seat,  # bookable seats
                    cabin='E',  # cabin
                    carrier='AS',  # airline
                    isChange=1,  # 1 = direct, 2 = connecting
                    segments="NULL",  # per-leg details for connections
                    getTime=int(time.time()),
                ))

            yield item

        # Nothing was produced: mark this route/date as invalid.
        if item is None:
            data = {
                'depAirport': dep_code,
                'arrAirport': arr_code,
                'date': datetime.strptime(_date, "%m/%d/%Y").strftime('%Y%m%d')
            }
            push_date(self.settings.get('PUSH_DATA_URL'),
                      params={'carrier': self.spider_name},
                      action='invalid',
                      data_array=[data])
            self.log('[%s] %s-%s no flight.' % (_date, dep_code, arr_code), 20)
コード例 #16
0
    def parse(self, response):
        """Yield the cheapest fare of every nonstop journey in the response.

        A 404 is logged as "no flights" and a 500 is logged with its body;
        both end the callback.  Otherwise the JSON ``data`` object is read:
        for each journey with ``stops == 0`` the first segment supplies the
        airports, times and flight identifier.

        * A journey without fares yields a zero-priced, zero-seat item.
        * A journey with fares yields the item with the lowest
          ``adultPrice`` (fare amount plus the route's admin fee, looked up
          in ``self.admin_fees`` under origin + destination).
        * A journey whose route has no admin fee entry is logged and
          skipped; previously this reached ``min()`` with an empty list and
          raised ``ValueError``.
        """
        meta = response.meta
        _from = meta.get('_from')
        _to = meta.get('_to')
        _date = meta.get('_date')

        if response.status == 404:
            self.log(
                '404, No flights on this day: %s-%s[%s]' % (_from, _to, _date),
                20)
            return
        elif response.status == 500:
            self.log(response.text, 40)
            return

        # Tolerate a null/missing "data" payload instead of raising.
        data_map = json.loads(response.text).get('data') or {}
        trips = data_map.get('trips')
        if not trips:
            # No flights on this day.
            return

        currencyCode = data_map.get('currencyCode')
        faresAvailable = data_map.get('faresAvailable')

        def to_epoch(stamp):
            # Local-time timestamp string -> integer epoch seconds.
            return int(time.mktime(time.strptime(stamp, "%Y-%m-%dT%H:%M:%S")))

        def build_item(common, carrier, price, tax, net, seats, cabin,
                       segments_repr):
            # Combine the per-journey fields with the per-fare fields.
            item = FlightsItem()
            item.update(common)
            item.update(
                dict(
                    adultPrice=price,  # adult price
                    adultTax=tax,  # tax
                    netFare=net,  # net fare
                    maxSeats=seats,  # bookable seats
                    cabin=cabin,  # cabin
                    carrier=carrier,  # airline
                    isChange=1,  # 1 = direct, 2 = connecting
                    segments=segments_repr,  # per-leg details
                    getTime=int(time.time()),
                ))
            return item

        for trip in trips:
            for journey in trip['journeysAvailable']:
                if journey['stops'] != 0:
                    continue

                segment = journey['segments'][0]
                designator = segment['designator']
                # Airports and times
                dep_airport = designator['origin']
                arr_airport = designator['destination']
                dep_date = designator['departure']
                arr_date = designator['arrival']

                # Flight number and carrier
                identifier = segment['identifier']
                flight_number = identifier['identifier']
                carrierCode = identifier['carrierCode']

                common = dict(
                    flightNumber=carrierCode + flight_number,  # flight number
                    depTime=to_epoch(dep_date),  # departure time
                    arrTime=to_epoch(arr_date),  # arrival time
                    fromCity=self.city_airport.get(dep_airport, dep_airport),
                    toCity=self.city_airport.get(arr_airport, arr_airport),
                    depAirport=dep_airport,  # departure airport
                    arrAirport=arr_airport,  # arrival airport
                    currency=currencyCode,  # currency
                )

                fares = journey['fares']
                if not fares:
                    # Flight exists but nothing is on sale.
                    yield build_item(common, carrierCode, 0, 0, 0, 0, '',
                                     "[]")
                    continue

                # The admin fee depends only on the route, so resolve it once
                # per journey rather than once per fare.
                admin_fees = self.admin_fees.get(dep_airport + arr_airport)
                if admin_fees is None:
                    self.log('new flight line, no found admin fees', 40)
                    continue
                admin_fees = float(admin_fees)

                items = []
                for fare_key, fare_value in fares.items():
                    fare_rec = faresAvailable[fare_key]
                    passengerFares = fare_rec['passengerFares'][0]
                    fareAmount = float(passengerFares['fareAmount'])
                    items.append(
                        build_item(
                            common,
                            carrierCode,
                            fareAmount + admin_fees,
                            admin_fees,
                            fareAmount,
                            fare_value['availableCount'],  # seats
                            fare_value['classOfService'],  # cabin
                            "NULL"))

                yield min(items, key=lambda item: item['adultPrice'])
コード例 #17
0
    def parse(self, response):
        """Drive the jet2.com flight search and yield one item per answer.

        Two kinds of response arrive here:

        * A response whose meta contains ``origin`` is the HTML results
          page.  Its ``scid``/``dsid`` values and ``Set-Cookie`` headers are
          used to POST one search-update request per calendar day that is at
          most 31 days ahead.  No item is produced for this response.
        * Any other response is the JSON answer of the search-update
          endpoint.  Unless the request was flagged with ``end`` in its meta,
          a follow-up request (flagged ``end``) is issued for every flight id
          listed in the returned HTML fragment.  The first product of the
          ``Gtm`` block is then converted into a ``FlightsItem``.
        """
        meta = response.meta

        self.is_ok = True

        if 'origin' in meta:
            scid = response.xpath('//a[@class="logout__name"]/@href').re_first(
                r'.*scid=(.*)')
            dsid = response.xpath(
                '//div[@class="flight-results__wrapper clearfix"]/@data-dsid'
            ).extract_first()

            # Re-pack every Set-Cookie header as a cookie dict for Scrapy.
            cookies = []
            for raw_cookie in response.headers.getlist('Set-Cookie'):
                name, value = re.match(r'(.*?)=(.*?);', raw_cookie).groups()
                cookies.append({
                    u'domain': u'.jet2.com',
                    u'secure': False,
                    u'value': unicode(value),
                    u'expiry': None,
                    u'path': u'/',
                    u'httpOnly': False,
                    u'name': unicode(name)
                })

            flight_search_url = 'https://www.jet2.com/api/search/flightsearchresults/update?scid=' + scid

            headers = {
                'accept-encoding': "gzip, deflate, br",
                'accept-language': "zh-CN,zh;q=0.9",
                'adrum': "isAjax:true",
                'content-type': "application/json; charset=UTF-8",
                'origin': "https://www.jet2.com",
                'referer': response.url,
                'user-agent':
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
                'x-requested-with': "XMLHttpRequest",
                'Cache-Control': "no-cache"
            }

            calendar_cells = response.xpath(
                '//tbody[@class="calendar__body"]/tr/td[@class="calendar__day "]'
            )
            for cell in calendar_cells:
                flight_id = cell.xpath(
                    '@data-cheapest-flight-id').extract_first()
                data_date = cell.xpath('@data-date').extract_first()
                # Ignore days more than 31 days from now.
                beyond_window = datetime.strptime(
                    data_date, "%Y-%m-%d") > datetime.today() + timedelta(31)
                if beyond_window:
                    continue

                yield scrapy.Request(flight_search_url,
                                     method='POST',
                                     headers=headers,
                                     cookies=cookies,
                                     body=json.dumps({
                                         "isCalendarSelection": True,
                                         "flightId": flight_id,
                                         "date": data_date,
                                         "isOutbound": True,
                                         "isFull": False,
                                         "datasource": dsid
                                     }),
                                     errback=self.errback,
                                     callback=self.parse)
            return

        json_data = json.loads(response.text)
        # Follow-up requests carry 'end' so they do not fan out again,
        # which prevents an endless request loop.
        if 'end' not in response.meta:
            fragment = HtmlResponse(
                '', body=json_data['Data']['Html'].encode('utf-8'))
            flight_ids = fragment.xpath(
                '//div[@class="times-summary__item "]/@data-flight-id'
            ).extract()
            # Reuse the original request body, minus the date, to ask for
            # each further flight listed for this day.
            request_body = json.loads(response.request.body)
            request_body.pop('date')
            for flight_id in flight_ids:
                request_body['flightId'] = flight_id
                request_body['isCalendarSelection'] = False
                yield scrapy.Request(response.url,
                                     method='POST',
                                     headers=response.request.headers,
                                     cookies=response.request.cookies,
                                     body=json.dumps(request_body),
                                     meta={'end': 1},
                                     callback=self.parse,
                                     errback=self.errback)

        def to_epoch(stamp):
            # Local-time timestamp string -> integer epoch seconds.
            return int(time.mktime(time.strptime(stamp, "%Y-%m-%dT%H:%M:%S")))

        # Flight details come from the first product of the Gtm block.
        first_product = json_data['Gtm']['ecommerce']['click']['products'][0]
        flight_number = first_product['dimension6']
        # dimension57 holds four '_'-separated parts; the 2nd and 4th are
        # the departure and arrival timestamps.
        _, dep_date, _, arr_date = first_product['dimension57'].split('_')
        currency = first_product['dimension17']
        dep_airport = first_product['dimension4']
        arr_airport = first_product['dimension9']
        price = first_product['price']

        fromCity = self.city_airport.get(dep_airport, dep_airport)
        toCity = self.city_airport.get(arr_airport, arr_airport)

        item = FlightsItem()
        item.update({
            'flightNumber': flight_number,
            'depTime': to_epoch(dep_date),
            'arrTime': to_epoch(arr_date),
            'fromCity': fromCity,
            'toCity': toCity,
            'depAirport': dep_airport,
            'arrAirport': arr_airport,
            'currency': currency,
            'adultPrice': price,
            'adultTax': 0,
            'netFare': price,
            'maxSeats': 5,
            'cabin': 'E',
            # the carrier code is the first two characters of the number
            'carrier': flight_number[:2],
            'isChange': 1,  # 1 = direct, 2 = connecting
            'segments': "NULL",
            'getTime': int(time.time()),
        })

        yield item
コード例 #18
0
    def parse(self, response):
        # meta = response.meta
        # _from = meta.get('_from')
        # _to = meta.get('_to')
        # _date = meta.get('_date')

        res = json.loads(response.text)
        if 'origin' in response.meta:
            multiDayAvailabilityOutbound = res['multiDayAvailabilityOutbound']
            r = HtmlResponse(url=self.start_urls,
                             body=multiDayAvailabilityOutbound.encode('utf-8'))
            __RequestVerificationToken = r.xpath(
                '//div[@class="animation-container"]//input[@name="__RequestVerificationToken"]/@value'
            ).extract_first()
            li_days = r.xpath(
                '//*[@class="HV-gc bulletless days"]/li/div[@class="day day-with-availability"]'
            )
            for li in li_days:
                date_date = li.xpath('@data-date').extract_first()
                body = {
                    'selectSingleDayAvailability.JourneyType':
                    'OutboundFlight',
                    'selectSingleDayAvailability.Date.DateToParse':
                    date_date[:10],
                    'selectSingleDayAvailability.AutoSelect': False,
                    '__RequestVerificationToken': __RequestVerificationToken
                }
                yield scrapy.Request(
                    self.select_url,
                    method='POST',
                    body=parse.urlencode(body),
                    headers=response.request.headers,
                    cookies=response.request.cookies,
                )

        else:
            SingleDayOutbound = res['SingleDayOutbound']
            html = HtmlResponse(url='', body=SingleDayOutbound.encode('utf-8'))
            buttons = html.xpath('//button[@class="flight-result-button"]')
            for button in buttons:
                # 机场
                button_value = button.xpath('@value').extract_first()
                dep_airport, arr_airport = re.findall(r'~(\w{3})~',
                                                      button_value)[:2]
                fromCity = self.city_airport.get(dep_airport, dep_airport)
                toCity = self.city_airport.get(arr_airport, arr_airport)
                # 时间
                div_times = button.xpath('div[@class="times"]')
                departure = div_times.xpath(
                    'time[@class="departure"]/@datetime').extract_first()
                departure_time = div_times.xpath(
                    'time[@class="departure"]/text()').extract_first().strip()
                dep_date = "%s %s:00" % (departure[:10], departure_time)
                arrival = div_times.xpath(
                    'time[@class="arrival"]/@datetime').extract_first()
                arrival_time = div_times.xpath(
                    'time[@class="arrival"]/text()').extract_first().strip()
                arr_date = "%s %s:00" % (arrival[:10], arrival_time)
                # 航班号
                details = button.xpath('div[@class="details"]')
                flight_number_list = details.xpath(
                    'ul/li[@class="flight-number"]/text()').extract()
                flight_number = flight_number_list[1].strip()
                # 价格
                actions = button.xpath('div[@class="actions"]')
                price_div = actions.xpath('div[contains(@class, "price")]')
                currency = price_div.xpath(
                    'span[@class="currency"]/text()').extract_first().strip()
                currency = self.currency_cache.get(currency, currency)
                price = price_div.xpath('text()[2]').extract_first().strip()

                item = FlightsItem()
                item.update(
                    dict(
                        flightNumber=flight_number,  # 航班号
                        depTime=int(
                            time.mktime(
                                time.strptime(dep_date,
                                              "%d/%m/%Y %H:%M:%S"))),  # 出发时间
                        arrTime=int(
                            time.mktime(
                                time.strptime(arr_date,
                                              "%d/%m/%Y %H:%M:%S"))),  # 达到时间
                        fromCity=fromCity,  # 出发城市
                        toCity=toCity,  # 到达城市
                        depAirport=dep_airport,  # 出发机场
                        arrAirport=arr_airport,  # 到达机场
                        currency=currency,  # 货币种类
                        adultPrice=float(price),  # 成人票价
                        adultTax=0,  # 税价
                        netFare=float(price),  # 净票价
                        maxSeats=3,  # 可预定座位数
                        cabin='E',  # 舱位
                        carrier=flight_number[:2],  # 航空公司
                        isChange=1,  # 是否为中转 1.直达2.中转
                        segments="NULL",  # 中转时的各个航班信息
                        getTime=int(time.time()),
                    ))

                yield item