Example #1
 def __init__(self, *args, **kwargs):
     super(AirasiaSearchSpider, self).__init__(*args, **kwargs)
     self.settings = get_project_settings()
     self.flightParse = AirasiaFlightParse()
     self.fareParse = AirasiaFareParse()
     self.keyGenerator = KeyGenerator()
     self.use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', False)
     self.is_zset = self.settings.getbool('REDIS_START_URLS_IS_ZSET', False)
     self.redis_key_live = self.settings.getint('REDIS_KEY_LIVE_TIME', 300)
Example #2
    def __init__(self, crawler):
        self.crawler = crawler
        self.interval = crawler.settings.getfloat('RESCHEDULER_INTERVAL')
        self.retry = crawler.settings.getbool('RETRY_ENABLED')
        self.max_retry_times = crawler.settings.getint('RETRY_TIMES')
        self.policy = crawler.settings.get('RESCHEDULER_DOWNEXCEPTION_POLICY',
                                           'Retry')

        if not self.interval:
            raise NotConfigured

        self.keyGenerator = KeyGenerator()
Example #3
class ReschedulerTimer(object):
    """
    Scrapy crawl rescheduler timer

    ----------
        RESCHEDULER_ENABLED default:True
                True: enable extension
                False: disabled it
        RESCHEDULER_INTERVAL must
                None or not defined it to disabled it
                float number interval recall
        RESCHEDULER_DOWNEXCEPTION_POLICY default:Retry
                下载异常处理策略:Retry/Delete 重试/删除
    """
    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response
    EXCEPTIONS_TO_HANDLE = (defer.TimeoutError, TimeoutError, DNSLookupError,
                            ConnectionRefusedError, ConnectionDone,
                            ConnectError, ConnectionLost, TCPTimedOutError,
                            ResponseFailed, IOError, TunnelError)

    def __init__(self, crawler):
        self.crawler = crawler
        self.interval = crawler.settings.getfloat('RESCHEDULER_INTERVAL')
        self.retry = crawler.settings.getbool('RETRY_ENABLED')
        self.max_retry_times = crawler.settings.getint('RETRY_TIMES')
        self.policy = crawler.settings.get('RESCHEDULER_DOWNEXCEPTION_POLICY',
                                           'Retry')

        if not self.interval:
            raise NotConfigured

        self.keyGenerator = KeyGenerator()

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('RESCHEDULER_ENABLED', True):
            raise NotConfigured
        return cls(crawler)

    def process_request(self, request, spider):
        """
        Request handling: record the time at which the request was scheduled.
        """
        request.meta['LATEST_SCHEDULER_TIMESTAMP'] = time.time()

    def process_response(self, request, response, spider):
        """
        Response handling: log abnormal statuses and redirects, then reschedule the URL.
        """

        # Log URLs whose status code is not 200
        allowed_status = [200]
        if response.status not in allowed_status:
            spider.logger.info('%(request)s status: %(status)s', {
                'status': response.status,
                'request': request
            })
        # URL matching rule: let the spider decide which URLs may be rescheduled
        if hasattr(spider, 'rescheduler_url_regex'):
            if callable(spider.rescheduler_url_regex):
                if not spider.rescheduler_url_regex(request.url):
                    spider.logger.info('%s is not expected' % request.url)
                    return response

        # Handle redirects (log only)
        allowed_3xx_status = (301, 302, 303, 307)
        if 'Location' in response.headers and response.status in allowed_3xx_status:
            location = safe_url_string(response.headers['Location'])
            redirected_url = urljoin(request.url, location)
            spider.logger.info(
                'redirected url: %(redirected)s from %(request)s', {
                    'redirected': redirected_url,
                    'request': request
                })

        priority = 1

        timestamp = request.meta.get('LATEST_SCHEDULER_TIMESTAMP', time.time())
        delta = time.time() - timestamp

        if delta > self.interval:
            self.__save_to_redis(spider, request.url, priority)
        else:
            Timer(self.interval - delta, self.__save_to_redis,
                  (spider, request.url, priority)).start()

        return response

    def process_exception(self, request, exception, spider):
        """
        Handle download exceptions by rescheduling the failed URL.
        """
        if isinstance(exception, self.EXCEPTIONS_TO_HANDLE):
            return self.__handle(request, spider)

    def __handle(self, request, spider):
        priority = -1
        if self.retry:
            retries = request.meta.get('retry_times', 0) + 1
            if retries > self.max_retry_times:
                Timer(self.interval, self.__save_to_redis,
                      (spider, request.url, priority)).start()
        else:
            Timer(self.interval, self.__save_to_redis,
                  (spider, request.url, priority)).start()

    def __save_to_redis(self, spider, url, priority):

        url_key = self.keyGenerator.generator_data_key(url)

        if self.policy == 'Delete':
            spider.server.delete(url_key)
            return

        data = spider.server.hgetall(url_key)
        expire = int(data.get('expire', -1))  # expiry timestamp

        # No expiry timestamp set: by default do not reschedule
        if expire < 0:
            return
        # The URL has already expired
        if time.time() > expire:
            return

        # Priority
        _priority = float(data.get('priority', 100)) + priority

        spider.logger.debug('add url: %s into %s <%d>' %
                            (url, spider.redis_key, _priority))
        # The URL is still valid: update its priority
        spider.server.hset(url_key, 'priority', _priority)

        if spider.is_zset:
            spider.server.zadd(spider.redis_key, _priority, url)
        else:
            save_one = spider.server.sadd if spider.use_set else spider.server.rpush
            save_one(spider.redis_key, url)
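The timer above implements the downloader-middleware hooks, so enabling it is a settings change. Below is a minimal sketch of the relevant settings.py fragment; the module path com.middlewares.reschedulertimer is an assumption, and only setting names that the class itself reads are used.

# Hypothetical settings.py fragment -- the middleware module path is assumed
DOWNLOADER_MIDDLEWARES = {
    'com.middlewares.reschedulertimer.ReschedulerTimer': 543,
}
RESCHEDULER_ENABLED = True                  # default: True
RESCHEDULER_INTERVAL = 60.0                 # seconds before a URL is re-queued; required
RESCHEDULER_DOWNEXCEPTION_POLICY = 'Retry'  # or 'Delete'
RETRY_ENABLED = True
RETRY_TIMES = 2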
Example #4
class AirasiaSearchSpider(RedisSpider):
    """
    AirasiaSearchSpider instance
    """
    domain = "airasia"
    name = "search"
    source = 'AKB2C'
    redis_key = '{domain}:{name}:queue'.format(domain=domain, name=name)

    allowed_domains = ["airasia.com"]

    def __init__(self, *args, **kwargs):
        super(AirasiaSearchSpider, self).__init__(*args, **kwargs)
        self.settings = get_project_settings()
        self.flightParse = AirasiaFlightParse()
        self.fareParse = AirasiaFareParse()
        self.keyGenerator = KeyGenerator()
        self.use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', False)
        self.is_zset = self.settings.getbool('REDIS_START_URLS_IS_ZSET', False)
        self.redis_key_live = self.settings.getint('REDIS_KEY_LIVE_TIME', 300)

    def rescheduler_url_regex(self, url):
        """
        判断url是否有效
        无效的url不参与重新调度
        reschedulertimer.py文件调用
        参数:
            @param url: 判断的url
        返回:
            Boolean True-有效 False-无效
        """
        if url.find('Flight/Select?') > 0:
            return True
        return False

    def parse(self, response):

        # Best-effort dump of the raw response for offline debugging
        try:
            filename = 'download/flight/%s.html' % response.url.split("/")[-1]
            with open(filename, 'wb') as _file:
                _file.write(response.body)
        except Exception:
            pass

        # Extract cookies from the response headers
        cookies = None

        for key, value in response.headers.items():
            if key == 'Set-Cookie':
                cookies = cookies_to_dict(value)

        # print cookies

        self.flightParse.set_html(response.body)
        rows = self.flightParse.get_rows()

        # TODO: data completeness checks

        for row in rows:
            schedule = self.flightParse.get_schedule(row)
            itinerary = self.flightParse.get_itinerary(row)

            Flights = {}
            flight = {}

            routes = itinerary['route'].split('-')
            depart = routes[0]
            arrive = routes[-1]

            for key in itinerary.keys():

                if key.find('Flight') > -1:
                    # Collect this flight segment
                    Flights[key] = itinerary[key]
                    index = int(re.findall(r'\d', key)[0]) - 1
                    flight[index] = {}
                    flight[index]['date'] = itinerary[key]['date']
                    flight[index]['no'] = itinerary[key]['no']

            no = '/'.join([value['no'] for key, value in flight.items()])
            date = flight[0]['date']

            key = self.keyGenerator.generator_detail_key(depart=depart,
                                                         arrive=arrive,
                                                         source=self.source,
                                                         no=no,
                                                         date=date)

            fare_redis_key = '{domain}:{name}:queue'.format(domain=self.domain,
                                                            name='fare')
            save_one = self.server.sadd if self.use_set else self.server.rpush
            # Collect cabin classes and their fare URLs
            if 'tag' in itinerary.keys():
                for cabin, value in itinerary['tag'].items():
                    # print cabin, value['remaining'], value['url']
                    # Push the URL into the fare spider's scheduling queue
                    if self.is_zset:
                        self.server.zadd(fare_redis_key, 100.0, value['url'])
                    else:
                        save_one(fare_redis_key, value['url'])
                    # Encode the URL as a redis hash key, used to pass
                    # data on to the fare spider

                    url_key = self.keyGenerator.generator_data_key(
                        value['url'])
                    meta = json.dumps({'key': key, 'class': cabin})

                    data = {
                        'meta': meta,
                        'cookies': cookies,
                        'priority': 100,
                        'expire': int(time.time() + 24 * 60 * 60)
                    }
                    self.server.hmset(url_key, data)
                    self.server.expire(url_key, 24 * 60 * 60)

            update = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
            # Yield the item to the storage backend
            yield {
                # 'id': 'flight',                 # marks this yield
                'key': key,
                'depart': depart,
                'arrive': arrive,
                'date': date,
                'schedule': schedule,             # timetable
                'no': no,                         # all flight numbers joined with '/'
                'type': itinerary['type'],        # type: Fly-Through / Direct
                'route': itinerary['route'],      # route summary, e.g. CNX-(KUL)-PEK
                'source': self.domain,            # data source
                'flights': Flights,               # all flight details
                'flight_update': update,          # update time
            }

    def downloader_exception_handle(self, request, reason, spider):
        """
        DOWNLOADER_MIDDLEWARES:DownloaderExceptionHandle
        downloader exception handle func
        """
        # spider.save('%s' % 'AirAsiaSearch:failed', request.url)
        spider.logger.error("%(request)s \nException: %(reason)s", {
            'request': request,
            'reason': reason
        })
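The parse method relies on a cookies_to_dict helper that is not included in these examples. A minimal sketch of such a helper, assuming it only keeps the leading name=value pair of each Set-Cookie value, could look like the following; the real implementation may differ.

def cookies_to_dict(set_cookie_values):
    # Sketch under assumptions: take an iterable of Set-Cookie header values,
    # keep the leading name=value pair of each entry and drop attributes such
    # as Path, Domain or Expires.
    cookies = {}
    for entry in set_cookie_values:
        pair = entry.split(';', 1)[0]
        if '=' in pair:
            name, value = pair.split('=', 1)
            cookies[name.strip()] = value.strip()
    return cookies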
Example #5
class AirasiaFareSpider(RedisSpider):
    """
    AirasiaFareSpider instance
    """
    domain = "airasia"
    name = "fare"
    source = 'AKB2C'
    redis_key = '{domain}:{name}:queue'.format(domain=domain, name=name)

    allowed_domains = ["airasia.com"]

    def __init__(self, *args, **kwargs):
        super(AirasiaFareSpider, self).__init__(*args, **kwargs)
        self.settings = get_project_settings()
        self.fareParse = AirasiaFareParse()
        self.keyGenerator = KeyGenerator()
        self.use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', False)
        self.is_zset = self.settings.getbool('REDIS_START_URLS_IS_ZSET', False)
        self.redis_key_live = self.settings.getint('REDIS_KEY_LIVE_TIME',
                                                   300)

    def make_requests_from_url(self, url):
        """
        重写make_requests_from_url

        根据URL的编码从redis中取出数据加到Request中

        参数:
            @param url:
            @param priority: 优先级
        返回:
            Request对象
        """

        if url.find('SellKeys') == -1:
            return

        url_key = self.keyGenerator.generator_data_key(url)
        data = self.server.hgetall(url_key)
        if not data:
            self.logger.debug('no data for key: %s, url: %s' % (url_key, url))
            return
        # The search spider stored the cookies dict as its repr; eval() restores it
        cookies = eval(data['cookies'])
        meta = json.loads(data['meta'])
        # print '-' * 80
        # print cookies
        # print '-' * 80
        return scrapy.Request(url, cookies=cookies, meta=meta)

    def rescheduler_url_regex(self, url):
        """
        判断url是否有效
        无效的url不参与重新调度
        reschedulertimer.py文件调用
        参数:
            @param url: 判断的url
        返回:
            Boolean True-有效 False-无效
        """
        if url.find('Flight/PriceItinerary?') > 0:
            return True
        return False

    def parse(self, response):
        """
        Parse the flight fare
        """

        # Best-effort dump of the raw response for offline debugging
        try:
            filename = 'download/fare/%s.html' % response.url.split("=")[-1]
            with open(filename, 'wb') as _file:
                _file.write(response.body)
        except Exception:
            pass

        self.fareParse.set_html(response.body)
        # print self.fareParse.prettify()
        fare = self.fareParse.get_fare()

        # Handle empty data
        if not fare:
            # The cookies are no longer usable: delete the stored key
            url_key = self.keyGenerator.generator_data_key(response.url)
            self.server.delete(url_key)
            return

        # Passenger-type flags: 1 = adult, 2 = children, 4 = infant
        _type = 0

        for key in fare:
            # Adults
            if 'Adult(s)' in key:
                _type += 1
            # Children
            elif 'Children' in key:
                _type += 2
            # Infants
            elif 'Infant(s)' in key:
                _type += 4

        # print fare
        key = response.meta['key']
        cabin = response.meta['class']

        update = time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(time.time()))
        yield {
            # 'id': 'fare',           # marks this yield
            'key': key,
            # 'class': cabin,         # cabin class
            # type 1: adult / 3: adult+children / 5: adult+infant / 7: adult+children+infant
            'class:%s:%s' % (cabin, _type): fare,
            'fare_update': update,        # update time
        }

    def downloader_exception_handle(self, request, reason, spider):
        """
        DOWNLOADER_MIDDLEWARES:DownloaderExceptionHandle
        downloader exception handle func
        """
        # spider.save('%s' % 'AirAsiaSearch:failed', request.url)
        spider.logger.error("%(request)s \nException: %(reason)s",
                            {'request': request,
                             'reason': reason})
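make_requests_from_url above expects a redis hash previously written by AirasiaSearchSpider.parse. A minimal sketch of seeding that queue and hash by hand (for example, to test the fare spider in isolation) follows; the fare URL, meta values and cookie value are placeholders, and KeyGenerator is assumed to be importable as in the pipeline example further down.

import json
import time

import redis

from com.utils.key import KeyGenerator

server = redis.Redis(host='127.0.0.1', port=6379)
key_generator = KeyGenerator()

# Placeholder fare URL; a real one contains both 'Flight/PriceItinerary?' and 'SellKeys'
fare_url = 'https://booking.airasia.com/Flight/PriceItinerary?SellKeys=PLACEHOLDER'
url_key = key_generator.generator_data_key(fare_url)

# Queue the URL for the fare spider and store the data it will read back
server.rpush('airasia:fare:queue', fare_url)
server.hmset(url_key, {
    'meta': json.dumps({'key': 'placeholder-detail-key', 'class': 'Economy'}),
    'cookies': {'ASP.NET_SessionId': 'placeholder'},  # stored as its repr, read back with eval()
    'priority': 100,
    'expire': int(time.time() + 24 * 60 * 60),
})
server.expire(url_key, 24 * 60 * 60)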
Example #6
def generate_url(args):
    """
    Generate search URLs and push them into the redis queue.
    """

    print 'generate_url', args

    search_queue_key = 'airasia:search:queue'

    REDIS_START_URLS_AS_SET = True
    REDIS_START_URLS_IS_ZSET = True

    redis_param = {'host': '127.0.0.1', 'port': 6379}
    redis_param.update(dict(x.split('=', 1) for x in args.redis))

    server = redis.Redis(**redis_param)

    search = AirasiaSearch()

    keyGenerator = KeyGenerator()

    params = {}

    if 'airasia' in args.source:
        params = {
            'o1': args.depart.upper(),
            'd1': args.arrive.upper(),
            'dd1': args.date,
            'ADT': args.adult,
            'CHD': args.children,
            'inl': args.infant
        }
        # Return-trip handling
        # When a return date is given, the outbound date is not also split
        # into separate one-way outbound and return requests
        if args.return_date:
            params['dd2'] = args.return_date
        # Language handling
        if args.language == 'en':
            params['culture'] = 'en-GB'
        elif args.language == 'zh':
            params['culture'] = 'zh-CN'

    params['days'] = args.days
    params['round'] = args.round

    urls = search.generate_urls(**params)

    for url, date in urls:

        priority = 100.0

        if REDIS_START_URLS_IS_ZSET:
            # keep the (score, member) argument order used by the spiders
            server.zadd(search_queue_key, priority, url)
        else:
            save_one = server.sadd if REDIS_START_URLS_AS_SET else server.lpush
            save_one(search_queue_key, url)

        date = '%s 23:59:59' % date
        timestamp = int(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M:%S')))

        data = {'priority': priority, 'expire': timestamp}

        url_key = keyGenerator.generator_data_key(url)
        server.hmset(url_key, data)
        server.expireat(url_key, timestamp)

        print url, date
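generate_url(args) only consumes attributes from an args namespace; the argparse sketch below shows one plausible way to supply them from the command line. Option names, defaults and the example redis parameters are assumptions, not part of the original script.

import argparse

parser = argparse.ArgumentParser(description='Seed the airasia search queue')
parser.add_argument('--source', nargs='+', default=['airasia'])
parser.add_argument('--depart', required=True)              # e.g. CNX
parser.add_argument('--arrive', required=True)              # e.g. PEK
parser.add_argument('--date', required=True)                # departure date, YYYY-MM-DD
parser.add_argument('--return-date', dest='return_date', default=None)
parser.add_argument('--adult', type=int, default=1)
parser.add_argument('--children', type=int, default=0)
parser.add_argument('--infant', type=int, default=0)
parser.add_argument('--language', choices=['en', 'zh'], default='en')
parser.add_argument('--days', type=int, default=1)
parser.add_argument('--round', action='store_true')
parser.add_argument('--redis', nargs='*', default=[])       # e.g. host=127.0.0.1 port=6379

generate_url(parser.parse_args())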
Example #7
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from com.utils.key import KeyGenerator

keyGenerator = KeyGenerator()


class AirasiaSearchPipeline(object):
    """
    AirasiaSearchPipeline
    """
    def process_item(self, item, spider):
        """
        process_item
        """
        return item


def AirasiaSearchRedisPipeline(server, item, spider):
    """
    AirasiaSearchRedisPipeline
    Redis backend
    """
    try:

        key = keyGenerator.generator_set_key(date=item['date'],