# Example n. 1
# 0
class Scheduler(Logger):
    """Redis-backed Scrapy scheduler that shards requests into per-domain
    sorted-set queues (one zset per registered-domain + suffix, keyed as
    ``<spiderid>:<domain>.<suffix>:queue``) and pops from a random queue.
    """

    def __init__(self, crawler):
        """Wire up settings, logging and the Redis connection.

        The Redis client class is chosen at runtime: the project's custom
        client when the CUSTOM_REDIS setting is truthy, otherwise redis-py.
        """
        self.settings = crawler.settings
        self.set_logger(crawler)

        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        # "%s" is replaced with the spider name in open(); the "*" keeps it
        # usable as a KEYS glob across all of that spider's domain queues.
        self.queue_name = "%s:*:queue"
        self.queues = {}
        self.extract = tldextract.extract

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook."""
        return cls(crawler)

    def open(self, spider):
        """Bind the scheduler to *spider* and share the Redis handle/logger."""
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)
        spider.set_logger(self.logger)

    def request_to_dict(self, request):
        """Convert *request* into a JSON-serializable dict.

        Callbacks/errbacks are stored by name so next_request() can re-bind
        them to the live spider instance.
        """
        req_dict = {
            # FIX: request.url is already a str on Python 3; the previous
            # .decode('ascii') call raised AttributeError there.
            'url': request.url,
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            # FIX: __name__ replaces the Python-2-only func_name attribute
            # (matches the other Scheduler variants in this file).
            'callback': None if request.callback is None else request.callback.__name__,
            'errback': None if request.errback is None else request.errback.__name__,
        }
        return req_dict

    def enqueue_request(self, request):
        """Serialize *request* and push it onto its domain's queue.

        The zset score is the negated priority so higher-priority requests
        sort first under zrange.
        """
        req_dict = self.request_to_dict(request)
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)
        # NOTE(review): zadd(name, value, score) matches redis-py 2.x / the
        # custom client; redis-py >= 3.0 expects a mapping -- confirm the
        # pinned client version.
        self.redis_conn.zadd(key, json.dumps(req_dict), -int(req_dict["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue"
                          .format(id=req_dict['meta']['crawlid'],
                                  url=req_dict['url']))

    def next_request(self):
        """Pop the head of a randomly chosen domain queue and rebuild it.

        Returns a scrapy Request, or None when every queue is empty.
        """
        queues = self.redis_conn.keys(self.queue_name)
        if not queues:
            return None

        queue = random.choice(queues)
        self.logger.info("length of queue %s is %s" %
                         (queue, self.redis_conn.zcard(queue)))

        if self.settings.get("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(queue)
        else:
            # Read and remove the head of the sorted set atomically.
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
            result, count = pipe.execute()
            # FIX: guard the empty case -- the queue can be drained between
            # keys() and the pipeline; the old unconditional result[0]
            # raised IndexError then.
            item = result[0] if result else None

        if not item:
            return None

        item = json.loads(item)

        try:
            req = Request(item['url'])
        except ValueError:
            # Stored URL lacked a scheme; retry with an explicit one.
            req = Request('http://' + item['url'])

        # Re-bind stored handler names to bound methods on the live spider.
        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                cb = getattr(self.spider, cb)
                req.callback = cb

        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                eb = getattr(self.spider, eb)
                req.errback = eb

        # From here on, work with the request's meta dict.
        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0

        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in item.keys():
            req.meta[key] = item[key]

        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']

        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            # FIX: (str, bytes) replaces the Python-2-only basestring
            # (matches the third Scheduler variant in this file).
            elif isinstance(item['cookie'], (str, bytes)):
                req.cookies = parse_cookie(item['cookie'])
        return req

    def close(self, reason):
        """Log spider shutdown."""
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        """Always returns False; Scrapy keeps polling next_request() instead."""
        return False
# Example n. 2
# 0
class Scheduler(object):
    """Single-queue Redis scheduler: whole Request objects are pickled and
    kept in one sorted set, ordered by negated priority.
    """

    spider = None

    def __init__(self, crawler):
        """Capture settings, build the logger and open the Redis connection."""
        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        # The Redis client implementation is selected at runtime.
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        host = self.settings.get("REDIS_HOST")
        port = self.settings.getint("REDIS_PORT")
        self.redis_conn = Redis(host, port)
        self.queue_name = None
        self.queues = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook."""
        return cls(crawler)

    def open(self, spider):
        """Resolve the task queue name for *spider* and hand it the Redis handle."""
        template = self.settings.get("TASK_QUEUE_TEMPLATE",
                                     "%s:request:queue")
        self.spider = spider
        self.queue_name = template % spider.name
        spider.set_redis(self.redis_conn)

    def enqueue_request(self, request):
        """Flatten callbacks to their names, pickle *request*, and push it."""
        for attr in ("callback", "errback"):
            handler = getattr(request, attr)
            setattr(request, attr, getattr(handler, "__name__", handler))
        payload = pickle.dumps(request)
        score = -int(request.meta["priority"])
        self.redis_conn.zadd(self.queue_name, payload, score)
        self.logger.debug("Crawlid: %s, url: %s added to queue. " %
                          (request.meta['crawlid'], request.url))

    def next_request(self):
        """Pop and deserialize the highest-priority request, if any."""
        self.logger.debug(
            "length of queue %s is %s" %
            (self.queue_name, self.redis_conn.zcard(self.queue_name)))

        if self.settings.getbool("CUSTOM_REDIS"):
            raw = self.redis_conn.zpop(self.queue_name)
        else:
            # Read and remove the head of the sorted set in one round trip.
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0, 0)
            pipe.zremrangebyrank(self.queue_name, 0, 0)
            head, _ = pipe.execute()
            raw = head[0] if head else None

        if not raw:
            return None

        request = pickle.loads(raw)
        # Re-bind stored handler names to bound methods on the live spider.
        if request.callback:
            request.callback = getattr(self.spider, request.callback)
        if request.errback:
            request.errback = getattr(self.spider, request.errback)
        return request

    def close(self, reason):
        """Log spider shutdown."""
        self.logger.info("Closing Spider: %s. " % self.spider.name)

    def has_pending_requests(self):
        """Always returns False."""
        return False
# Example n. 3
# 0
class Scheduler(object):
    """Rate-limited, single-queue Redis scheduler.

    Requests are flattened into plain dicts, pickled, and stored in one
    sorted set ordered by negated priority.
    """

    # Records the item currently being processed; used when handling exceptions.
    present_item = None
    spider = None

    def __init__(self, crawler):
        """Capture settings, build the logger, Redis connection and throttle state."""
        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        # The Redis client class is chosen at runtime via CUSTOM_REDIS.
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        # "%s" is replaced with the spider name in open().
        self.queue_name = "%s:item:queue"
        self.queues = {}
        # Presumably SPEED is requests per minute, making this the minimum
        # number of seconds between two pops -- TODO confirm against settings docs.
        self.request_interval = 60 / self.settings.getint("SPEED", 60)
        self.last_acs_time = time.time()

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook."""
        return cls(crawler)

    def open(self, spider):
        """Bind the scheduler to *spider* and share the Redis connection."""
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)

    def request_to_dict(self, request):
        """Convert *request* into a picklable dict.

        Header keys are decoded to str; bound-method callbacks/errbacks are
        stored by name so next_request() can re-bind them to the spider.
        """
        headers = dict([(item[0].decode("ascii"), item[1])
                        for item in request.headers.items()])
        req_dict = {
            'url':
            request.url,
            'method':
            request.method,
            'headers':
            headers,
            'body':
            request.body,
            'cookies':
            request.cookies,
            'meta':
            request.meta,
            '_encoding':
            request._encoding,
            'dont_filter':
            request.dont_filter,
            # Non-method callables are stored as-is; only bound methods are
            # reduced to their name.
            'callback':
            request.callback
            if not isinstance(request.callback, types.MethodType) else
            request.callback.__name__,
            'errback':
            request.errback
            if not isinstance(request.errback, types.MethodType) else
            request.errback.__name__,
        }
        return req_dict

    @enqueue_request_method_wrapper
    def enqueue_request(self, request):
        """Serialize *request* and push it onto the queue (score = -priority)."""
        req_dict = self.request_to_dict(request)
        self.redis_conn.zadd(self.queue_name, pickle.dumps(req_dict),
                             -int(req_dict["meta"]["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue".format(
            id=req_dict['meta']['crawlid'], url=req_dict['url']))

    @next_request_method_wrapper
    def next_request(self):
        """Pop the next request dict and rebuild a Request from it.

        Returns None when throttled or when the queue is empty.
        """
        self.logger.info(
            "length of queue %s is %s" %
            (self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None
        # Throttle: bail out while still inside the minimum interval since
        # the last successful pop.
        if time.time() - self.request_interval < self.last_acs_time:
            return item
        if self.settings.getbool("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(self.queue_name)
        else:
            # Read and remove the head of the sorted set atomically.
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0,
                        0).zremrangebyrank(self.queue_name, 0, 0)
            result, count = pipe.execute()

            if result:
                item = result[0]

        if item:
            self.last_acs_time = time.time()
            # NOTE(review): pickle.loads on queue contents -- safe only if
            # nothing untrusted can write to this queue.
            item = pickle.loads(item)
            self.present_item = item
            headers = item.get("headers", {})
            body = item.get("body")
            if item.get("method"):
                method = item.get("method")
            else:
                method = "GET"

            # Fall back to prefixing a scheme when the stored URL is bare.
            try:
                req = Request(item['url'],
                              method=method,
                              body=body,
                              headers=headers)
            except ValueError:
                req = Request('http://' + item['url'],
                              method=method,
                              body=body,
                              headers=headers)

            # Re-bind stored handler names to bound methods on the live spider.
            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = getattr(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = getattr(self.spider, eb)
                    req.errback = eb

            # From here on, `item` is rebound to the request's meta dict.
            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0

            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']

            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], (str, bytes)):
                    req.cookies = parse_cookie(item['cookie'])

            return req

    def close(self, reason):
        """Log spider shutdown."""
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        """Always returns False; Scrapy keeps polling next_request() instead."""
        return False