Example #1
class RedisSource(Source):
    """
    Source that pulls items from a Redis list.
    """
    def __init__(self, redis_host, redis_port, redis_key, **kwargs):
        try:
            from custom_redis.client import Redis
        except ImportError:
            try:
                from redis import Redis
            except ImportError:
                warnings.warn(
                    "RedisSource depends on redis, try: pip install redis. ")
                exit(1)
        self.redis_key = redis_key
        self.redis_conn = Redis(redis_host, redis_port)

    async def __anext__(self):
        """
        An async iterator must implement this method; it is an async method that ultimately returns the next iteration value.
        :return:
        """
        return self.redis_conn.lpop(self.redis_key)

    async def push_back(self, data):
        self.redis_conn.rpush(self.redis_key, data)

    @staticmethod
    def enrich_parser(sub_parser):
        sub_parser.add_argument("-rh", "--redis-host", default="0.0.0.0")
        sub_parser.add_argument("-rp", "--redis-port", default=6379)
        sub_parser.add_argument("-rk", "--redis-key", default="download_meta")
        sub_parser.add_argument("--idle", action="store_true", help="Idle... ")
Example #2
    def setup(self):

        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)
Example #3
 def __init__(self, crawler):
     self.settings = crawler.settings
     self.logger = Logger.from_crawler(crawler)
     if self.settings.getbool("CUSTOM_REDIS"):
         from custom_redis.client import Redis
     else:
         from redis import Redis
     self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                             self.settings.getint("REDIS_PORT"))
     self.queue_name = None
     self.queues = {}
Example #4
 def __init__(self, redis_host, redis_port, redis_key, **kwargs):
     try:
         from custom_redis.client import Redis
     except ImportError:
         try:
             from redis import Redis
         except ImportError:
             warnings.warn(
                 "RedisSource depends on redis, try: pip install redis. ")
             exit(1)
     self.redis_key = redis_key
     self.redis_conn = Redis(redis_host, redis_port)
Example #5
    def setup(self):

        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.redis_conn.delete("crawlid:%s" % self.crawlid)
        self.redis_conn.delete("failed_pages:%s" % self.crawlid)
        self.redis_conn.delete("crawlid:%s:model" % self.crawlid)
Example #6
 def __init__(self, crawler):
     self.settings = crawler.settings
     self.logger = CustomLogger.from_crawler(crawler)
     if self.settings.getbool("CUSTOM_REDIS"):
         from custom_redis.client import Redis
     else:
         from redis import Redis
     self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                             self.settings.getint("REDIS_PORT"))
     self.queue_name = None
     self.queues = {}
     self.request_interval = 60 / self.settings.getint("SPEED", 60)
     self.last_acs_time = time.time()
Example #7
 def __init__(self, settings):
     self.settings_file = settings
     Logger.__init__(self, settings)
     self.set_logger()
     MultiThreadClosing.__init__(self)
     self.de_queue = Queue()
     if self.settings.get("CUSTOM_REDIS"):
         from custom_redis.client import Redis
     else:
         from redis import Redis
     self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                             self.settings.get("REDIS_PORT"))
     self.small = False
Example #8
    def __init__(self, crawler):

        self.settings = crawler.settings
        self.set_logger(crawler)

        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:*:queue"
        self.queues = {}
        self.extract = tldextract.extract
Example #9
def start(crawlid, host, custom):
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis

    redis_conn = Redis(host)
    key = "crawlid:%s" % crawlid
    failed_pages = int(redis_conn.hget(key, "failed_download_pages") or 0)
    format(redis_conn.hgetall(key))
    if failed_pages:
        print_if = raw_input("show the failed pages? y/n default n:")
        if print_if == "y":
            key_ = "failed_pages:%s" % crawlid
            p = redis_conn.hgetall(key_)
            format(p, True)
Example #10
def start(crawlid, host, port, custom):
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    redis_conn = Redis(host, port)
    key = "crawlid:%s" % crawlid
    data = redis_conn.hgetall(key)
    failed_keys = [x for x in data.keys() if fnmatch.fnmatch(
        x.decode() if isinstance(x, bytes) else x, "failed_download_*")]
    format(data)
    for fk in failed_keys:
        fk = fk.decode() if isinstance(fk, bytes) else fk
        print_if = input("show the %s? y/n default n:" % fk.replace("_", " "))
        if print_if == "y":
            key_ = "%s:%s" % (fk, crawlid)
            p = redis_conn.hgetall(key_)
            format(p, True)
Example #11
def start(crawlid, host, custom):
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis

    redis_conn = Redis(host)
    key = "crawlid:%s" % crawlid
    data = redis_conn.hgetall(key)
    failed_keys = filter(lambda x: fnmatch.fnmatch(x, "failed_download_*"),
                         data.keys())
    format(data)
    for fk in failed_keys:
        print_if = raw_input("show the %s? y/n default n:" %
                             fk.replace("_", " "))
        if print_if == "y":
            key_ = "%s:%s" % (fk, crawlid)
            p = redis_conn.hgetall(key_)
            format(p, True)
Example #12
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host, custom):
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)
Example #13
 def __init__(self, settings):
     self.settings_file = settings
     Logger.__init__(self, settings)
     self.set_logger()
     MultiThreadClosing.__init__(self)
     self.de_queue = Queue()
     if self.settings.get("CUSTOM_REDIS"):
         from custom_redis.client import Redis
     else:
         from redis import Redis
     self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                             self.settings.get("REDIS_PORT"))
     self.small = False
Example #14
class MultiDownloadProcess(Logger, MultiThreadClosing):

    name = "multidownload_process"

    def __init__(self, settings):
        self.settings_file = settings
        Logger.__init__(self, settings)
        self.set_logger()
        MultiThreadClosing.__init__(self)
        self.de_queue = Queue()
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.small = False

    @classmethod
    def parse_args(cls):
        parser = ArgumentParser()
        parser.add_argument("-s",
                            "--settings",
                            dest="settings",
                            default="settings.py")
        return cls(**vars(parser.parse_args()))

    def is_small(self):
        self.small = True

    def callback(self, item, flag):
        """
        Callback invoked when the download is finished.
        :return:
        """
        raise NotImplementedError()

    def decode(self, item):
        """
        Decode an item popped from redis into (url, filename, directory) tuples.
        :param item:
        :return: (url, filename, directory)
        """
        raise NotImplementedError()

    def processing(self, de, url_paths, item):
        if self.small:
            downloader = "download_small_file"
        else:
            downloader = "start"
        flag = False
        try:
            t1 = time.time()
            length = len(url_paths)
            for index, (url, filename, path) in enumerate(url_paths):
                result = getattr(de, downloader)(url=url,
                                                 filename=filename,
                                                 path=path)
                self.logger.info("download process %s/%s completed" %
                                 (index + 1, length))
                flag = flag or result
            t2 = time.time()
            self.logger.info("download finished, success:%s, seconds:%.4f" %
                             (flag, t2 - t1))
            self.de_queue.put(de)
            self.callback(item, flag)
            self.logger.info("callback finished, seconds:%.4f" %
                             (time.time() - t2))
        except Exception:
            self.logger.error(traceback.format_exc())
        finally:
            try:
                self.threads.remove(current_thread())
            except ValueError:
                pass
            self.logger.info("the count of thread which alives is %s. " %
                             len(self.threads))

    def start(self):
        self.logger.debug("start process %s" % self.name)
        concurrent_download_count = self.settings.get(
            "CONCURRENT_DOWNLOAD_COUNT", 10)
        for i in range(concurrent_download_count):
            DE = DownloaderEngine(self.settings_file, signal_open=False)
            DE.set_logger(self.logger)
            self.de_queue.put(DE)
        self.logger.debug("setup %s des" % concurrent_download_count)
        while self.alive:
            try:
                item = self.redis_conn.lpop(self.settings.get("QUEUE_KEY"))
            except Exception:
                self.logger.error("redis error %s" % traceback.format_exc())
                item = None
            if not item:
                self.logger.debug("got no message...")
                time.sleep(1)
                continue
            self.logger.debug(
                "%s tasks  to be continue..." %
                self.redis_conn.llen(self.settings.get("QUEUE_KEY")))
            try:
                url_paths = self.decode(item)
            except Exception:
                self.logger.error(traceback.format_exc())
                url_paths = []
            while url_paths:
                try:
                    DE = self.de_queue.get_nowait()
                    th = Thread(target=self.processing,
                                args=(DE, url_paths, item))
                    self.set_force_interrupt(th)
                    self.logger.debug("start a new thread. ")
                    th.start()
                except Empty:
                    time.sleep(1)
                else:
                    break
        while True:
            if any(x.is_alive() for x in self.threads):
                time.sleep(1)
            else:
                break
Example #15
class MultiDownloadProcess(Logger, MultiThreadClosing):

    name = "multidownload_process"

    def __init__(self, settings):
        self.settings_file = settings
        Logger.__init__(self, settings)
        self.set_logger()
        MultiThreadClosing.__init__(self)
        self.de_queue = Queue()
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.small = False

    @classmethod
    def parse_args(cls):
        parser = ArgumentParser()
        parser.add_argument("-s", "--settings", dest="settings", default="settings.py")
        return cls(**vars(parser.parse_args()))

    def is_small(self):
        self.small=True

    def callback(self, item, flag):
        """
        Callback invoked when the download is finished.
        :return:
        """
        raise NotImplementedError()

    def decode(self, item):
        """
        Decode an item popped from redis into (url, filename, directory) tuples.
        :param item:
        :return: (url, filename, directory)
        """
        raise NotImplementedError()

    def processing(self, de, url_paths, item):
        if self.small:
            downloader = "download_small_file"
        else:
            downloader = "start"
        flag = False
        try:
            t1 = time.time()
            length = len(url_paths)
            for index, (url, filename, path) in enumerate(url_paths):
                result = getattr(de, downloader)(url=url, filename=filename, path=path)
                self.logger.info("download process %s/%s completed"%(index+1, length))
                flag = flag or result
            t2 = time.time()
            self.logger.info("download finished, success:%s, seconds:%.4f"%(flag,  t2-t1))
            self.de_queue.put(de)
            self.callback(item, flag)
            self.logger.info("callback finished, seconds:%.4f"%(time.time()-t2))
        except Exception:
            self.logger.error(traceback.format_exc())
        finally:
            try:
                self.threads.remove(current_thread())
            except ValueError:
                pass
            self.logger.info("the count of thread which alives is %s. "%len(self.threads))

    def start(self):
        self.logger.debug("start process %s"%self.name)
        concurrent_download_count = self.settings.get("CONCURRENT_DOWNLOAD_COUNT", 10)
        for i in range(concurrent_download_count):
            DE = DownloaderEngine(self.settings_file, signal_open=False)
            DE.set_logger(self.logger)
            self.de_queue.put(DE)
        self.logger.debug("setup %s des"%concurrent_download_count)
        while self.alive:
            try:
                item = self.redis_conn.lpop(self.settings.get("QUEUE_KEY"))
            except Exception:
                self.logger.error("redis error %s"%traceback.format_exc())
                item = None
            if not item:
                self.logger.debug("got no message...")
                time.sleep(1)
                continue
            self.logger.debug("%s tasks  to be continue..."%self.redis_conn.llen(self.settings.get("QUEUE_KEY")))
            try:
                url_paths = self.decode(item)
            except Exception:
                self.logger.error(traceback.format_exc())
                url_paths = []
            while url_paths:
                try:
                    DE = self.de_queue.get_nowait()
                    th = Thread(target=self.processing, args=(DE, url_paths, item))
                    self.set_force_interrupt(th)
                    self.logger.debug("start a new thread. ")
                    th.start()
                except Empty:
                    time.sleep(1)
                else:
                    break
        while True:
            if any(x.is_alive() for x in self.threads):
                time.sleep(1)
            else:
                break
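MultiDownloadProcess leaves decode and callback abstract, so a concrete subclass must map each queued item to download tasks and report the result. A minimal sketch, assuming a hypothetical JSON item layout of the form {"files": [[url, filename, directory], ...]} (the subclass name and hash key below are illustrative only):

import json

class JsonDownloadProcess(MultiDownloadProcess):

    def decode(self, item):
        # Turn one queued item into a list of (url, filename, directory) tuples.
        data = json.loads(item)
        return [tuple(entry) for entry in data["files"]]

    def callback(self, item, flag):
        # Record the overall success flag so it can be inspected later;
        # "download:results" is a made-up key for this sketch.
        self.redis_conn.hset("download:results", item, int(flag))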
Example #16
class Scheduler(object):
    spider = None

    def __init__(self, crawler):
        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.getint("REDIS_PORT"))
        self.queue_name = None
        self.queues = {}

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        self.spider = spider
        self.queue_name = self.settings.get("TASK_QUEUE_TEMPLATE",
                                            "%s:request:queue") % spider.name
        spider.set_redis(self.redis_conn)

    def enqueue_request(self, request):
        request.callback = getattr(request.callback, "__name__",
                                   request.callback)
        request.errback = getattr(request.errback, "__name__", request.errback)
        self.redis_conn.zadd(self.queue_name, pickle.dumps(request),
                             -int(request.meta["priority"]))
        self.logger.debug("Crawlid: %s, url: %s added to queue. " %
                          (request.meta['crawlid'], request.url))

    def next_request(self):
        self.logger.debug(
            "length of queue %s is %s" %
            (self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None

        if self.settings.getbool("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(self.queue_name)
        else:
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0,
                        0).zremrangebyrank(self.queue_name, 0, 0)
            result, _ = pipe.execute()
            if result:
                item = result[0]

        if item:
            request = pickle.loads(item)
            request.callback = request.callback and getattr(
                self.spider, request.callback)
            request.errback = request.errback and getattr(
                self.spider, request.errback)
            return request

    def close(self, reason):
        self.logger.info("Closing Spider: %s. " % self.spider.name)

    def has_pending_requests(self):
        return False
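In the non-custom branch, next_request emulates an atomic priority pop: zrange reads the member with the lowest score (the highest priority, since scores are stored as the negated priority) and zremrangebyrank removes it inside the same MULTI transaction. The same pattern in isolation, as a sketch where conn is a plain redis.Redis connection:

def priority_pop(conn, queue_name):
    # Read and remove the lowest-score member in one transaction;
    # returns None when the sorted set is empty.
    pipe = conn.pipeline()
    pipe.multi()
    pipe.zrange(queue_name, 0, 0).zremrangebyrank(queue_name, 0, 0)
    result, _ = pipe.execute()
    return result[0] if result else None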
Example #17
class RedisFeed:
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):

        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.extract = extract
        self.setup()

    @classmethod
    def parse_args(cls):

        parser = argparse.ArgumentParser()
        parser.add_argument('-rh',
                            "--redis-host",
                            dest="host",
                            type=str,
                            default="127.0.0.1")
        parser.add_argument('-rp',
                            "--redis-port",
                            dest="port",
                            type=int,
                            default=6379)
        parser.add_argument('-u', '--url', type=str)
        parser.add_argument('-uf', '--urls-file', type=str)
        parser.add_argument('-c', '--crawlid', required=True, type=str)
        parser.add_argument('-s', '--spiderid', required=True, type=str)
        parser.add_argument('-p', '--priority', type=int, default=100)
        parser.add_argument('--custom', action="store_true")
        return cls(**vars(parser.parse_args()))

    def setup(self):

        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.redis_conn.delete("crawlid:%s" % self.crawlid)
        self.redis_conn.delete("failed_pages:%s" % self.crawlid)
        self.redis_conn.delete("crawlid:%s:model" % self.crawlid)

    def start(self):
        sucess_rate, failed_rate = 0, 0
        # item crawl
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"), self.crawlid,
                        self.spiderid, self.priority)
                    self.failed_count += self.feed(self.get_name(url),
                                                   json_req)
                    sucess_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
        # category crawl
        else:
            url_list = self.url.split("     ")
            lines_count = len(url_list)

            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(),
                    self.crawlid,
                    self.spiderid,
                    self.priority,
                )
                self.failed_count += self.feed(self.get_name(url), json_req)
                sucess_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print "\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" % (
            sucess_rate, failed_rate)

    def get_name(self, url):
        ex_res = self.extract(url)
        return "{sid}:{dom}.{suf}:queue".format(sid=self.spiderid,
                                                dom=ex_res.domain,
                                                suf=ex_res.suffix)

    def feed(self, queue_name, req):

        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError

        try:
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):

        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        if num >= self.inc:
            self.inc += per

            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print "\r", str_success_rate,
                color_ctl(std_out_handle, 32)
                print int(success_rate * 30 / 100) * ' ',
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print int(failed_rate * 30 / 100) * ' ',
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print str_failed_rate,
                color_ctl(std_out_handle, 7)
            else:
                print "\r", str_success_rate,
                print "%s%s" % (int(success_rate * 50 / 100) *
                                '\033[42m \033[0m', int(failed_rate * 50 / 100)
                                * '\033[41m \033[0m'), str_failed_rate,

        return success_rate, failed_rate
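Note that every zadd call in these examples passes the member and score positionally (zadd(queue_name, req, -self.priority)), which is the redis-py 2.x calling convention; scores are negated so that higher priorities sort first. With redis-py 3.0 and later, member/score pairs are passed as a mapping, so the equivalent call would be roughly:

# redis-py >= 3.0 expects a {member: score} mapping
redis_conn.zadd(queue_name, {req: -priority})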
Example #18
class Scheduler(Logger):

    def __init__(self, crawler):

        self.settings = crawler.settings
        self.set_logger(crawler)

        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:*:queue"
        self.queues = {}
        self.extract = tldextract.extract

    @classmethod
    def from_crawler(cls, crawler):

        return cls(crawler)

    def open(self, spider):

        self.spider = spider
        self.queue_name = self.queue_name%spider.name
        spider.set_redis(self.redis_conn)
        spider.set_logger(self.logger)

    def request_to_dict(self, request):

        req_dict = {
            'url': request.url.decode('ascii'),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            'callback': None if request.callback is None else request.callback.func_name,
            'errback': None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def enqueue_request(self, request):

        req_dict = self.request_to_dict(request)
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)
        self.redis_conn.zadd(key, json.dumps(req_dict), -int(req_dict["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue"
                          .format(id=req_dict['meta']['crawlid'],
                                  url=req_dict['url']))

    def next_request(self):

        queues = self.redis_conn.keys(self.queue_name)

        if queues:
            queue = random.choice(queues)
            self.logger.info("length of queue %s is %s" %
                             (queue, self.redis_conn.zcard(queue)))

            if self.settings.get("CUSTOM_REDIS"):
                item = self.redis_conn.zpop(queue)
            else:
                pipe = self.redis_conn.pipeline()
                pipe.multi()
                pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
                result, count = pipe.execute()
                item = result[0]

            if item:
                item = json.loads(item)

                try:
                    req = Request(item['url'])
                except ValueError:
                    req = Request('http://' + item['url'])

                if 'callback' in item:
                    cb = item['callback']
                    if cb and self.spider:
                        cb = getattr(self.spider, cb)
                        req.callback = cb

                if 'errback' in item:
                    eb = item['errback']
                    if eb and self.spider:
                        eb = getattr(self.spider, eb)
                        req.errback = eb

                if 'meta' in item:
                    item = item['meta']

                # defaults not in schema
                if 'curdepth' not in item:
                    item['curdepth'] = 0

                if "retry_times" not in item:
                    item['retry_times'] = 0

                for key in item.keys():
                    req.meta[key] = item[key]

                if 'useragent' in item and item['useragent'] is not None:
                    req.headers['User-Agent'] = item['useragent']

                if 'cookie' in item and item['cookie'] is not None:
                    if isinstance(item['cookie'], dict):
                        req.cookies = item['cookie']
                    elif isinstance(item['cookie'], basestring):
                        req.cookies = parse_cookie(item['cookie'])
                return req


    def close(self, reason):

        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):

        return False
Example #19
class Scheduler(object):
    # record the item currently being processed; used when handling exceptions
    present_item = None
    spider = None

    def __init__(self, crawler):

        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:item:queue"
        self.queues = {}
        self.request_interval = 60 / self.settings.getint("SPEED", 60)
        self.last_acs_time = time.time()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)

    def request_to_dict(self, request):

        headers = dict([(item[0].decode("ascii"), item[1])
                        for item in request.headers.items()])
        req_dict = {
            'url': request.url,
            'method': request.method,
            'headers': headers,
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'dont_filter': request.dont_filter,
            'callback': request.callback
            if not isinstance(request.callback, types.MethodType)
            else request.callback.__name__,
            'errback': request.errback
            if not isinstance(request.errback, types.MethodType)
            else request.errback.__name__,
        }
        return req_dict

    @enqueue_request_method_wrapper
    def enqueue_request(self, request):
        req_dict = self.request_to_dict(request)
        self.redis_conn.zadd(self.queue_name, pickle.dumps(req_dict),
                             -int(req_dict["meta"]["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue".format(
            id=req_dict['meta']['crawlid'], url=req_dict['url']))

    @next_request_method_wrapper
    def next_request(self):

        self.logger.info(
            "length of queue %s is %s" %
            (self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None
        if time.time() - self.request_interval < self.last_acs_time:
            return item
        if self.settings.getbool("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(self.queue_name)
        else:
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0,
                        0).zremrangebyrank(self.queue_name, 0, 0)
            result, count = pipe.execute()

            if result:
                item = result[0]

        if item:
            self.last_acs_time = time.time()
            item = pickle.loads(item)
            self.present_item = item
            headers = item.get("headers", {})
            body = item.get("body")
            if item.get("method"):
                method = item.get("method")
            else:
                method = "GET"

            try:
                req = Request(item['url'],
                              method=method,
                              body=body,
                              headers=headers)
            except ValueError:
                req = Request('http://' + item['url'],
                              method=method,
                              body=body,
                              headers=headers)

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = getattr(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = getattr(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0

            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']

            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], (str, bytes)):
                    req.cookies = parse_cookie(item['cookie'])

            return req

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        return False
Example #20
class RedisFeed(object):
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):

        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    @classmethod
    def parse_args(cls):

        parser = argparse.ArgumentParser(description="usage: %prog [options]")
        parser.add_argument('-rh',
                            "--redis-host",
                            dest="host",
                            type=str,
                            default="127.0.0.1",
                            help="Redis host to feed in. ")
        parser.add_argument('-rp',
                            "--redis-port",
                            dest="port",
                            type=int,
                            default=6379,
                            help="Redis port to feed in. ")
        parser.add_argument('-u',
                            '--url',
                            type=str,
                            help="The url to crawl, a list of products. ")
        parser.add_argument('-uf',
                            '--urls-file',
                            type=str,
                            help="The urlsfile to crawl, single product. ")
        parser.add_argument('-c',
                            '--crawlid',
                            required=True,
                            type=str,
                            help="An unique Id for a crawl task. ")
        parser.add_argument('-s',
                            '--spiderid',
                            required=True,
                            type=str,
                            help="The website you wanna crawl. ")
        parser.add_argument('-p',
                            '--priority',
                            type=int,
                            default=100,
                            help="Feed in the task queue with priority. ")
        parser.add_argument('--custom',
                            action="store_true",
                            help="Use the custom redis whether or not. ")
        return cls(**vars(parser.parse_args()))

    def clean_previous_task(self, crawlid):
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)

        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        sucess_rate, failed_rate = 0, 0
        # item crawl
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"), self.crawlid,
                        self.spiderid, self.priority)
                    self.failed_count += self.feed(self.get_name(), json_req)
                    sucess_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
                self.redis_conn.expire("crawlid:%s" % self.crawlid,
                                       2 * 24 * 60 * 60)
        # category crawl
        else:
            url_list = self.url.split("     ")
            lines_count = len(url_list)

            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(),
                    self.crawlid,
                    self.spiderid,
                    self.priority,
                )
                self.failed_count += self.feed(self.get_name(), json_req)
                sucess_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" %
              (sucess_rate, failed_rate))

    def get_name(self):
        return "{sid}:item:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):

        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError

        try:
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):

        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        if num >= self.inc:
            self.inc += per

            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, "")
                print(
                    "%s%s" %
                    (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                     int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate)

        return success_rate, failed_rate
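Building json_req with %-formatting works for plain URLs but produces invalid JSON if a URL ever contains a double quote or backslash. Purely as a sketch, the category branch inside start could serialize the request dict instead (same field names as the example above):

import json

json_req = json.dumps({
    "url": url.strip(),
    "crawlid": self.crawlid,
    "spiderid": self.spiderid,
    "callback": "parse",
    "priority": self.priority,
})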
Example #21
class SpiderFeeder(object):
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    def clean_previous_task(self, crawlid):
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)
        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        success_rate, failed_rate = 0, 0
        # item crawl
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    req = Request(url=url.strip("\357\273\277\r\n"),
                                  callback="parse_item",
                                  meta={
                                      "crawlid": self.crawlid,
                                      "spiderid": self.spiderid,
                                      "priority": self.priority
                                  })
                    self.failed_count += self.feed(self.get_name(),
                                                   pickle.dumps(req))
                    success_rate, failed_rate = \
                        self.show_process_line(
                            lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
        # category crawl
        else:
            url_list = self.url.split("     ")
            lines_count = len(url_list)

            for index, url in enumerate(url_list):
                req = Request(url=url.strip(),
                              callback="parse",
                              meta={
                                  "crawlid": self.crawlid,
                                  "spiderid": self.spiderid,
                                  "priority": self.priority
                              })
                self.failed_count += self.feed(self.get_name(),
                                               pickle.dumps(req))
                success_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" %
              (success_rate, failed_rate))

    def get_name(self):
        return "{sid}:request:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, "")
                print(
                    "%s%s" %
                    (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                     int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate)
        return success_rate, failed_rate