Ejemplo n.º 1
0
class RedisFeed(object):
    """Feed crawl tasks for one spider into a Redis sorted-set queue.

    URLs come either from ``urls_file`` (one URL per line, queued with the
    ``parse_item`` callback) or from ``url`` (several URLs separated by five
    spaces, queued with the ``parse`` callback).  While feeding, a coloured
    progress bar is redrawn in place on stdout.
    """

    # Separator between the URLs passed through ``url``.
    _URL_SEPARATOR = "     "
    # Characters stripped from both ends of every line of the URLs file:
    # a UTF-8 BOM, either decoded properly ("\ufeff") or decoded byte by byte
    # ("\357\273\277"), plus line terminators.
    _LINE_STRIP_CHARS = "\ufeff\357\273\277\r\n"

    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):
        """Store the task parameters, connect to Redis and clear stale keys.

        Args:
            crawlid: identifier of this crawl task.
            spiderid: name of the spider whose queue is fed.
            url: URLs separated by five spaces (used when ``urls_file`` is
                falsy).
            urls_file: path of a text file holding one URL per line.
            priority: task priority; stored negated as the sorted-set score.
            port: Redis port.
            host: Redis host.
            custom: when truthy use ``custom_redis.client.Redis`` instead of
                ``redis.Redis``.
        """
        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        # Progress threshold: the bar is redrawn once ``num`` reaches it.
        self.inc = 0
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    @classmethod
    def parse_args(cls):
        """Build an instance from the command-line arguments."""
        # argparse expands "%(prog)s"; the optparse-style "%prog" would be
        # printed literally.
        parser = argparse.ArgumentParser(
            description="usage: %(prog)s [options]")
        parser.add_argument('-rh',
                            "--redis-host",
                            dest="host",
                            type=str,
                            default="127.0.0.1",
                            help="Redis host to feed in. ")
        parser.add_argument('-rp',
                            "--redis-port",
                            dest="port",
                            type=int,
                            default=6379,
                            help="Redis port to feed in. ")
        parser.add_argument('-u',
                            '--url',
                            type=str,
                            help="The url to crawl, a list of products. ")
        parser.add_argument('-uf',
                            '--urls-file',
                            type=str,
                            help="The urlsfile to crawl, single product. ")
        parser.add_argument('-c',
                            '--crawlid',
                            required=True,
                            type=str,
                            help="An unique Id for a crawl task. ")
        parser.add_argument('-s',
                            '--spiderid',
                            required=True,
                            type=str,
                            help="The website you wanna crawl. ")
        parser.add_argument('-p',
                            '--priority',
                            type=int,
                            default=100,
                            help="Feed in the task queue with priority. ")
        parser.add_argument('--custom',
                            action="store_true",
                            help="Use the custom redis whether or not. ")
        return cls(**vars(parser.parse_args()))

    def clean_previous_task(self, crawlid):
        """Delete the Redis keys left behind by an earlier run of ``crawlid``."""
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)

        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        """Queue every configured URL and print a final summary line.

        Raises:
            ValueError: if neither ``urls_file`` nor ``url`` was supplied.
        """
        if self.urls_file:
            # Item crawl: one product URL per line of the file.
            with open(self.urls_file) as f:
                urls = [line.strip(self._LINE_STRIP_CHARS) for line in f]
            sucess_rate, failed_rate = self._feed_all(urls, "parse_item")
            stats_key = "crawlid:%s" % self.crawlid
            self.redis_conn.hset(stats_key, "total_pages", len(urls))
            self.redis_conn.expire(stats_key, 2 * 24 * 60 * 60)
        elif self.url:
            # Category crawl: listing URLs given directly on the command line.
            urls = [u.strip() for u in self.url.split(self._URL_SEPARATOR)]
            sucess_rate, failed_rate = self._feed_all(urls, "parse")
        else:
            raise ValueError("either 'urls_file' or 'url' must be provided")
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" %
              (sucess_rate, failed_rate))

    def _feed_all(self, urls, callback):
        """Queue each URL with ``callback``; return the last drawn rates."""
        success_rate, failed_rate = 0, 0
        total = len(urls)
        queue_name = self.get_name()
        for position, url in enumerate(urls, 1):
            self.failed_count += self.feed(
                queue_name, self._build_request(url, callback))
            success_rate, failed_rate = self.show_process_line(
                total, position, self.failed_count)
        return success_rate, failed_rate

    def _build_request(self, url, callback):
        """Serialise one task as compact JSON, escaping the URL safely."""
        import json
        return json.dumps(
            {
                "url": url,
                "crawlid": self.crawlid,
                "spiderid": self.spiderid,
                "callback": callback,
                "priority": self.priority,
            },
            separators=(",", ":"),
            ensure_ascii=False)

    def get_name(self):
        """Return the name of the sorted set that holds this spider's tasks."""
        return "{sid}:item:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        """Add ``req`` to ``queue_name``; return 0 on success, 1 on failure.

        A ``RedisError`` is printed with its traceback and counted as a
        failure instead of being propagated.
        """
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError

        try:
            # NOTE(review): positional (member, score) arguments; confirm this
            # matches the zadd signature of the installed Redis client.
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        """Redraw the progress bar and return ``(success_rate, failed_rate)``.

        Args:
            count: total number of tasks.
            num: number of tasks processed so far.
            failed: number of tasks that failed so far.

        The bar is redrawn when ``num`` reaches the ``self.inc`` threshold
        (advanced by ``count / 100`` on every redraw) and always for the last
        task.  Every fragment is written without a trailing newline so the
        leading carriage return overwrites the previous bar.
        """
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        if num >= self.inc or num == count:
            self.inc += per

            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                # Flush after each fragment so it is written while the
                # console attribute set just before it is still active.
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, end=" ", flush=True)
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', end=" ", flush=True)
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', end=" ",
                          flush=True)
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, end=" ", flush=True)
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, end=" ")
                print(
                    "%s%s" %
                    (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                     int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate,
                    end=" ",
                    flush=True)

        return success_rate, failed_rate
Ejemplo n.º 2
0
class Scheduler(object):
    """Scrapy scheduler that keeps pickled requests in a Redis sorted set.

    Requests are scored by the negated ``meta["priority"]`` and dequeued from
    rank 0 of the set.
    """
    spider = None

    def __init__(self, crawler):
        """Read the crawler settings and open the Redis connection."""
        settings = crawler.settings
        self.settings = settings
        self.logger = Logger.from_crawler(crawler)
        if settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        redis_host = settings.get("REDIS_HOST")
        redis_port = settings.getint("REDIS_PORT")
        self.redis_conn = Redis(redis_host, redis_port)
        self.queue_name = None
        self.queues = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor taking the crawler."""
        return cls(crawler)

    def open(self, spider):
        """Bind the spider, derive its queue name and hand it the connection."""
        self.spider = spider
        template = self.settings.get("TASK_QUEUE_TEMPLATE",
                                     "%s:request:queue")
        self.queue_name = template % spider.name
        spider.set_redis(self.redis_conn)

    def enqueue_request(self, request):
        """Pickle ``request`` into the queue.

        The callback and errback are replaced in place by their ``__name__``
        (when they have one) before pickling.
        """
        for attr in ("callback", "errback"):
            handler = getattr(request, attr)
            setattr(request, attr, getattr(handler, "__name__", handler))
        payload = pickle.dumps(request)
        score = -int(request.meta["priority"])
        self.redis_conn.zadd(self.queue_name, payload, score)
        self.logger.debug("Crawlid: %s, url: %s added to queue. " %
                          (request.meta['crawlid'], request.url))

    def next_request(self):
        """Pop the rank-0 request from the queue, or return None if empty."""
        queue = self.queue_name
        conn = self.redis_conn
        self.logger.debug("length of queue %s is %s" %
                          (queue, conn.zcard(queue)))

        if self.settings.getbool("CUSTOM_REDIS"):
            raw = conn.zpop(queue)
        else:
            # Read and remove the first member inside one MULTI transaction.
            pipe = conn.pipeline()
            pipe.multi()
            pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
            members, _ = pipe.execute()
            raw = members[0] if members else None

        if not raw:
            return None

        # NOTE(review): pickle.loads executes whatever the payload encodes;
        # the Redis instance must only be writable by trusted producers.
        request = pickle.loads(raw)
        # Resolve the stored handler names back to bound spider methods.
        for attr in ("callback", "errback"):
            handler_name = getattr(request, attr)
            setattr(request, attr,
                    handler_name and getattr(self.spider, handler_name))
        return request

    def close(self, reason):
        """Log that the spider is closing."""
        self.logger.info("Closing Spider: %s. " % self.spider.name)

    def has_pending_requests(self):
        """Always report no pending requests."""
        return False
Ejemplo n.º 3
0
class RedisFeed:
    """Command-line helper that queues crawl tasks for a spider in Redis.

    Tasks are JSON strings added to a sorted set whose name is derived from
    the spider id and the domain/suffix of each URL (see ``get_name``).

    NOTE(review): this class uses Python 2 print statements.
    """
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):
        """Store the task parameters and run ``setup``."""

        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        # Progress threshold compared against ``num`` in show_process_line.
        self.inc = 0
        # NOTE(review): ``extract`` is neither a parameter nor a local here;
        # presumably a module-level import (e.g. tldextract.extract) -- confirm.
        self.extract = extract
        self.setup()

    @classmethod
    def parse_args(cls):
        """Build an instance from the command-line arguments."""

        parser = argparse.ArgumentParser()
        parser.add_argument('-rh',
                            "--redis-host",
                            dest="host",
                            type=str,
                            default="127.0.0.1")
        parser.add_argument('-rp',
                            "--redis-port",
                            dest="port",
                            type=int,
                            default=6379)
        parser.add_argument('-u', '--url', type=str)
        parser.add_argument('-uf', '--urls-file', type=str)
        parser.add_argument('-c', '--crawlid', required=True, type=str)
        parser.add_argument('-s', '--spiderid', required=True, type=str)
        parser.add_argument('-p', '--priority', type=int, default=100)
        parser.add_argument('--custom', action="store_true")
        # The parsed option names match the __init__ parameter names.
        return cls(**vars(parser.parse_args()))

    def setup(self):
        """Reset the counters, connect to Redis and delete this crawl's keys."""

        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        # Choose the Redis client implementation based on the --custom flag.
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.redis_conn.delete("crawlid:%s" % self.crawlid)
        self.redis_conn.delete("failed_pages:%s" % self.crawlid)
        self.redis_conn.delete("crawlid:%s:model" % self.crawlid)

    def start(self):
        """Queue every URL, drawing progress, then print a summary line.

        NOTE(review): when ``urls_file`` is falsy and ``url`` is None the
        ``self.url.split`` call below raises AttributeError.
        """
        sucess_rate, failed_rate = 0, 0
        # Item crawl: one URL per line of the file, callback "parse_item".
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    # NOTE(review): the URL is interpolated unescaped, so a
                    # URL containing '"' or '\' yields invalid JSON.
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"), self.crawlid,
                        self.spiderid, self.priority)
                    self.failed_count += self.feed(self.get_name(url),
                                                   json_req)
                    sucess_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
        # Category crawl: URLs from ``self.url`` separated by five spaces,
        # callback "parse".
        else:
            url_list = self.url.split("     ")
            lines_count = len(url_list)

            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(),
                    self.crawlid,
                    self.spiderid,
                    self.priority,
                )
                self.failed_count += self.feed(self.get_name(url), json_req)
                sucess_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print "\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" % (
            sucess_rate, failed_rate)

    def get_name(self, url):
        """Return the queue name "<spiderid>:<domain>.<suffix>:queue" for ``url``."""
        ex_res = self.extract(url)
        return "{sid}:{dom}.{suf}:queue".format(sid=self.spiderid,
                                                dom=ex_res.domain,
                                                suf=ex_res.suffix)

    def feed(self, queue_name, req):
        """Add ``req`` to ``queue_name``; return 0 on success, 1 on RedisError."""

        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError

        try:
            # Score is the negated priority.
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            # Print the traceback and count the task as failed.
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        """Draw the progress bar and return ``(success_rate, failed_rate)``.

        ``count`` is the total number of tasks, ``num`` the number processed
        so far and ``failed`` the number of failures so far.  Both rates are
        percentages of ``count``.
        """

        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        # Redraw only once ``num`` has reached the threshold, then advance it.
        if num >= self.inc:
            self.inc += per

            if sys.platform == "win32":
                # Windows console: colours are set through kernel32 calls.
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print "\r", str_success_rate,
                color_ctl(std_out_handle, 32)
                print int(success_rate * 30 / 100) * ' ',
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print int(failed_rate * 30 / 100) * ' ',
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print str_failed_rate,
                color_ctl(std_out_handle, 7)
            else:
                # Other platforms: ANSI escapes, green cells for success and
                # red cells for failure; the trailing commas suppress the
                # newline.
                print "\r", str_success_rate,
                print "%s%s" % (int(success_rate * 50 / 100) *
                                '\033[42m \033[0m', int(failed_rate * 50 / 100)
                                * '\033[41m \033[0m'), str_failed_rate,

        return success_rate, failed_rate
Ejemplo n.º 4
0
class Scheduler(Logger):
    """Scrapy scheduler storing JSON-encoded requests in per-domain Redis queues.

    Each request goes into the sorted set "<spiderid>:<domain>.<suffix>:queue";
    ``next_request`` picks one of the spider's existing queues at random and
    pops its rank-0 member.
    """

    def __init__(self, crawler):
        """Read the crawler settings and open the Redis connection."""

        self.settings = crawler.settings
        self.set_logger(crawler)

        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        # Key pattern; the spider name is filled in by open().
        self.queue_name = "%s:*:queue"
        self.queues = {}
        self.extract = tldextract.extract

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor taking the crawler."""

        return cls(crawler)

    def open(self, spider):
        """Bind the spider and share the Redis connection and logger with it."""

        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)
        spider.set_logger(self.logger)

    def request_to_dict(self, request):
        """Return a JSON-serialisable dict describing ``request``.

        The callback and errback are stored by name so that ``next_request``
        can resolve them on the spider again.
        """
        callback, errback = request.callback, request.errback
        req_dict = {
            'url': request.url.decode('ascii'),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            # ``__name__`` is available wherever ``func_name`` was.
            'callback': None if callback is None else callback.__name__,
            'errback': None if errback is None else errback.__name__,
        }
        return req_dict

    def enqueue_request(self, request):
        """Serialise ``request`` and add it to the queue of its domain."""

        req_dict = self.request_to_dict(request)
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)
        self.redis_conn.zadd(key, json.dumps(req_dict),
                             -int(req_dict["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue"
                          .format(id=req_dict['meta']['crawlid'],
                                  url=req_dict['url']))

    def next_request(self):
        """Pop one task from a randomly chosen queue and build a Request.

        Returns None when no queue exists or the chosen queue yields nothing.
        """

        queues = self.redis_conn.keys(self.queue_name)
        if not queues:
            return None

        queue = random.choice(queues)
        self.logger.info("length of queue %s is %s" %
                         (queue, self.redis_conn.zcard(queue)))

        if self.settings.get("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(queue)
        else:
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
            result, count = pipe.execute()
            # The queue can be emptied between KEYS and this transaction
            # (e.g. by another worker); an empty range must not raise.
            item = result[0] if result else None

        if not item:
            return None

        item = json.loads(item)

        try:
            req = Request(item['url'])
        except ValueError:
            # Retry with an explicit scheme when the first attempt is rejected.
            req = Request('http://' + item['url'])

        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                req.callback = getattr(self.spider, cb)

        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                req.errback = getattr(self.spider, eb)

        # From here on only the meta mapping is used, when there is one.
        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0

        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in item.keys():
            req.meta[key] = item[key]

        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']

        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = parse_cookie(item['cookie'])
        return req

    def close(self, reason):
        """Log that the spider is closing."""

        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        """Always report no pending requests."""

        return False
Ejemplo n.º 5
0
class Scheduler(object):
    """Rate-limited Scrapy scheduler keeping pickled request dicts in Redis.

    Requests live in the sorted set "<spider name>:item:queue" and at most one
    is handed out per ``request_interval`` seconds.
    """
    # The item currently being processed; used when handling exceptions.
    present_item = None
    spider = None

    def __init__(self, crawler):
        """Read the crawler settings and open the Redis connection."""
        settings = crawler.settings
        self.settings = settings
        self.logger = Logger.from_crawler(crawler)
        if settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        redis_host = settings.get("REDIS_HOST")
        redis_port = settings.get("REDIS_PORT")
        self.redis_conn = Redis(redis_host, redis_port)
        # Template; the spider name is filled in by open().
        self.queue_name = "%s:item:queue"
        self.queues = {}
        # Seconds between two dequeues, derived from the SPEED setting.
        self.request_interval = 60 / settings.getint("SPEED", 60)
        self.last_acs_time = time.time()

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor taking the crawler."""
        return cls(crawler)

    def open(self, spider):
        """Bind the spider, finalise the queue name and share the connection."""
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)

    def request_to_dict(self, request):
        """Return a plain dict describing ``request``.

        Header names are decoded from ASCII bytes; bound-method callbacks and
        errbacks are replaced by their names, anything else is stored as is.
        """

        def handler_ref(handler):
            if isinstance(handler, types.MethodType):
                return handler.__name__
            return handler

        headers = {
            name.decode("ascii"): value
            for name, value in request.headers.items()
        }
        return {
            'url': request.url,
            'method': request.method,
            'headers': headers,
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'dont_filter': request.dont_filter,
            'callback': handler_ref(request.callback),
            'errback': handler_ref(request.errback),
        }

    @enqueue_request_method_wrapper
    def enqueue_request(self, request):
        """Pickle the request dict into the queue, scored by negated priority."""
        req_dict = self.request_to_dict(request)
        meta = req_dict["meta"]
        payload = pickle.dumps(req_dict)
        score = -int(meta["priority"])
        self.redis_conn.zadd(self.queue_name, payload, score)
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue".format(
            id=req_dict['meta']['crawlid'], url=req_dict['url']))

    @next_request_method_wrapper
    def next_request(self):
        """Return the next Request, or None when throttled or the queue is empty."""
        queue = self.queue_name
        conn = self.redis_conn
        self.logger.info("length of queue %s is %s" %
                         (queue, conn.zcard(queue)))

        # Throttle: the previous dequeue happened less than one interval ago.
        if time.time() - self.request_interval < self.last_acs_time:
            return None

        if self.settings.getbool("CUSTOM_REDIS"):
            raw = conn.zpop(queue)
        else:
            # Read and remove the first member inside one MULTI transaction.
            pipe = conn.pipeline()
            pipe.multi()
            pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
            members, count = pipe.execute()
            raw = members[0] if members else None

        if not raw:
            return None

        self.last_acs_time = time.time()
        # NOTE(review): pickle.loads executes whatever the payload encodes;
        # the Redis instance must only be writable by trusted producers.
        item = pickle.loads(raw)
        self.present_item = item

        request_kwargs = {
            "method": item.get("method") or "GET",
            "body": item.get("body"),
            "headers": item.get("headers", {}),
        }
        try:
            req = Request(item['url'], **request_kwargs)
        except ValueError:
            # Retry with an explicit scheme when the first attempt is rejected.
            req = Request('http://' + item['url'], **request_kwargs)

        # Resolve stored handler names back to attributes of the spider.
        for attr in ('callback', 'errback'):
            if attr in item:
                handler_name = item[attr]
                if handler_name and self.spider:
                    setattr(req, attr, getattr(self.spider, handler_name))

        # From here on only the meta mapping is used, when there is one.
        meta = item['meta'] if 'meta' in item else item

        # defaults not in schema
        meta.setdefault('curdepth', 0)
        meta.setdefault('retry_times', 0)

        for key, value in meta.items():
            req.meta[key] = value

        user_agent = meta.get('useragent')
        if user_agent is not None:
            req.headers['User-Agent'] = user_agent

        cookie = meta.get('cookie')
        if cookie is not None:
            if isinstance(cookie, dict):
                req.cookies = cookie
            elif isinstance(cookie, (str, bytes)):
                req.cookies = parse_cookie(cookie)

        return req

    def close(self, reason):
        """Log that the spider is closing."""
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        """Always report no pending requests."""
        return False
Ejemplo n.º 6
0
class SpiderFeeder(object):
    """Feed pickled ``Request`` objects for one spider into a Redis queue.

    URLs come either from ``urls_file`` (one URL per line, callback
    ``parse_item``) or from ``url`` (several URLs separated by five spaces,
    callback ``parse``).  While feeding, a coloured progress bar is redrawn in
    place on stdout.
    """

    # Separator between the URLs passed through ``url``.
    _URL_SEPARATOR = "     "
    # Characters stripped from both ends of every line of the URLs file:
    # a UTF-8 BOM, either decoded properly ("\ufeff") or decoded byte by byte
    # ("\357\273\277"), plus line terminators.
    _LINE_STRIP_CHARS = "\ufeff\357\273\277\r\n"

    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):
        """Store the task parameters, connect to Redis and clear stale keys.

        Args:
            crawlid: identifier of this crawl task.
            spiderid: name of the spider whose queue is fed.
            url: URLs separated by five spaces (used when ``urls_file`` is
                falsy).
            urls_file: path of a text file holding one URL per line.
            priority: task priority; stored negated as the sorted-set score.
            port: Redis port.
            host: Redis host.
            custom: when truthy use ``custom_redis.client.Redis`` instead of
                ``redis.Redis``.
        """
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        # Progress threshold: the bar is redrawn once ``num`` reaches it.
        self.inc = 0
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    def clean_previous_task(self, crawlid):
        """Delete the Redis keys left behind by an earlier run of ``crawlid``."""
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)
        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        """Queue every configured URL and print a final summary line.

        Raises:
            ValueError: if neither ``urls_file`` nor ``url`` was supplied.
        """
        if self.urls_file:
            # Item crawl: one product URL per line of the file.
            with open(self.urls_file) as f:
                urls = [line.strip(self._LINE_STRIP_CHARS) for line in f]
            success_rate, failed_rate = self._feed_all(urls, "parse_item")
            self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                 "total_pages", len(urls))
        elif self.url:
            # Category crawl: listing URLs given directly by the caller.
            urls = [u.strip() for u in self.url.split(self._URL_SEPARATOR)]
            success_rate, failed_rate = self._feed_all(urls, "parse")
        else:
            raise ValueError("either 'urls_file' or 'url' must be provided")
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" %
              (success_rate, failed_rate))

    def _feed_all(self, urls, callback):
        """Queue each URL with ``callback``; return the last drawn rates."""
        success_rate, failed_rate = 0, 0
        total = len(urls)
        queue_name = self.get_name()
        for position, url in enumerate(urls, 1):
            req = Request(url=url,
                          callback=callback,
                          meta={
                              "crawlid": self.crawlid,
                              "spiderid": self.spiderid,
                              "priority": self.priority
                          })
            self.failed_count += self.feed(queue_name, pickle.dumps(req))
            success_rate, failed_rate = self.show_process_line(
                total, position, self.failed_count)
        return success_rate, failed_rate

    def get_name(self):
        """Return the name of the sorted set that holds this spider's requests."""
        return "{sid}:request:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        """Add ``req`` to ``queue_name``; return 0 on success, 1 on failure.

        A ``RedisError`` is printed with its traceback and counted as a
        failure instead of being propagated.
        """
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            # NOTE(review): positional (member, score) arguments; confirm this
            # matches the zadd signature of the installed Redis client.
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        """Redraw the progress bar and return ``(success_rate, failed_rate)``.

        Args:
            count: total number of tasks.
            num: number of tasks processed so far.
            failed: number of tasks that failed so far.

        The bar is redrawn when ``num`` reaches the ``self.inc`` threshold
        (advanced by ``count / 100`` on every redraw) and always for the last
        task.  Every fragment is written without a trailing newline so the
        leading carriage return overwrites the previous bar.
        """
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        if num >= self.inc or num == count:
            self.inc += per
            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                # Flush after each fragment so it is written while the
                # console attribute set just before it is still active.
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, end=" ", flush=True)
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', end=" ", flush=True)
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', end=" ",
                          flush=True)
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, end=" ", flush=True)
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, end=" ")
                print(
                    "%s%s" %
                    (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                     int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate,
                    end=" ",
                    flush=True)
        return success_rate, failed_rate