class RedisSource(Source):
    """Async task source backed by a Redis list keyed by *redis_key*."""

    def __init__(self, redis_host, redis_port, redis_key, **kwargs):
        # Prefer the project's custom redis client; fall back to redis-py.
        try:
            from custom_redis.client import Redis
        except ImportError:
            try:
                from redis import Redis
            except ImportError:
                warnings.warn(
                    "RedisSource depends on redis, try: pip install redis. ")
                # NOTE(review): exits the whole process from a constructor
                # when no redis client is installed -- consider raising
                # ImportError instead.
                exit(1)
        self.redis_key = redis_key
        self.redis_conn = Redis(redis_host, redis_port)

    async def __anext__(self):
        """
        Async-iterator protocol method; returns the next queued value.

        NOTE(review): lpop presumably returns None when the list is empty,
        so this never raises StopAsyncIteration -- confirm callers expect
        None for an empty queue.
        """
        return self.redis_conn.lpop(self.redis_key)

    async def push_back(self, data):
        # Return an unprocessed item to the tail of the queue.
        self.redis_conn.rpush(self.redis_key, data)

    @staticmethod
    def enrich_parser(sub_parser):
        # Register this source's command line options on the sub-parser.
        sub_parser.add_argument("-rh", "--redis-host", default="0.0.0.0")
        sub_parser.add_argument("-rp", "--redis-port", default=6379)
        sub_parser.add_argument("-rk", "--redis-key", default="download_meta")
        sub_parser.add_argument("--idle", action="store_true", help="Idle... ")
def setup(self):
    """Reset feed counters, connect to redis and clear the previous task."""
    # NOTE: "sucess_rate" (sic) is the attribute name used elsewhere.
    self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
    # self.custom switches between the project's custom redis client and
    # the standard redis-py client.
    if self.custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(host=self.host, port=self.port)
    # Drop state left over from an earlier run with the same crawlid.
    self.clean_previous_task(self.crawlid)
def __init__(self, crawler):
    """Build the scheduler from a crawler: settings, logger, redis conn."""
    self.settings = crawler.settings
    self.logger = Logger.from_crawler(crawler)
    # CUSTOM_REDIS selects the project's custom redis client implementation.
    if self.settings.getbool("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.getint("REDIS_PORT"))
    self.queue_name = None  # presumably assigned when the spider opens -- confirm
    self.queues = {}
def __init__(self, redis_host, redis_port, redis_key, **kwargs):
    """Connect to redis; prefer the custom client, fall back to redis-py."""
    try:
        from custom_redis.client import Redis
    except ImportError:
        try:
            from redis import Redis
        except ImportError:
            warnings.warn(
                "RedisSource depends on redis, try: pip install redis. ")
            # NOTE(review): exits the whole process when no redis client
            # is installed -- consider raising ImportError instead.
            exit(1)
    self.redis_key = redis_key
    self.redis_conn = Redis(redis_host, redis_port)
def setup(self):
    """Reset counters, connect to redis, delete the previous task's keys."""
    # NOTE: "sucess_rate" (sic) is the attribute name used elsewhere.
    self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
    if self.custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(host=self.host, port=self.port)
    # Remove stats, failure and model keys left by an earlier crawl
    # with the same crawlid.
    self.redis_conn.delete("crawlid:%s" % self.crawlid)
    self.redis_conn.delete("failed_pages:%s" % self.crawlid)
    self.redis_conn.delete("crawlid:%s:model" % self.crawlid)
def __init__(self, crawler):
    """Scheduler setup: settings, logger, redis connection, rate limiting."""
    self.settings = crawler.settings
    self.logger = CustomLogger.from_crawler(crawler)
    if self.settings.getbool("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.getint("REDIS_PORT"))
    self.queue_name = None
    self.queues = {}
    # SPEED is presumably requests per minute; the interval is the minimum
    # gap in seconds between dequeues -- confirm against next_request().
    self.request_interval = 60 / self.settings.getint("SPEED", 60)
    self.last_acs_time = time.time()
def __init__(self, settings):
    """Initialise logging, the downloader-engine pool and the redis conn."""
    self.settings_file = settings
    Logger.__init__(self, settings)
    self.set_logger()
    MultiThreadClosing.__init__(self)
    # Pool of idle downloader engines shared by worker threads.
    self.de_queue = Queue()
    if self.settings.get("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.get("REDIS_PORT"))
    # Switch for the small-file download path.
    self.small = False
def __init__(self, crawler):
    """Scheduler setup with per-domain queues named '<spider>:<domain>:queue'."""
    self.settings = crawler.settings
    self.set_logger(crawler)
    if self.settings.get("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.get("REDIS_PORT"))
    # Template: first %s is the spider name; '*' matches every domain.
    self.queue_name = "%s:*:queue"
    self.queues = {}
    # tldextract splits urls into (subdomain, domain, suffix).
    self.extract = tldextract.extract
def start(crawlid, host, custom):
    """Print crawl statistics for *crawlid*; optionally dump failed pages.

    Python 2 code (uses raw_input).
    """
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    redis_conn = Redis(host)
    key = "crawlid:%s" % crawlid
    failed_pages = int(redis_conn.hget(key, "failed_download_pages") or 0)
    # `format` here is presumably a module-level pretty-printer that
    # shadows the builtin -- confirm against the rest of the file.
    format(redis_conn.hgetall(key))
    if failed_pages:
        print_if = raw_input("show the failed pages? y/n default n:")
        if print_if == "y":
            key_ = "failed_pages:%s" % crawlid
            p = redis_conn.hgetall(key_)
            format(p, True)
def start(crawlid, host, port, custom):
    """Show crawl stats for *crawlid*; interactively dump failed_* details."""
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    redis_conn = Redis(host, port)
    key = "crawlid:%s" % crawlid
    data = redis_conn.hgetall(key)
    # redis-py may return bytes keys; decode before pattern matching.
    failed_keys = [x for x in data.keys() if fnmatch.fnmatch(
        x.decode() if isinstance(x, bytes) else x, "failed_download_*")]
    # `format` is presumably a module-level pretty-printer that shadows
    # the builtin -- confirm against the rest of the file.
    format(data)
    for fk in failed_keys:
        fk = fk.decode() if isinstance(fk, bytes) else fk
        print_if = input("show the %s? y/n default n:" % fk.replace("_", " "))
        if print_if == "y":
            key_ = "%s:%s" % (fk, crawlid)
            p = redis_conn.hgetall(key_)
            format(p, True)
def start(crawlid, host, custom):
    """Show crawl stats for *crawlid*; interactively dump failed_* details.

    Python 2 code (raw_input; filter returns a list here).
    """
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    redis_conn = Redis(host)
    key = "crawlid:%s" % crawlid
    data = redis_conn.hgetall(key)
    failed_keys = filter(lambda x: fnmatch.fnmatch(x, "failed_download_*"),
                         data.keys())
    # `format` is presumably a module-level pretty-printer -- confirm.
    format(data)
    for fk in failed_keys:
        print_if = raw_input("show the %s? y/n default n:" % fk.replace("_", " "))
        if print_if == "y":
            key_ = "%s:%s" % (fk, crawlid)
            p = redis_conn.hgetall(key_)
            format(p, True)
def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
             custom):
    """Store feed parameters, connect to redis, clean the previous task."""
    self.crawlid = crawlid
    self.spiderid = spiderid
    self.url = url
    self.urls_file = urls_file
    self.priority = priority
    self.port = port
    self.host = host
    self.custom = custom
    self.inc = 0  # progress threshold used by the progress-bar printer
    # NOTE: "sucess_rate" (sic) is the attribute name used elsewhere.
    self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
    if self.custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(host=self.host, port=self.port)
    self.clean_previous_task(self.crawlid)
class MultiDownloadProcess(Logger, MultiThreadClosing):
    """Multi-threaded download worker fed by a redis list queue.

    Pops serialized download tasks from redis, hands each one to an idle
    DownloaderEngine from a pool, and reports completion via callback().
    Subclasses must implement decode() and callback().
    """

    name = "multidownload_process"

    def __init__(self, settings):
        self.settings_file = settings
        Logger.__init__(self, settings)
        self.set_logger()
        MultiThreadClosing.__init__(self)
        # Pool of idle DownloaderEngine instances shared by worker threads.
        self.de_queue = Queue()
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        # When True, use the small-file downloader instead of start().
        self.small = False

    @classmethod
    def parse_args(cls):
        """Build an instance from the -s/--settings command line option."""
        parser = ArgumentParser()
        parser.add_argument("-s", "--settings", dest="settings",
                            default="settings.py")
        return cls(**vars(parser.parse_args()))

    def is_small(self):
        """Switch to the small-file download path."""
        self.small = True

    def callback(self, item, flag):
        """
        Called when a task's downloads are finished.

        :param item: the raw task popped from redis
        :param flag: True if at least one download in the task succeeded
        """
        raise NotImplementedError()

    def decode(self, item):
        """
        Decode a raw redis item into download tuples.

        :param item: the raw task popped from redis
        :return: iterable of (url, filename, directory)
        """
        raise NotImplementedError()

    def processing(self, de, url_paths, item):
        """Download every (url, filename, path) of one task in this thread."""
        if self.small:
            downloader = "download_small_file"
        else:
            downloader = "start"
        flag = False
        try:
            t1 = time.time()
            length = len(url_paths)
            for index, (url, filename, path) in enumerate(url_paths):
                result = getattr(de, downloader)(url=url, filename=filename,
                                                 path=path)
                self.logger.info("download process %s/%s completed" % (
                    index + 1, length))
                # One successful download marks the whole task successful.
                flag = flag or result
            t2 = time.time()
            self.logger.info("download finished, success:%s, seconds:%.4f" % (
                flag, t2 - t1))
            # Return the engine to the pool before running the callback.
            self.de_queue.put(de)
            self.callback(item, flag)
            self.logger.info("callback finished, seconds:%.4f" % (
                time.time() - t2))
        except Exception:
            self.logger.error(traceback.format_exc())
        finally:
            try:
                self.threads.remove(current_thread())
            except ValueError:
                pass
            self.logger.info("the count of thread which alives is %s. " % len(
                self.threads))

    def start(self):
        """Main loop: pop tasks from redis and dispatch them to threads."""
        self.logger.debug("start process %s" % self.name)
        concurrent_download_count = self.settings.get(
            "CONCURRENT_DOWNLOAD_COUNT", 10)
        for i in range(concurrent_download_count):
            DE = DownloaderEngine(self.settings_file, signal_open=False)
            DE.set_logger(self.logger)
            self.de_queue.put(DE)
        self.logger.debug("setup %s des" % concurrent_download_count)
        while self.alive:
            try:
                item = self.redis_conn.lpop(self.settings.get("QUEUE_KEY"))
            except Exception:
                self.logger.error("redis error %s" % traceback.format_exc())
                item = None
            if not item:
                self.logger.debug("got no message...")
                time.sleep(1)
                continue
            self.logger.debug(
                "%s tasks to be continue..." % self.redis_conn.llen(
                    self.settings.get("QUEUE_KEY")))
            try:
                url_paths = self.decode(item)
            except Exception:
                self.logger.error(traceback.format_exc())
                url_paths = []
            while url_paths:
                try:
                    DE = self.de_queue.get_nowait()
                    th = Thread(target=self.processing,
                                args=(DE, url_paths, item))
                    self.set_force_interrupt(th)
                    self.logger.debug("start a new thread. ")
                    th.start()
                except Empty:
                    # No idle engine yet: wait and retry.
                    time.sleep(1)
                else:
                    break
        # Wait for in-flight downloads to finish before returning.
        # BUGFIX: `if filter(...)` is always truthy on Python 3 (filter
        # returns a lazy object even when it yields nothing), which made
        # this loop spin forever; use any() instead (equivalent on
        # Python 2 as well).
        while True:
            if any(x.is_alive() for x in self.threads):
                time.sleep(1)
            else:
                break
class MultiDownloadProcess(Logger, MultiThreadClosing):
    """Multi-threaded download worker fed by a redis list queue.

    Pops serialized download tasks from redis, hands each one to an idle
    DownloaderEngine from a pool, and reports completion via callback().
    Subclasses must implement decode() and callback().
    """

    name = "multidownload_process"

    def __init__(self, settings):
        self.settings_file = settings
        Logger.__init__(self, settings)
        self.set_logger()
        MultiThreadClosing.__init__(self)
        # Pool of idle DownloaderEngine instances shared by worker threads.
        self.de_queue = Queue()
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        # When True, use the small-file downloader instead of start().
        self.small = False

    @classmethod
    def parse_args(cls):
        """Build an instance from the -s/--settings command line option."""
        parser = ArgumentParser()
        parser.add_argument("-s", "--settings", dest="settings",
                            default="settings.py")
        return cls(**vars(parser.parse_args()))

    def is_small(self):
        """Switch to the small-file download path."""
        self.small = True

    def callback(self, item, flag):
        """
        Called when a task's downloads are finished.

        :param item: the raw task popped from redis
        :param flag: True if at least one download in the task succeeded
        """
        raise NotImplementedError()

    def decode(self, item):
        """
        Decode a raw redis item into download tuples.

        :param item: the raw task popped from redis
        :return: iterable of (url, filename, directory)
        """
        raise NotImplementedError()

    def processing(self, de, url_paths, item):
        """Download every (url, filename, path) of one task in this thread."""
        if self.small:
            downloader = "download_small_file"
        else:
            downloader = "start"
        flag = False
        try:
            t1 = time.time()
            length = len(url_paths)
            for index, (url, filename, path) in enumerate(url_paths):
                result = getattr(de, downloader)(url=url, filename=filename,
                                                 path=path)
                self.logger.info("download process %s/%s completed" % (
                    index + 1, length))
                # One successful download marks the whole task successful.
                flag = flag or result
            t2 = time.time()
            self.logger.info("download finished, success:%s, seconds:%.4f" % (
                flag, t2 - t1))
            # Return the engine to the pool before running the callback.
            self.de_queue.put(de)
            self.callback(item, flag)
            self.logger.info("callback finished, seconds:%.4f" % (
                time.time() - t2))
        except Exception:
            self.logger.error(traceback.format_exc())
        finally:
            try:
                self.threads.remove(current_thread())
            except ValueError:
                pass
            self.logger.info("the count of thread which alives is %s. " % len(
                self.threads))

    def start(self):
        """Main loop: pop tasks from redis and dispatch them to threads."""
        self.logger.debug("start process %s" % self.name)
        concurrent_download_count = self.settings.get(
            "CONCURRENT_DOWNLOAD_COUNT", 10)
        for i in range(concurrent_download_count):
            DE = DownloaderEngine(self.settings_file, signal_open=False)
            DE.set_logger(self.logger)
            self.de_queue.put(DE)
        self.logger.debug("setup %s des" % concurrent_download_count)
        while self.alive:
            try:
                item = self.redis_conn.lpop(self.settings.get("QUEUE_KEY"))
            except Exception:
                self.logger.error("redis error %s" % traceback.format_exc())
                item = None
            if not item:
                self.logger.debug("got no message...")
                time.sleep(1)
                continue
            self.logger.debug(
                "%s tasks to be continue..." % self.redis_conn.llen(
                    self.settings.get("QUEUE_KEY")))
            try:
                url_paths = self.decode(item)
            except Exception:
                self.logger.error(traceback.format_exc())
                url_paths = []
            while url_paths:
                try:
                    DE = self.de_queue.get_nowait()
                    th = Thread(target=self.processing,
                                args=(DE, url_paths, item))
                    self.set_force_interrupt(th)
                    self.logger.debug("start a new thread. ")
                    th.start()
                except Empty:
                    # No idle engine yet: wait and retry.
                    time.sleep(1)
                else:
                    break
        # Wait for in-flight downloads to finish before returning.
        # BUGFIX: `if filter(...)` is always truthy on Python 3 (filter
        # returns a lazy object even when it yields nothing), which made
        # this loop spin forever; use any() instead (equivalent on
        # Python 2 as well).
        while True:
            if any(x.is_alive() for x in self.threads):
                time.sleep(1)
            else:
                break
class Scheduler(object):
    """Scheduler storing pickled requests in a single redis sorted set."""

    spider = None

    def __init__(self, crawler):
        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.getint("REDIS_PORT"))
        self.queue_name = None  # bound in open()
        self.queues = {}

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        # Framework hook when the spider opens: bind the queue name and
        # hand the redis connection to the spider.
        self.spider = spider
        self.queue_name = self.settings.get("TASK_QUEUE_TEMPLATE",
                                            "%s:request:queue") % spider.name
        spider.set_redis(self.redis_conn)

    def enqueue_request(self, request):
        # Replace bound methods with their names so the request pickles.
        request.callback = getattr(request.callback, "__name__",
                                   request.callback)
        request.errback = getattr(request.errback, "__name__",
                                  request.errback)
        # Negative score: higher meta priority sorts first in the zset.
        # NOTE(review): zadd(name, value, score) is the legacy redis-py
        # positional signature -- confirm the installed client accepts it.
        self.redis_conn.zadd(self.queue_name, pickle.dumps(request),
                             -int(request.meta["priority"]))
        self.logger.debug("Crawlid: %s, url: %s added to queue. " % (
            request.meta['crawlid'], request.url))

    def next_request(self):
        self.logger.debug(
            "length of queue %s is %s" % (
                self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None
        if self.settings.getbool("CUSTOM_REDIS"):
            # The custom client provides an atomic zpop.
            item = self.redis_conn.zpop(self.queue_name)
        else:
            # redis-py: emulate zpop with a transactional range + remove.
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0, 0).zremrangebyrank(
                self.queue_name, 0, 0)
            result, _ = pipe.execute()
            if result:
                item = result[0]
        if item:
            request = pickle.loads(item)
            # Rebind callback/errback names to methods of the live spider.
            request.callback = request.callback and getattr(
                self.spider, request.callback)
            request.errback = request.errback and getattr(
                self.spider, request.errback)
            return request

    def close(self, reason):
        self.logger.info("Closing Spider: %s. " % self.spider.name)

    def has_pending_requests(self):
        # Always False: pending work lives in redis, not in this object.
        return False
class RedisFeed:
    """Command line tool feeding json crawl tasks into per-domain
    redis queues (Python 2 code: print statements, integer division)."""

    def __init__(self, crawlid, spiderid, url, urls_file, priority, port,
                 host, custom):
        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0  # progress threshold used by show_process_line
        self.extract = extract
        self.setup()

    @classmethod
    def parse_args(cls):
        """Parse command line options and build a RedisFeed from them."""
        parser = argparse.ArgumentParser()
        parser.add_argument('-rh', "--redis-host", dest="host", type=str,
                            default="127.0.0.1")
        parser.add_argument('-rp', "--redis-port", dest="port", type=int,
                            default=6379)
        parser.add_argument('-u', '--url', type=str)
        parser.add_argument('-uf', '--urls-file', type=str)
        parser.add_argument('-c', '--crawlid', required=True, type=str)
        parser.add_argument('-s', '--spiderid', required=True, type=str)
        parser.add_argument('-p', '--priority', type=int, default=100)
        parser.add_argument('--custom', action="store_true")
        return cls(**vars(parser.parse_args()))

    def setup(self):
        """Reset counters, connect to redis, clear previous task keys."""
        # NOTE: "sucess_rate" (sic) is the attribute name used elsewhere.
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(host=self.host, port=self.port)
        self.redis_conn.delete("crawlid:%s" % self.crawlid)
        self.redis_conn.delete("failed_pages:%s" % self.crawlid)
        self.redis_conn.delete("crawlid:%s:model" % self.crawlid)

    def start(self):
        """Feed urls from --urls-file (item crawl) or --url (category crawl)."""
        sucess_rate, failed_rate = 0, 0
        # item crawl: one task per line of the urls file
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    # strip UTF-8 BOM bytes ("\357\273\277") and line endings
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"), self.crawlid,
                        self.spiderid, self.priority)
                    self.failed_count += self.feed(self.get_name(url), json_req)
                    sucess_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
        # category crawl: whitespace separated urls from the command line
        else:
            url_list = self.url.split(" ")
            lines_count = len(url_list)
            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(), self.crawlid, self.spiderid, self.priority, )
                self.failed_count += self.feed(self.get_name(url), json_req)
                sucess_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print "\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" % (
            sucess_rate, failed_rate)

    def get_name(self, url):
        # Queue name: <spiderid>:<domain>.<suffix>:queue
        ex_res = self.extract(url)
        return "{sid}:{dom}.{suf}:queue".format(sid=self.spiderid,
                                                dom=ex_res.domain,
                                                suf=ex_res.suffix)

    def feed(self, queue_name, req):
        """Push one task into the queue; return 1 on redis error, else 0."""
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            # Negative score so higher priority sorts first.
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        """Draw a colored progress bar; return (success_rate, failed_rate)."""
        per = count / 100  # Python 2 integer division
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%% " % success_rate
        str_failed_rate = "%.2f%% " % failed_rate
        # Only redraw when progress passed the next threshold.
        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                # Windows console colors via the win32 API.
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print "\r", str_success_rate,
                color_ctl(std_out_handle, 32)
                print int(success_rate * 30 / 100) * ' ',
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print int(failed_rate * 30 / 100) * ' ',
                    color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print str_failed_rate,
                color_ctl(std_out_handle, 7)
            else:
                # ANSI escapes: green for success, red for failures.
                print "\r", str_success_rate,
                print "%s%s" % (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                                int(failed_rate * 50 / 100) * '\033[41m \033[0m'), str_failed_rate,
        return success_rate, failed_rate
class Scheduler(Logger):
    """Scheduler distributing json-serialized requests across per-domain
    redis sorted-set queues (Python 2 code: func_name, basestring)."""

    def __init__(self, crawler):
        self.settings = crawler.settings
        self.set_logger(crawler)
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        # Template/pattern: first %s is the spider name, '*' matches domains.
        self.queue_name = "%s:*:queue"
        self.queues = {}
        self.extract = tldextract.extract

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        # Framework hook: fill the spider name into the queue pattern.
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)
        spider.set_logger(self.logger)

    def request_to_dict(self, request):
        """Convert a Request into a json-serializable dict (callbacks by name)."""
        req_dict = {
            'url': request.url.decode('ascii'),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            # Python 2: functions expose func_name.
            'callback': None if request.callback is None else request.callback.func_name,
            'errback': None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def enqueue_request(self, request):
        req_dict = self.request_to_dict(request)
        ex_res = self.extract(req_dict['url'])
        # One queue per (spider, registered domain).
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'], dom=ex_res.domain,
            suf=ex_res.suffix)
        self.redis_conn.zadd(key, json.dumps(req_dict),
                             -int(req_dict["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue"
                          .format(id=req_dict['meta']['crawlid'],
                                  url=req_dict['url']))

    def next_request(self):
        queues = self.redis_conn.keys(self.queue_name)
        if queues:
            # Pick a random candidate queue for rough fairness across domains.
            queue = random.choice(queues)
            self.logger.info("length of queue %s is %s" % (
                queue, self.redis_conn.zcard(queue)))
            if self.settings.get("CUSTOM_REDIS"):
                item = self.redis_conn.zpop(queue)
            else:
                # redis-py: emulate zpop with a transactional range + remove.
                pipe = self.redis_conn.pipeline()
                pipe.multi()
                pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
                result, count = pipe.execute()
                item = result[0]
            if item:
                item = json.loads(item)
                try:
                    req = Request(item['url'])
                except ValueError:
                    # Bare host without scheme: retry with http://.
                    req = Request('http://' + item['url'])
                if 'callback' in item:
                    cb = item['callback']
                    if cb and self.spider:
                        cb = getattr(self.spider, cb)
                        req.callback = cb
                if 'errback' in item:
                    eb = item['errback']
                    if eb and self.spider:
                        eb = getattr(self.spider, eb)
                        req.errback = eb
                if 'meta' in item:
                    item = item['meta']
                # defaults not in schema
                if 'curdepth' not in item:
                    item['curdepth'] = 0
                if "retry_times" not in item:
                    item['retry_times'] = 0
                for key in item.keys():
                    req.meta[key] = item[key]
                # Optional per-request useragent / cookie overrides.
                if 'useragent' in item and item['useragent'] is not None:
                    req.headers['User-Agent'] = item['useragent']
                if 'cookie' in item and item['cookie'] is not None:
                    if isinstance(item['cookie'], dict):
                        req.cookies = item['cookie']
                    elif isinstance(item['cookie'], basestring):
                        req.cookies = parse_cookie(item['cookie'])
                return req

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        # Always False: pending work lives in redis, not in this object.
        return False
class Scheduler(object):
    """Single-queue scheduler with rate limiting (SPEED requests/minute)."""

    # Remember the item currently being processed; used by error handling.
    present_item = None
    spider = None

    def __init__(self, crawler):
        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:item:queue"  # %s filled with the spider name
        self.queues = {}
        # SPEED is requests per minute -> minimum seconds between dequeues.
        self.request_interval = 60 / self.settings.getint("SPEED", 60)
        self.last_acs_time = time.time()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)

    def request_to_dict(self, request):
        """Convert a Request into a picklable dict (method callbacks by name)."""
        headers = dict([(item[0].decode("ascii"), item[1])
                        for item in request.headers.items()])
        req_dict = {
            'url': request.url,
            'method': request.method,
            'headers': headers,
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'dont_filter': request.dont_filter,
            'callback': request.callback if not isinstance(
                request.callback, types.MethodType) else request.callback.__name__,
            'errback': request.errback if not isinstance(
                request.errback, types.MethodType) else request.errback.__name__,
        }
        return req_dict

    @enqueue_request_method_wrapper
    def enqueue_request(self, request):
        req_dict = self.request_to_dict(request)
        # Negative score: higher meta priority sorts first in the zset.
        self.redis_conn.zadd(self.queue_name, pickle.dumps(req_dict),
                             -int(req_dict["meta"]["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue".format(
            id=req_dict['meta']['crawlid'], url=req_dict['url']))

    @next_request_method_wrapper
    def next_request(self):
        self.logger.info(
            "length of queue %s is %s" % (
                self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None
        # Rate limit: yield nothing while inside the minimum interval.
        if time.time() - self.request_interval < self.last_acs_time:
            return item
        if self.settings.getbool("CUSTOM_REDIS"):
            # The custom client provides an atomic zpop.
            item = self.redis_conn.zpop(self.queue_name)
        else:
            # redis-py: emulate zpop with a transactional range + remove.
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0, 0).zremrangebyrank(
                self.queue_name, 0, 0)
            result, count = pipe.execute()
            if result:
                item = result[0]
        if item:
            self.last_acs_time = time.time()
            item = pickle.loads(item)
            self.present_item = item
            headers = item.get("headers", {})
            body = item.get("body")
            if item.get("method"):
                method = item.get("method")
            else:
                method = "GET"
            try:
                req = Request(item['url'], method=method, body=body,
                              headers=headers)
            except ValueError:
                # Bare host without scheme: retry with http://.
                req = Request('http://' + item['url'], method=method,
                              body=body, headers=headers)
            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = getattr(self.spider, cb)
                    req.callback = cb
            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = getattr(self.spider, eb)
                    req.errback = eb
            if 'meta' in item:
                item = item['meta']
            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0
            for key in item.keys():
                req.meta[key] = item[key]
            # Optional per-request useragent / cookie overrides.
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], (str, bytes)):
                    req.cookies = parse_cookie(item['cookie'])
            return req

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        # Always False: pending work lives in redis, not in this object.
        return False
class RedisFeed(object):
    """Command line tool feeding json crawl tasks into a spider's single
    item queue in redis."""

    def __init__(self, crawlid, spiderid, url, urls_file, priority, port,
                 host, custom):
        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0  # progress threshold used by show_process_line
        # NOTE: "sucess_rate" (sic) is the attribute name used elsewhere.
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    @classmethod
    def parse_args(cls):
        """Parse command line options and build a RedisFeed from them."""
        parser = argparse.ArgumentParser(description="usage: %prog [options]")
        parser.add_argument('-rh', "--redis-host", dest="host", type=str,
                            default="127.0.0.1", help="Redis host to feed in. ")
        parser.add_argument('-rp', "--redis-port", dest="port", type=int,
                            default=6379, help="Redis port to feed in. ")
        parser.add_argument('-u', '--url', type=str,
                            help="The url to crawl, a list of products. ")
        parser.add_argument('-uf', '--urls-file', type=str,
                            help="The urlsfile to crawl, single product. ")
        parser.add_argument('-c', '--crawlid', required=True, type=str,
                            help="An unique Id for a crawl task. ")
        parser.add_argument('-s', '--spiderid', required=True, type=str,
                            help="The website you wanna crawl. ")
        parser.add_argument('-p', '--priority', type=int, default=100,
                            help="Feed in the task queue with priority. ")
        parser.add_argument('--custom', action="store_true",
                            help="Use the custom redis whether or not. ")
        return cls(**vars(parser.parse_args()))

    def clean_previous_task(self, crawlid):
        """Delete every redis key left behind by a previous run of crawlid."""
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)
        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        """Feed urls from --urls-file (item crawl) or --url (category crawl)."""
        sucess_rate, failed_rate = 0, 0
        # item crawl: one task per line of the urls file
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    # strip UTF-8 BOM bytes ("\357\273\277") and line endings
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"), self.crawlid,
                        self.spiderid, self.priority)
                    self.failed_count += self.feed(self.get_name(), json_req)
                    sucess_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
                # Stats key expires after two days.
                self.redis_conn.expire("crawlid:%s" % self.crawlid,
                                       2 * 24 * 60 * 60)
        # category crawl: whitespace separated urls from the command line
        else:
            url_list = self.url.split(" ")
            lines_count = len(url_list)
            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(), self.crawlid, self.spiderid, self.priority, )
                self.failed_count += self.feed(self.get_name(), json_req)
                sucess_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" % (
            sucess_rate, failed_rate))

    def get_name(self):
        # All tasks go to the spider's single item queue.
        return "{sid}:item:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        """Push one task into the queue; return 1 on redis error, else 0."""
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            # Negative score so higher priority sorts first.
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        """Draw a colored progress bar; return (success_rate, failed_rate)."""
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%% " % success_rate
        str_failed_rate = "%.2f%% " % failed_rate
        # Only redraw when progress passed the next threshold.
        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                # Windows console colors via the win32 API.
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                    color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                # ANSI escapes: green for success, red for failures.
                print("\r", str_success_rate, "")
                print(
                    "%s%s" % (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                              int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate)
        return success_rate, failed_rate
class SpiderFeeder(object):
    """Feed pickled Request tasks into a spider's request queue in redis.

    BUGFIX: start() previously assigned the category-crawl progress to a
    misspelled local ("sucess_rate"), so the final summary always reported
    a success rate of 0 for category crawls.
    """

    def __init__(self, crawlid, spiderid, url, urls_file, priority, port,
                 host, custom):
        """Store feed parameters, connect to redis, clean the previous task."""
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0  # progress threshold used by show_process_line
        # NOTE: "sucess_rate" (sic) attribute name kept for compatibility.
        self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    def clean_previous_task(self, crawlid):
        """Delete every redis key left behind by a previous run of crawlid."""
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)
        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        """Feed urls from --urls-file (item crawl) or --url (category crawl)."""
        success_rate, failed_rate = 0, 0
        # item crawl: one pickled Request per line of the urls file
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    # strip UTF-8 BOM bytes ("\357\273\277") and line endings
                    req = Request(url=url.strip("\357\273\277\r\n"),
                                  callback="parse_item",
                                  meta={"crawlid": self.crawlid,
                                        "spiderid": self.spiderid,
                                        "priority": self.priority})
                    self.failed_count += self.feed(self.get_name(),
                                                   pickle.dumps(req))
                    success_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
        # category crawl: whitespace separated urls from the command line
        else:
            url_list = self.url.split(" ")
            lines_count = len(url_list)
            for index, url in enumerate(url_list):
                req = Request(url=url.strip(), callback="parse",
                              meta={"crawlid": self.crawlid,
                                    "spiderid": self.spiderid,
                                    "priority": self.priority})
                self.failed_count += self.feed(self.get_name(),
                                               pickle.dumps(req))
                # BUGFIX: was assigned to a misspelled local "sucess_rate",
                # leaving the summary below stuck at 0 for this branch.
                success_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" % (
            success_rate, failed_rate))

    def get_name(self):
        """Return the spider's request queue name."""
        return "{sid}:request:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        """Push one task into the queue; return 1 on redis error, else 0."""
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            # Negative score so higher priority sorts first.
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        """Draw a colored progress bar; return (success_rate, failed_rate)."""
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%% " % success_rate
        str_failed_rate = "%.2f%% " % failed_rate
        # Only redraw when progress passed the next threshold.
        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                # Windows console colors via the win32 API.
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                    color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                # ANSI escapes: green for success, red for failures.
                print("\r", str_success_rate, "")
                print(
                    "%s%s" % (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                              int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate)
        return success_rate, failed_rate