import argparse
import sys
import traceback


class RedisFeed(object):
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host, custom):
        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.failed_count, self.failed_rate, self.success_rate = 0, 0, 0
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    @classmethod
    def parse_args(cls):
        parser = argparse.ArgumentParser(description="usage: %prog [options]")
        parser.add_argument('-rh', "--redis-host", dest="host", type=str,
                            default="127.0.0.1", help="Redis host to feed into.")
        parser.add_argument('-rp', "--redis-port", dest="port", type=int,
                            default=6379, help="Redis port to feed into.")
        parser.add_argument('-u', '--url', type=str,
                            help="Url(s) to crawl: listing pages of products.")
        parser.add_argument('-uf', '--urls-file', type=str,
                            help="File of urls to crawl, one product page per line.")
        parser.add_argument('-c', '--crawlid', required=True, type=str,
                            help="A unique id for the crawl task.")
        parser.add_argument('-s', '--spiderid', required=True, type=str,
                            help="The name of the spider (website) to crawl.")
        parser.add_argument('-p', '--priority', type=int, default=100,
                            help="Priority to use when feeding the task queue.")
        parser.add_argument('--custom', action="store_true",
                            help="Use the custom redis client instead of redis-py.")
        return cls(**vars(parser.parse_args()))

    def clean_previous_task(self, crawlid):
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)
        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        success_rate, failed_rate = 0, 0
        # Item crawl: every line of the urls file is a single product page.
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"),
                        self.crawlid,
                        self.spiderid,
                        self.priority)
                    self.failed_count += self.feed(self.get_name(), json_req)
                    success_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
            self.redis_conn.hset("crawlid:%s" % self.crawlid, "total_pages", lines_count)
            self.redis_conn.expire("crawlid:%s" % self.crawlid, 2 * 24 * 60 * 60)
        # Category crawl: the urls passed on the command line are listing pages.
        else:
            url_list = self.url.split(" ")
            lines_count = len(url_list)
            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(),
                    self.crawlid,
                    self.spiderid,
                    self.priority,
                )
                self.failed_count += self.feed(self.get_name(), json_req)
                success_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. success_rate:%s%%, failed_rate:%s%%" % (success_rate, failed_rate))

    def get_name(self):
        return "{sid}:item:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%% " % success_rate
        str_failed_rate = "%.2f%% " % failed_rate
        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                    color_ctl(std_out_handle, 0)
                    color_ctl(std_out_handle, 4)
                    print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, "")
                print("%s%s" % (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                                int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                      str_failed_rate)
        return success_rate, failed_rate
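feed() pushes each serialized request into a Redis sorted set and negates the priority so that higher-priority tasks get lower scores and come out of the queue first. The positional zadd(queue_name, member, score) call relies on the client accepting member-then-score arguments (the bundled custom_redis client, or an old redis-py); redis-py 3.0 and later only accept a mapping of members to scores, so against a current redis-py the same push would look like the sketch below (feed_v3 is an illustrative name, not part of the original code):

from redis import Redis, RedisError

redis_conn = Redis(host="127.0.0.1", port=6379)

def feed_v3(queue_name, req, priority):
    # redis-py >= 3.0: zadd takes a {member: score} mapping instead of positional args.
    try:
        redis_conn.zadd(queue_name, {req: -priority})
        return 0
    except RedisError:
        return 1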
class Scheduler(object):
    spider = None

    def __init__(self, crawler):
        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.getint("REDIS_PORT"))
        self.queue_name = None
        self.queues = {}

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        self.spider = spider
        self.queue_name = self.settings.get("TASK_QUEUE_TEMPLATE", "%s:request:queue") % spider.name
        spider.set_redis(self.redis_conn)

    def enqueue_request(self, request):
        request.callback = getattr(request.callback, "__name__", request.callback)
        request.errback = getattr(request.errback, "__name__", request.errback)
        self.redis_conn.zadd(self.queue_name, pickle.dumps(request), -int(request.meta["priority"]))
        self.logger.debug("Crawlid: %s, url: %s added to queue. " % (
            request.meta['crawlid'], request.url))

    def next_request(self):
        self.logger.debug("length of queue %s is %s" % (
            self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None
        if self.settings.getbool("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(self.queue_name)
        else:
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0, 0).zremrangebyrank(self.queue_name, 0, 0)
            result, _ = pipe.execute()
            if result:
                item = result[0]
        if item:
            request = pickle.loads(item)
            request.callback = request.callback and getattr(self.spider, request.callback)
            request.errback = request.errback and getattr(self.spider, request.errback)
            return request

    def close(self, reason):
        self.logger.info("Closing Spider: %s. " % self.spider.name)

    def has_pending_requests(self):
        return False
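For this scheduler to be picked up, the crawler's settings have to point Scrapy at it and supply the Redis connection details it reads in __init__ and open(). A minimal sketch of those settings, assuming the class lives in a hypothetical module named crawler.scheduler:

# settings.py (sketch; the module path crawler.scheduler is an assumption)
SCHEDULER = "crawler.scheduler.Scheduler"

REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379

# True switches the scheduler to the bundled custom_redis client.
CUSTOM_REDIS = False

# Per-spider request queue; "%s" is filled with the spider name in open().
TASK_QUEUE_TEMPLATE = "%s:request:queue"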
import argparse
import sys
import traceback

from tldextract import extract  # assumed import; the excerpt only shows `self.extract = extract`


class RedisFeed:
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host, custom):
        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.extract = extract
        self.setup()

    @classmethod
    def parse_args(cls):
        parser = argparse.ArgumentParser()
        parser.add_argument('-rh', "--redis-host", dest="host", type=str, default="127.0.0.1")
        parser.add_argument('-rp', "--redis-port", dest="port", type=int, default=6379)
        parser.add_argument('-u', '--url', type=str)
        parser.add_argument('-uf', '--urls-file', type=str)
        parser.add_argument('-c', '--crawlid', required=True, type=str)
        parser.add_argument('-s', '--spiderid', required=True, type=str)
        parser.add_argument('-p', '--priority', type=int, default=100)
        parser.add_argument('--custom', action="store_true")
        return cls(**vars(parser.parse_args()))

    def setup(self):
        self.failed_count, self.failed_rate, self.success_rate = 0, 0, 0
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(host=self.host, port=self.port)
        self.redis_conn.delete("crawlid:%s" % self.crawlid)
        self.redis_conn.delete("failed_pages:%s" % self.crawlid)
        self.redis_conn.delete("crawlid:%s:model" % self.crawlid)

    def start(self):
        success_rate, failed_rate = 0, 0
        # Item crawl: every line of the urls file is a single product page.
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"),
                        self.crawlid,
                        self.spiderid,
                        self.priority)
                    self.failed_count += self.feed(self.get_name(url), json_req)
                    success_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
            self.redis_conn.hset("crawlid:%s" % self.crawlid, "total_pages", lines_count)
        # Category crawl: the urls passed on the command line are listing pages.
        else:
            url_list = self.url.split(" ")
            lines_count = len(url_list)
            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(),
                    self.crawlid,
                    self.spiderid,
                    self.priority,
                )
                self.failed_count += self.feed(self.get_name(url), json_req)
                success_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print "\ntask feed complete. success_rate:%s%%, failed_rate:%s%%" % (success_rate, failed_rate)

    def get_name(self, url):
        ex_res = self.extract(url)
        return "{sid}:{dom}.{suf}:queue".format(sid=self.spiderid, dom=ex_res.domain, suf=ex_res.suffix)

    def feed(self, queue_name, req):
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%% " % success_rate
        str_failed_rate = "%.2f%% " % failed_rate
        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print "\r", str_success_rate,
                color_ctl(std_out_handle, 32)
                print int(success_rate * 30 / 100) * ' ',
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print int(failed_rate * 30 / 100) * ' ',
                    color_ctl(std_out_handle, 0)
                    color_ctl(std_out_handle, 4)
                    print str_failed_rate,
                color_ctl(std_out_handle, 7)
            else:
                print "\r", str_success_rate,
                print "%s%s" % (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                                int(failed_rate * 50 / 100) * '\033[41m \033[0m'), str_failed_rate,
        return success_rate, failed_rate
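The feeder excerpts do not show an entry point. Since parse_args() builds the object straight from the command line, the driver can be as small as the sketch below (the script name and argument values in the comment are illustrative, not part of the original code):

if __name__ == "__main__":
    # e.g. python feed.py -c crawl_001 -s douban -uf urls.txt
    feeder = RedisFeed.parse_args()
    feeder.start()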
class Scheduler(Logger):
    def __init__(self, crawler):
        self.settings = crawler.settings
        self.set_logger(crawler)
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:*:queue"
        self.queues = {}
        self.extract = tldextract.extract

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)
        spider.set_logger(self.logger)

    def request_to_dict(self, request):
        req_dict = {
            'url': request.url.decode('ascii'),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            'callback': None if request.callback is None else request.callback.func_name,
            'errback': None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def enqueue_request(self, request):
        req_dict = self.request_to_dict(request)
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)
        self.redis_conn.zadd(key, json.dumps(req_dict), -int(req_dict["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue".format(
            id=req_dict['meta']['crawlid'], url=req_dict['url']))

    def next_request(self):
        queues = self.redis_conn.keys(self.queue_name)
        if queues:
            queue = random.choice(queues)
            self.logger.info("length of queue %s is %s" % (
                queue, self.redis_conn.zcard(queue)))
            if self.settings.get("CUSTOM_REDIS"):
                item = self.redis_conn.zpop(queue)
            else:
                pipe = self.redis_conn.pipeline()
                pipe.multi()
                pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
                result, count = pipe.execute()
                item = result[0]
            if item:
                item = json.loads(item)
                try:
                    req = Request(item['url'])
                except ValueError:
                    req = Request('http://' + item['url'])
                if 'callback' in item:
                    cb = item['callback']
                    if cb and self.spider:
                        cb = getattr(self.spider, cb)
                        req.callback = cb
                if 'errback' in item:
                    eb = item['errback']
                    if eb and self.spider:
                        eb = getattr(self.spider, eb)
                        req.errback = eb
                if 'meta' in item:
                    item = item['meta']
                # defaults not in schema
                if 'curdepth' not in item:
                    item['curdepth'] = 0
                if "retry_times" not in item:
                    item['retry_times'] = 0
                for key in item.keys():
                    req.meta[key] = item[key]
                if 'useragent' in item and item['useragent'] is not None:
                    req.headers['User-Agent'] = item['useragent']
                if 'cookie' in item and item['cookie'] is not None:
                    if isinstance(item['cookie'], dict):
                        req.cookies = item['cookie']
                    elif isinstance(item['cookie'], basestring):
                        req.cookies = parse_cookie(item['cookie'])
                return req

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        return False
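This version shards requests into one sorted set per registered domain and lets next_request() pick among the matching queues at random, so a single slow site does not monopolize the crawl. The queue key is derived with tldextract; a small sketch of that derivation (the spider name and url are illustrative only):

import tldextract

ex_res = tldextract.extract("http://book.douban.com/subject/1084336/")
# ex_res.subdomain == 'book', ex_res.domain == 'douban', ex_res.suffix == 'com'
key = "{sid}:{dom}.{suf}:queue".format(sid="douban", dom=ex_res.domain, suf=ex_res.suffix)
# key == 'douban:douban.com:queue', which matches the "%s:*:queue" pattern
# (here 'douban:*:queue') that next_request() scans with KEYS.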
class Scheduler(object):
    # Record the item currently being processed; used when handling exceptions.
    present_item = None
    spider = None

    def __init__(self, crawler):
        self.settings = crawler.settings
        self.logger = Logger.from_crawler(crawler)
        if self.settings.getbool("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:item:queue"
        self.queues = {}
        self.request_interval = 60 / self.settings.getint("SPEED", 60)
        self.last_acs_time = time.time()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):
        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)

    def request_to_dict(self, request):
        headers = dict([(item[0].decode("ascii"), item[1]) for item in request.headers.items()])
        req_dict = {
            'url': request.url,
            'method': request.method,
            'headers': headers,
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'dont_filter': request.dont_filter,
            'callback': request.callback if not isinstance(request.callback, types.MethodType) else request.callback.__name__,
            'errback': request.errback if not isinstance(request.errback, types.MethodType) else request.errback.__name__,
        }
        return req_dict

    @enqueue_request_method_wrapper
    def enqueue_request(self, request):
        req_dict = self.request_to_dict(request)
        self.redis_conn.zadd(self.queue_name, pickle.dumps(req_dict), -int(req_dict["meta"]["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue".format(
            id=req_dict['meta']['crawlid'], url=req_dict['url']))

    @next_request_method_wrapper
    def next_request(self):
        self.logger.info("length of queue %s is %s" % (
            self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None
        if time.time() - self.request_interval < self.last_acs_time:
            return item
        if self.settings.getbool("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(self.queue_name)
        else:
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0, 0).zremrangebyrank(self.queue_name, 0, 0)
            result, count = pipe.execute()
            if result:
                item = result[0]
        if item:
            self.last_acs_time = time.time()
            item = pickle.loads(item)
            self.present_item = item
            headers = item.get("headers", {})
            body = item.get("body")
            if item.get("method"):
                method = item.get("method")
            else:
                method = "GET"
            try:
                req = Request(item['url'], method=method, body=body, headers=headers)
            except ValueError:
                req = Request('http://' + item['url'], method=method, body=body, headers=headers)
            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = getattr(self.spider, cb)
                    req.callback = cb
            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = getattr(self.spider, eb)
                    req.errback = eb
            if 'meta' in item:
                item = item['meta']
            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0
            for key in item.keys():
                req.meta[key] = item[key]
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], (str, bytes)):
                    req.cookies = parse_cookie(item['cookie'])
            return req

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        return False
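With the standard redis-py client, next_request() pops the head of the sorted set atomically: inside one MULTI/EXEC transaction, ZRANGE reads the member with the lowest score (the highest-priority request, since scores are negated priorities) and ZREMRANGEBYRANK removes it, so two workers cannot receive the same request. A self-contained sketch of that pop against a local Redis (the key and members are illustrative):

from redis import Redis

redis_conn = Redis("127.0.0.1", 6379)
queue = "demo:item:queue"                            # illustrative key
redis_conn.zadd(queue, {"low": -10, "high": -100})   # redis-py >= 3.0 mapping form

pipe = redis_conn.pipeline()
pipe.multi()
pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
result, count = pipe.execute()
item = result[0] if result else None
print(item)   # b'high': the member with the lowest score (highest priority) is popped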
import pickle
import sys
import traceback

from scrapy import Request  # assumed import; the excerpt does not show where Request comes from


class SpiderFeeder(object):
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host, custom):
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.failed_count, self.failed_rate, self.success_rate = 0, 0, 0
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    def clean_previous_task(self, crawlid):
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)
        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        success_rate, failed_rate = 0, 0
        # Item crawl: every line of the urls file is a single product page.
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    req = Request(url=url.strip("\357\273\277\r\n"),
                                  callback="parse_item",
                                  meta={
                                      "crawlid": self.crawlid,
                                      "spiderid": self.spiderid,
                                      "priority": self.priority
                                  })
                    self.failed_count += self.feed(self.get_name(), pickle.dumps(req))
                    success_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
            self.redis_conn.hset("crawlid:%s" % self.crawlid, "total_pages", lines_count)
        # Category crawl: the urls passed on the command line are listing pages.
        else:
            url_list = self.url.split(" ")
            lines_count = len(url_list)
            for index, url in enumerate(url_list):
                req = Request(url=url.strip(),
                              callback="parse",
                              meta={
                                  "crawlid": self.crawlid,
                                  "spiderid": self.spiderid,
                                  "priority": self.priority
                              })
                self.failed_count += self.feed(self.get_name(), pickle.dumps(req))
                success_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. success_rate:%s%%, failed_rate:%s%%" % (success_rate, failed_rate))

    def get_name(self):
        return "{sid}:request:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%% " % success_rate
        str_failed_rate = "%.2f%% " % failed_rate
        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                    color_ctl(std_out_handle, 0)
                    color_ctl(std_out_handle, 4)
                    print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, "")
                print("%s%s" % (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                                int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                      str_failed_rate)
        return success_rate, failed_rate