class BaseHandleClient:
    """To connect the WebUI-Server with Scheduler & Parser."""
    logger = Log('HandlerClient')
    scheduler_handler_name = {
        'pause', 'start', 'run', 'stop', 'update_spider', 'delete_spider',
        'list_spiders', 'list_speed', 'set_speed', 'clean_request_queue',
        'clean_dupe_filter'
    }
    parser_handler_name = {
        'list_count', 'pause', 'start', 'run', 'stop', 'update_spider',
        'delete_spider'
    }
    scheduler_parser_handler_name = scheduler_handler_name & parser_handler_name

    def __init__(self):
        self.scheduler_clients = {
            k: XMLRPCClient("http://localhost:{}".format(v))
            for k, v in catty.config.PORT['SCHEDULER'].items()
        }
        self.parser_clients = {
            k: XMLRPCClient("http://localhost:{}".format(v))
            for k, v in catty.config.PORT['PARSER'].items()
        }

    def action(self, action_type: str, **kwargs) -> list:
        """Handle actions such as 'run', 'start', 'set_speed'..."""
        context = {'type': action_type}
        context.update(kwargs)
        results = []
        if action_type in self.scheduler_parser_handler_name:
            for each_parser_client in self.parser_clients.values():
                results.append(
                    getattr(each_parser_client, action_type)(context))
            for each_scheduler_client in self.scheduler_clients.values():
                results.append(
                    getattr(each_scheduler_client, action_type)(context))
        elif action_type in self.scheduler_handler_name:
            for each_scheduler_client in self.scheduler_clients.values():
                results.append(
                    getattr(each_scheduler_client, action_type)(context))
        elif action_type in self.parser_handler_name:
            for each_parser_client in self.parser_clients.values():
                results.append(
                    getattr(each_parser_client, action_type)(context))
        else:
            results = [{
                'code': STATUS_CODE.USER_ERROR,
                'msg': 'Not a valid action type.'
            }]
        return results
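
# Usage sketch (illustrative, not part of the module): each handler endpoint
# is a plain XML-RPC server, so the stdlib client can talk to it directly.
# The port below is made up for the example; the real one comes from
# catty.config.PORT, and a Scheduler/Parser must be serving via xmlrpc_run().
from xmlrpc.client import ServerProxy

def pause_spider_example(port: int = 38010, spider_name: str = 'demo'):
    """Send the same context dict that BaseHandleClient.action() builds."""
    proxy = ServerProxy("http://localhost:{}".format(port))
    # Equivalent to BaseHandleClient().action('pause', spider_name='demo')
    return proxy.pause({'type': 'pause', 'spider_name': spider_name})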
class SpiderModuleHandle(ModuleHandle):
    def __init__(self, path):
        super().__init__(path)
        # spider_name -> spider instance
        self.spider_instantiation = {}
        self.logger = Log('SpiderModuleHandle')

    def _instance_spider(self, name):
        try:
            spec, module = self.namespace[name]
            spider_cls = getattr(module, 'Spider')
            self.spider_instantiation.update({spider_cls.name: spider_cls()})
            self.logger.log_it(
                "[load_spider]Load spider name:{}".format(spider_cls.name),
                'INFO')
        except Exception as e:
            self.logger.log_it("[load_spider]ErrInfo:{}".format(e), 'WARN')

    def _instance_all_spider(self):
        for file_name in self.namespace.keys():
            self._instance_spider(file_name)

    def load_all_spider(self):
        self.load_all_module()
        self._instance_all_spider()

    def load_new_spider(self):
        self.load_new_module()
        self._instance_all_spider()

    def update_spider(self, spider_file_name):
        if not spider_file_name.endswith('.py'):
            spider_file_name += '.py'
        self.update_module(spider_file_name)
        self._instance_spider(spider_file_name)

    def delete_spider(self, spider_file_name):
        # TODO spider_file_name == Spider.name
        self.spider_instantiation.pop(spider_file_name)
        self.delete_module(spider_file_name)
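
# For reference, the minimal shape of a module under catty.config.SPIDER_PATH
# that _instance_spider() can pick up: a top-level class literally named
# `Spider` exposing a `name` attribute. Hedged sketch; real catty spiders
# also define fetcher/parser methods and tuning fields such as `speed`.
#
#     # spiders/demo_spider.py
#     class Spider:
#         name = 'demo_spider'   # becomes the key in spider_instantiation
#         speed = 1              # read via get_default() by the Scheduler
#
#         def start(self):
#             return []          # a real spider returns Task objects here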
class HandlerMixin:
    """A Handler to mix into Scheduler and Parser."""

    def __init__(self):
        self._logger = Log('HandlerMixin')
        self.scheduler_handler = {
            'pause': self.handle_pause_spider,
            'start': self.handle_start_spider,
            'run': self.handle_run_spider,
            'stop': self.handle_stop_spider,
            'update_spider': self.handle_update_spider,
            'delete_spider': self.handle_delete_spider,
            'list_spiders': self.handle_list_spiders,
            'list_speed': self.handle_list_speed,
            'set_speed': self.handle_set_speed,
            'clean_request_queue': self.handle_clean_request_queue,
            'clean_dupe_filter': self.handle_clean_dupe_filter
        }
        self.parser_handler = {
            'list_count': self.handle_count,
            'pause': self.handle_pause_spider,
            'start': self.handle_start_spider,
            'run': self.handle_run_spider,
            'stop': self.handle_stop_spider,
            'update_spider': self.handle_update_spider,
            'delete_spider': self.handle_delete_spider
        }
        self.all_handler = {}
        self.all_handler.update(self.scheduler_handler)
        self.all_handler.update(self.parser_handler)

    def handle_pause_spider(self: "Scheduler", msg) -> tuple:
        """Pause the spider.

        Scheduler & Parser:
            Started -> -spider_started & +spider_paused
            Paused  -> PASS
            Stopped -> PASS
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_started:
            set_safe_remove(self.spider_started, spider_name)
            self.spider_paused.add(spider_name)
            # dump the request queue
            if 'scheduler' in self.name:
                self.loop.create_task(self.dump_tasks(spider_name))
            elif 'parser' in self.name:
                self.loop.create_task(self.dump_tasks(DOWNLOADER_PARSER))
                self.loop.create_task(self.dump_tasks(PARSER_SCHEDULER))
        self._logger.log_it(
            "[pause_spider]Success pause spider:{}".format(spider_name))
        return STATUS_CODE.OK, {}

    def handle_stop_spider(self: "Scheduler", msg) -> tuple:
        """Stop the spider.

        Scheduler & Parser:
            Started -> -spider_started & +spider_stopped
            Paused  -> -spider_paused & +spider_stopped
            Stopped -> PASS
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_started:
            set_safe_remove(self.spider_started, spider_name)
            self.spider_stopped.add(spider_name)
        elif spider_name in self.spider_paused:
            set_safe_remove(self.spider_paused, spider_name)
            self.spider_stopped.add(spider_name)
        self._logger.log_it(
            "[stop_spider]Success stop spider:{}".format(spider_name))
        return STATUS_CODE.OK, {}

    def handle_run_spider(self: "Scheduler", msg):
        """Continue a paused spider, or start from the beginning if it was stopped.

        Scheduler:
            Started -> PASS
            Paused  -> +spider_started
            Stopped -> +spider_ready_start
            Todo    -> +spider_started
        Parser:
            Started -> PASS
            Paused  -> +spider_started
            Stopped -> +spider_started
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_paused:
            set_safe_remove(self.spider_paused, spider_name)
            self.spider_started.add(spider_name)
        elif spider_name in self.spider_stopped:
            set_safe_remove(self.spider_stopped, spider_name)
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
            else:
                self.spider_started.add(spider_name)
        elif 'scheduler' in self.name and spider_name in self.spider_todo:
            set_safe_remove(self.spider_todo, spider_name)
            self.spider_started.add(spider_name)
        elif 'parser' in self.name:
            self.spider_started.add(spider_name)

        # load the persisted tasks
        if 'scheduler' in self.name:
            self.loop.create_task(self.load_tasks(spider_name))
        else:
            self.loop.create_task(
                self.load_tasks(spider_name, PARSER_SCHEDULER))
            self.loop.create_task(
                self.load_tasks(spider_name, DOWNLOADER_PARSER))
        self._logger.log_it(
            "[run_spider]Success spider:{}".format(spider_name))
        return STATUS_CODE.OK, {}

    def handle_start_spider(self: "Scheduler", msg):
        """Start from the beginning if the spider is todo; restart and
        continue if it was paused or stopped.

        Scheduler:
            Started -> PASS
            Paused  -> +spider_ready_start
            Stopped -> +spider_ready_start
            Todo    -> +spider_ready_start
        Parser:
            Started -> PASS
            Paused  -> +spider_started
            Stopped -> +spider_started
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if 'scheduler' in self.name and spider_name in self.spider_todo:
            set_safe_remove(self.spider_todo, spider_name)
            self.spider_ready_start.add(spider_name)
        elif spider_name in self.spider_paused:
            set_safe_remove(self.spider_paused, spider_name)
            self.spider_started.add(spider_name)
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
        elif spider_name in self.spider_stopped:
            set_safe_remove(self.spider_stopped, spider_name)
            self.spider_started.add(spider_name)
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
        elif spider_name in self.spider_started:
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
        elif 'parser' in self.name:
            self.spider_started.add(spider_name)
        self._logger.log_it(
            "[start_spider]Success spider:{}".format(spider_name))
        return STATUS_CODE.OK, {}

    # ------------------------SCHEDULER_ONLY----------------------------------

    def handle_list_spiders(self: "Scheduler", msg) -> tuple:
        return STATUS_CODE.OK, {
            'Started': list(get_default(self, 'spider_started') or []),
            'Todo': list(get_default(self, 'spider_todo') or []),
            'Paused': list(get_default(self, 'spider_paused') or []),
            'Stopped': list(get_default(self, 'spider_stopped') or []),
            'Ready start': list(get_default(self, 'spider_ready_start') or [])
        }

    def handle_list_speed(self: "Scheduler", msg) -> tuple:
        return STATUS_CODE.OK, self.selector.spider_speed

    def handle_set_speed(self: "Scheduler", msg) -> tuple:
        spider_name = msg.get('spider_name')
        speed = msg['spider_speed']
        if 'scheduler' in self.name:
            self.selector.update_speed(spider_name, int(speed))
        return STATUS_CODE.OK, {}

    def handle_clean_request_queue(self: "Scheduler", msg) -> tuple:
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if self.name == 'master_scheduler':
            self.loop.create_task(self.clean_requests_queue(spider_name))
            return STATUS_CODE.OK, {}
        else:
            return STATUS_CODE.NOT_MY_BUSINESS, {}

    def handle_clean_dupe_filter(self: "Scheduler", msg) -> tuple:
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if self.name == 'master_parser':
            self.loop.create_task(self.clean_dupefilter(spider_name))
            return STATUS_CODE.OK, {}
        else:
            return STATUS_CODE.NOT_MY_BUSINESS, {}

    # -------------------------------------------------------------------------

    def handle_update_spider(self: "Scheduler", msg) -> tuple:
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        self.spider_module_handle.update_spider(spider_name)
        # only .py files can be uploaded
        if '.py' in spider_name:
            self.spider_todo.add(
                getattr(self.spider_module_handle.namespace[spider_name][1],
                        'Spider').name)
            self.selector.update_speed(
                getattr(self.spider_module_handle.namespace[spider_name][1],
                        'Spider').name, 1)
        return STATUS_CODE.OK, {}

    def handle_delete_spider(self: "Scheduler", msg) -> tuple:
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_started or (
                'scheduler' in self.name and
                spider_name in self.spider_ready_start):
            return STATUS_CODE.USER_ERROR, {
                'msg': "Please stop the spider first."
            }

        if 'scheduler' in self.name:
            # these handlers take the whole msg dict
            self.handle_clean_request_queue(msg)
            self.handle_clean_dupe_filter(msg)
        self.spider_module_handle.delete_spider(spider_name)
        for spider_set in self.all_spider_set:
            if spider_name in spider_set:
                spider_set.remove(spider_name)
        self.loop.create_task(self.clean_requests_queue(spider_name))
        self.loop.create_task(self.clean_dupefilter(spider_name))
        return STATUS_CODE.OK, {}

    # -------------------------PARSER_ONLY-------------------------------------

    def handle_count(self: "Parser", msg) -> tuple:
        return STATUS_CODE.OK, self.counter.count_all()

    # --------------------------------------------------------------------------

    def xmlrpc_run(self: "Scheduler", name):
        if 'scheduler' in self.name:
            application = ThreadXMLRPCServer(
                ('localhost', catty.config.PORT['SCHEDULER'][name]))
            for k, v in self.scheduler_handler.items():
                application.register_function(v, k)
        else:
            application = ThreadXMLRPCServer(
                ('localhost', catty.config.PORT['PARSER'][name]))
            for k, v in self.parser_handler.items():
                application.register_function(v, k)
        application.serve_forever()
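
# The four lifecycle handlers above are just moves between the spider state
# sets. A stdlib-only sketch of the 'pause' transition they implement
# (Started -> Paused; the Paused/Stopped "PASS" cases fall through):
def _pause_transition(name, spider_started: set, spider_paused: set):
    if name in spider_started:
        spider_started.discard(name)
        spider_paused.add(name)

_started, _paused = {'demo'}, set()
_pause_transition('demo', _started, _paused)
assert _paused == {'demo'} and not _started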
class DownLoader:
    def __init__(self,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 downloader_parser_queue: AsyncRedisPriorityQueue,
                 loop: BaseEventLoop,
                 conn_limit: int):
        """
        :param scheduler_downloader_queue: The redis queue
        :param downloader_parser_queue: The redis queue
        :param loop: EventLoop
        :param conn_limit: Limit on the total number of simultaneous connections.
        # :param limit_per_host: The limit for simultaneous connections to the
        #                        same endpoint (host, port, is_ssl).
        """
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.downloader_parser_queue = downloader_parser_queue
        self.loop = loop
        self.conn_limit = conn_limit
        # used to enforce conn_limit
        self.count = 0
        self.logger = Log('Downloader')

    async def _request(self, aio_request: Request,
                       loop: BaseEventLoop) -> Response:
        """The real request. Returns a Response obj with status 99999 on failure."""
        t_ = time.time()
        self.logger.log_it("Downloading url:{} data:{}".format(
            aio_request.url, aio_request.data))
        try:
            async with aiohttp.ClientSession(loop=loop) as session:
                if aio_request.method not in ('GET', 'POST', 'PUT', 'DELETE',
                                              'HEAD', 'OPTIONS', 'PATCH'):
                    self.logger.log_it(
                        "Not a valid method. Request:{}".format(aio_request),
                        level='INFO')
                    return Response(
                        status=-1,
                        body="Not a valid method. Request:{}".format(
                            aio_request))
                # every supported verb maps to the aiohttp session method
                # of the same lowercase name
                http_method = getattr(session, aio_request.method.lower())
                async with http_method(**aio_request.dump_request()) as client:
                    body = await client.read()

            response = Response(
                # TODO text should accept an encoding param to decode the body
                # text=await client.text(),
                method=client.method,
                status=client.status,
                cookies=client.cookies,
                headers=client.raw_headers,
                charset=client.charset,
                content_type=client.content_type,
                # history=client.history,
                body=body,
                use_time=time.time() - t_,
                url=client.url,
            )
        except Exception as e:
            self.logger.log_it(
                "Fail to download url:{} data:{} ErrInfo:{}".format(
                    aio_request.url, aio_request.data,
                    traceback.format_exc()))
            response = Response(status=99999, body=str(e))

        self.count -= 1
        return response

    async def fail_callback(self, task: dict, aio_request: Request):
        retry = task['meta']['retry']
        retried = task.get('retried', 0)
        if retry != 0 and retried < retry:
            task.update({'retried': retried + 1})
            self.logger.log_it("Retry url:{} body:{} retried:{}".format(
                aio_request.url, aio_request.data, retried))
            # wait before retrying
            await asyncio.sleep(task['meta']['retry_wait'], loop=self.loop)
            await push_task(self.scheduler_downloader_queue, task, self.loop)

    async def success_callback(self, task: dict, response: Response):
        task.update({'response': response})
        await push_task(self.downloader_parser_queue, task, self.loop)

    async def request(self, aio_request: Request, task: dict):
        """Make the request, update the task and put it in the queue."""
        response = await self._request(aio_request, self.loop)
        # TODO: 99999 means an exception was caught during the request
        # (or we should uniform the status codes and write a doc)
        if response['status'] == -1:
            # -1 means ignore this response
            pass
        elif response['status'] != 99999:
            # success
            await self.success_callback(task, response)
        else:
            # fail
            await self.fail_callback(task, aio_request)

    async def start_crawler(self, connector):
        """Get an item from the queue, crawl it & push the result back at the end."""
        task = await get_task(self.scheduler_downloader_queue)
        if task is not None:
            self.count += 1
            aio_request = task['request']
            self.loop.create_task(
                self.request(aio_request=aio_request, task=task))
            # the limit of concurrent requests
            while self.count > self.conn_limit:
                await asyncio.sleep(0.5, loop=self.loop)
            self.loop.create_task(self.start_crawler(connector))
        else:
            # if the queue is empty, wait and try again
            await asyncio.sleep(catty.config.LOAD_QUEUE_INTERVAL,
                                loop=self.loop)
            self.loop.create_task(self.start_crawler(connector))

    def run(self):
        try:
            crawler = partial(
                self.start_crawler,
                connector=aiohttp.TCPConnector(verify_ssl=False))
            self.loop.create_task(crawler())
            self.loop.run_forever()
        except KeyboardInterrupt:
            self.logger.log_it("Bye!", level='INFO')
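
# The retry policy used by fail_callback() above, isolated as a pure function:
# a task is re-queued while meta['retry'] is non-zero and the 'retried'
# counter is still below it. Stdlib-only sketch for clarity.
def _should_retry(task: dict) -> bool:
    retry = task['meta']['retry']
    return retry != 0 and task.get('retried', 0) < retry

assert _should_retry({'meta': {'retry': 3}, 'retried': 2})
assert not _should_retry({'meta': {'retry': 3}, 'retried': 3})
assert not _should_retry({'meta': {'retry': 0}})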
class Parser(HandlerMixin):
    def __init__(self,
                 downloader_parser_queue: AsyncRedisPriorityQueue,
                 parser_scheduler_queue: AsyncRedisPriorityQueue,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 loop: BaseEventLoop,
                 name: str):
        """
        :param downloader_parser_queue: The redis queue
        :param parser_scheduler_queue: The redis queue
        :param loop: EventLoop
        """
        super(Parser, self).__init__()
        self.name = name
        self.downloader_parser_queue = downloader_parser_queue
        self.parser_scheduler_queue = parser_scheduler_queue
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.loop = loop

        self.spider_started = set()
        self.spider_stopped = set()
        self.spider_paused = set()
        self.all_spider_set = [
            self.spider_started, self.spider_stopped, self.spider_paused
        ]

        self.spider_module_handle = SpiderModuleHandle(
            catty.config.SPIDER_PATH)
        self.spider_module_handle.load_all_spider()

        self.logger = Log('Parser')
        self.ready_to_exit = False
        self.counter = Counter(loop)

    async def load_tasks(self, spider_name: str, which_q: str = ''):
        """Load the persisted tasks & push them to the queue."""
        tasks = await load_task(catty.config.PERSISTENCE['DUMP_PATH'],
                                '{}_{}'.format(self.name, which_q),
                                spider_name)
        if tasks:
            self.logger.log_it("[load_tasks]Load tasks:{}".format(tasks))
            for each_task in tasks:
                # push each task to its queue
                if which_q == PARSER_SCHEDULER:
                    await push_task(self.parser_scheduler_queue, each_task,
                                    self.loop)
                elif which_q == DOWNLOADER_PARSER:
                    await push_task(self.downloader_parser_queue, each_task,
                                    self.loop)

    async def dump_tasks(self, which_q: str):
        """Dump the tasks which are in the queue."""
        if which_q == PARSER_SCHEDULER:
            while await self.parser_scheduler_queue.qsize():
                task = await get_task(self.parser_scheduler_queue)
                if task is not None:
                    await dump_task(task,
                                    catty.config.PERSISTENCE['DUMP_PATH'],
                                    "{}_{}".format(self.name, which_q),
                                    task['spider_name'])
                    self.logger.log_it("[dump_task]Dump task:{}".format(task))
        elif which_q == DOWNLOADER_PARSER:
            while await self.downloader_parser_queue.qsize():
                task = await get_task(self.downloader_parser_queue)
                if task is not None:
                    await dump_task(task,
                                    catty.config.PERSISTENCE['DUMP_PATH'],
                                    "{}_{}".format(self.name, which_q),
                                    task['spider_name'])
                    self.logger.log_it("[dump_task]Dump task:{}".format(task))

    def dump_count(self):
        counter_data = {
            'value_d': self.counter.value_d,
            'cache_value': self.counter.cache_value
        }
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')
        dump_pickle_data(root, '{}_counter'.format(self.name), counter_data)
        self.logger.log_it("[dump_count]{}".format(counter_data))

    def load_count(self):
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')
        counter_data = load_pickle_data(root, '{}_counter'.format(self.name))
        self.counter.value_d = counter_data['value_d']
        self.counter.cache_value = counter_data['cache_value']
        self.logger.log_it("[load_count]{}".format(counter_data))

    def dump_status(self):
        status = {
            'spider_started': self.spider_started,
            'spider_paused': self.spider_paused,
            'spider_stopped': self.spider_stopped
        }
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')
        dump_pickle_data(root, '{}_status'.format(self.name), status)
        self.logger.log_it("[dump_status]{}".format(status))

    def load_status(self):
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')
        status = load_pickle_data(root, '{}_status'.format(self.name))
        self.spider_paused = status['spider_paused']
        self.spider_stopped = status['spider_stopped']
        self.spider_started = status['spider_started']
        self.all_spider_set = [
            self.spider_started, self.spider_stopped, self.spider_paused
        ]
        self.logger.log_it("[load_status]{}".format(status))

    def on_begin(self):
        """Run before starting, to do things like loading tasks or config."""
        self.load_status()
        self.load_count()
        for started_spider in self.spider_started:
            self.loop.create_task(
                self.load_tasks(started_spider, DOWNLOADER_PARSER))
            self.loop.create_task(
                self.load_tasks(started_spider, PARSER_SCHEDULER))

    async def check_end(self):
        done = []
        while not done or False in done:
            done = []
            psq = await self.parser_scheduler_queue.qsize()
            dpq = await self.downloader_parser_queue.qsize()
            if psq == 0 or psq is None:
                await asyncio.sleep(0.5, loop=self.loop)
                done.append(True)
            else:
                done.append(False)
            if dpq == 0 or dpq is None:
                await asyncio.sleep(0.5, loop=self.loop)
                done.append(True)
            else:
                done.append(False)
        self.ready_to_exit = True

    async def on_end(self):
        """Run on exit, to do things like dumping the queues.
        It sets self.ready_to_exit=True at the end.
        """
        # no need to
        # for task in asyncio.Task.all_tasks():
        #     task.cancel()
        if catty.config.PERSISTENCE['PERSIST_BEFORE_EXIT']:
            self.loop.create_task(self.dump_tasks(PARSER_SCHEDULER))
            self.loop.create_task(self.dump_tasks(DOWNLOADER_PARSER))
            self.dump_count()
            self.dump_status()
        self.loop.create_task(self.check_end())

    def get_spider_method(self, spider_name: str, method_name: str):
        """Return a bound method if the spider has it, or None if not."""
        try:
            # get the instantiated spider from the dict
            spider_ins = self.spider_module_handle.spider_instantiation[
                spider_name]
        except KeyError:
            self.logger.log_it(
                "[_run_ins_func]No such Spider, or it is not instantiated yet.",
                'WARN')
            return None, None

        # get the spider's method by name
        method = spider_ins.__getattribute__(method_name)
        return method, spider_ins

    async def _run_ins_func(self, spider_name: str, method_name: str,
                            task: dict):
        """Run the spider instance's bound method to parse the response
        & push the result to the parser-scheduler queue."""
        self.logger.log_it("[_run_ins_func]Parsing {}.{}".format(
            spider_name, method_name))
        _response = task['response']
        method, _ = self.get_spider_method(spider_name, method_name)
        if method is None:
            return

        # get the method's params
        _signature = inspect.signature(method).parameters
        try:
            # an async method defined by the user must take a loop param,
            # even if it never uses it
            if 'loop' in _signature:
                if 'task' in _signature:
                    parser_return = await method(_response, task=task,
                                                 loop=self.loop)
                else:
                    parser_return = await method(_response, loop=self.loop)
            else:
                if 'task' in _signature:
                    parser_return = method(_response, task=task)
                else:
                    parser_return = method(_response)
        except Retry_current_task:
            # handle it like a new task
            self.loop.create_task(
                push_task(self.parser_scheduler_queue, task, self.loop))
            return
        except Exception:
            # the exception comes from the user's spider
            traceback.print_exc()
            return

        if isinstance(parser_return, dict):
            # normal item
            if 'tid' not in parser_return:
                # not a task
                task['parser'].update({'item': parser_return})
                await push_task(self.parser_scheduler_queue, task, self.loop)
        elif isinstance(parser_return, list):
            # a list of tasks
            for each_return_task in parser_return:
                await push_task(self.scheduler_downloader_queue,
                                each_return_task, self.loop)
        elif parser_return is None:
            pass

    async def make_tasks(self):
        """Run the finished tasks & push the results."""
        task = await get_task(self.downloader_parser_queue)
        if task:
            self.loop.create_task(self.make_tasks())
            if 'status' in task['response']:
                if 200 <= task['response']['status'] < 400 and \
                        task['response']['status'] in task['handle_status_code']:
                    self.counter.add_success(task['spider_name'])
                    if task['spider_name'] in self.spider_started:
                        callback = task['callback']
                        spider_name = task['spider_name']
                        for callback_method_name in callback:
                            # the number of tasks the parser returns depends
                            # on the number of callbacks
                            each_task = deepcopy(task)
                            parser_method_name = callback_method_name.get(
                                'parser', None)
                            if parser_method_name:
                                self.loop.create_task(
                                    self._run_ins_func(spider_name,
                                                       parser_method_name,
                                                       each_task))
                    elif task['spider_name'] in self.spider_paused:
                        # persist
                        await dump_task(task,
                                        catty.config.PERSISTENCE['DUMP_PATH'],
                                        'parser', task['spider_name'])
                    elif task['spider_name'] in self.spider_stopped:
                        pass
                else:
                    retry = task['meta']['retry']
                    retried = task.get('retried', 0)
                    if retry != 0 and retried < retry:
                        task.update({'retried': retried + 1})
                        retry_method, _ = self.get_spider_method(
                            task['spider_name'], 'retry')
                        # it may return a list
                        retry_tasks = retry_method(task)
                        if not isinstance(retry_tasks, list):
                            retry_tasks = [retry_tasks]
                        for each_retry_task in retry_tasks:
                            await asyncio.sleep(task['meta']['retry_wait'],
                                                loop=self.loop)
                            await push_task(self.scheduler_downloader_queue,
                                            each_retry_task, self.loop)
                    self.counter.add_fail(task['spider_name'])
        else:
            # if there is no task in the downloader-parser queue, wait for it
            self.loop.call_later(
                catty.config.LOAD_QUEUE_INTERVAL,
                lambda: self.loop.create_task(self.make_tasks()))

    def quit(self):
        self.logger.log_it("[Ending]Doing the last thing...", level='INFO')
        self.loop.create_task(self.on_end())
        while not self.ready_to_exit:
            time.sleep(1)
        self.logger.log_it("Bye!", level='INFO')
        os._exit(0)

    def run_parser(self):
        for i in range(catty.config.NUM_OF_PARSER_MAKE_TASK):
            self.loop.create_task(self.make_tasks())
        self.loop.create_task(self.counter.update())
        self.loop.run_forever()

    def run(self):
        try:
            self.on_begin()
            xmlrpc_partial_func = partial(self.xmlrpc_run, name=self.name)
            handler_server_thread = threading.Thread(
                target=xmlrpc_partial_func)
            parser_thread = threading.Thread(target=self.run_parser)
            parser_thread.start()
            handler_server_thread.start()
            # On Windows KeyboardInterrupt cannot be caught here,
            # so also accept 'Q' on stdin to quit.
            while True:
                r = input()
                if r == 'Q':
                    self.quit()
        except KeyboardInterrupt:
            self.quit()
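
# Shape of the `callback` list that make_tasks() walks. Each entry may name a
# parser method (dispatched here via _run_ins_func) and/or fetcher methods
# (dispatched later by the Scheduler). Illustrative literal only; real tasks
# are built elsewhere in catty and carry request/meta/response fields too.
_example_callback = [
    {'parser': 'parse_index', 'fetcher': ['get_detail', 'get_comments']},
    {'parser': 'parse_detail'},
]
assert [c.get('parser') for c in _example_callback] == \
    ['parse_index', 'parse_detail']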
class Selector:
    def __init__(self,
                 spider_speed: dict,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 requests_queue: dict,
                 spider_stopped: set,
                 spider_paused: set,
                 spider_started: set,
                 spider_ready_start: set,
                 spider_todo: set,
                 loop: asyncio.BaseEventLoop):
        self.logger = Log('Selector')
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.requests_queue = requests_queue
        self.loop = loop

        # time from start until now
        self.running_time = 0
        self.run_at = int(time.time())

        self.spider_stopped = spider_stopped
        self.spider_paused = spider_paused
        self.spider_started = spider_started
        self.spider_ready_start = spider_ready_start
        self.spider_todo = spider_todo
        self.spider_speed = spider_speed
        self.spider_speed_reciprocal = {}

        self.init_speed()

    def init_speed(self):
        """Init all spider speeds."""
        self.spider_speed_reciprocal = {
            k: math.ceil(1 / v)
            for k, v in self.spider_speed.items()
        }

    def update_speed(self, spider_name: str, speed: int):
        """Update one spider's speed."""
        self.spider_speed.update({spider_name: speed})
        self.spider_speed_reciprocal.update(
            {spider_name: math.ceil(1 / speed)})

    async def _select_task(self, requests_q, spider_name):
        task = await get_task(requests_q)
        if task:
            if task['spider_name'] in self.spider_started:
                await push_task(self.scheduler_downloader_queue, task,
                                self.loop)
                self.logger.log_it('[select_task]{} tid:{}'.format(
                    spider_name, task['tid']))
            elif task['spider_name'] in self.spider_paused:
                # persist
                await dump_task(task, catty.config.PERSISTENCE['DUMP_PATH'],
                                'scheduler', task['spider_name'])
            elif task['spider_name'] in self.spider_stopped:
                pass

    async def select_task(self):
        # TODO time granularity
        last_running_time = self.running_time
        self.running_time = int(time.time()) - self.run_at
        for spider_name in self.spider_started:
            speed_reciprocal = self.spider_speed_reciprocal[spider_name]
            requests_q = self.requests_queue.setdefault(
                "{}:requests".format(spider_name),
                AsyncRedisPriorityQueue("{}:requests".format(spider_name),
                                        loop=self.loop))
            for each_diff_time in range(last_running_time, self.running_time):
                # time's up
                if each_diff_time % speed_reciprocal == 0:
                    # a speed greater than 1 means at least
                    # one request per second
                    if self.spider_speed[spider_name] > 1:
                        for i in range(self.spider_speed[spider_name]):
                            self.loop.create_task(
                                self._select_task(requests_q, spider_name))
                    else:
                        self.loop.create_task(
                            self._select_task(requests_q, spider_name))

        await asyncio.sleep(catty.config.SELECTOR_INTERVAL, loop=self.loop)
        self.loop.create_task(self.select_task())
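
# The rate-control arithmetic select_task() relies on, in isolation: for
# speed < 1 a slot fires every ceil(1/speed) seconds; for speed >= 1 the
# slot fires every second and schedules `speed` requests. Stdlib-only sketch.
import math

def _requests_fired(speed, seconds: int) -> int:
    reciprocal = math.ceil(1 / speed)
    fired = 0
    for tick in range(seconds):
        if tick % reciprocal == 0:
            fired += speed if speed > 1 else 1
    return fired

assert _requests_fired(0.2, 10) == 2   # one request every 5 seconds
assert _requests_fired(5, 10) == 50    # five requests per second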
class Scheduler(HandlerMixin):
    def __init__(self,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 parser_scheduler_queue: AsyncRedisPriorityQueue,
                 loop: asyncio.BaseEventLoop,
                 name: str):
        """
        :param scheduler_downloader_queue: The redis queue
        :param parser_scheduler_queue: The redis queue
        :param loop: EventLoop
        """
        super().__init__()
        self.name = name
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.parser_scheduler_queue = parser_scheduler_queue
        # connections of all requests-queues
        self.requests_queue_conn = {}
        self.bloom_filter = {}
        self.loop = loop

        # spider_ready_start: when you start a spider, it runs from the beginning.
        # spider_todo: spiders that have not run yet.
        self.spider_stopped = set()
        self.spider_paused = set()
        self.spider_started = set()
        self.spider_ready_start = set()
        self.spider_todo = set()
        self.all_spider_set = [
            self.spider_todo, self.spider_paused, self.spider_started,
            self.spider_ready_start, self.spider_stopped
        ]

        self.spider_module_handle = SpiderModuleHandle(
            catty.config.SPIDER_PATH)

        self.logger = Log('Scheduler')
        self.done_all_things = False
        self.selector = None

    def instantiate_spider(self):
        """Instantiate all spiders."""
        self.spider_module_handle.load_all_spider()

    def init_spider_set(self):
        """Init the spider sets."""
        for spider_file_name in self.spider_module_handle.namespace.keys():
            try:
                # find Spider.name
                spider_name = getattr(
                    self.spider_module_handle.namespace[spider_file_name][1],
                    'Spider').name
            except AttributeError:
                self.logger.log_it(
                    "[init_spider_set]Can't get spider's name. SpiderFile:{}"
                    .format(spider_file_name), 'WARN')
                continue

            if spider_name not in (self.spider_paused |
                                   self.spider_ready_start |
                                   self.spider_started |
                                   self.spider_stopped):
                self.spider_todo.add(spider_name)

    def instantiate_selector(self):
        # get each spider's speed, falling back to the configured default
        self.selector = Selector(
            {
                spider_name: get_default(
                    obj=self.spider_module_handle.
                    spider_instantiation[spider_name],
                    name_or_index='speed',
                    default=catty.config.SPIDER_DEFAULT['SPEED'])
                for spider_set in self.all_spider_set
                for spider_name in spider_set
            },
            self.scheduler_downloader_queue,
            self.requests_queue_conn,
            self.spider_stopped,
            self.spider_paused,
            self.spider_started,
            self.spider_ready_start,
            self.spider_todo,
            self.loop)

    async def load_tasks(self, spider_name: str, which_q: str = ''):
        """Load the persisted tasks & push them to the queue."""
        tasks = await load_task(catty.config.PERSISTENCE['DUMP_PATH'],
                                'request_queue_{}'.format(self.name),
                                spider_name)
        if tasks:
            self.logger.log_it("[load_tasks]Load tasks:{}".format(tasks))
            for each_task in tasks:
                # push each task to the request queue
                await push_task(self.scheduler_downloader_queue, each_task,
                                self.loop)

    async def dump_tasks(self, spider_name: str):
        """Dump the tasks which are in the queue."""
        request_q = self.requests_queue_conn.setdefault(
            "{}:requests".format(spider_name),
            AsyncRedisPriorityQueue("{}:requests".format(spider_name),
                                    loop=self.loop))
        if not request_q.redis_conn:
            await request_q.conn()

        while await request_q.qsize():
            task = await get_task(request_q)
            if task is not None:
                await dump_task(task, catty.config.PERSISTENCE['DUMP_PATH'],
                                'request_queue_{}'.format(self.name),
                                task['spider_name'])
                self.logger.log_it("[dump_task]Dump task:{}".format(task))

    async def dump_all_paused_task(self):
        """Dump the tasks whose spider is paused."""
        for paused_spider_name in self.spider_paused:
            await self.dump_tasks(paused_spider_name)

    def dump_speed(self):
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'],
                            'scheduler')
        dump_pickle_data(root, 'speed_{}'.format(self.name),
                         self.selector.spider_speed)
        self.logger.log_it("[dump_speed]{}".format(
            self.selector.spider_speed))

    def load_speed(self):
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'],
                            'scheduler')
        spider_speed = load_pickle_data(root, 'speed_{}'.format(self.name))
        self.selector.spider_speed = spider_speed
        self.selector.init_speed()
        self.logger.log_it("[load_speed]{}".format(spider_speed))

    def dump_status(self):
        status = {
            'spider_started': self.spider_started,
            'spider_paused': self.spider_paused,
            'spider_stopped': self.spider_stopped,
            'spider_todo': self.spider_todo,
            'spider_ready_start': self.spider_ready_start,
        }
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'],
                            'scheduler')
        dump_pickle_data(root, 'status_{}'.format(self.name), status)
        self.logger.log_it("[dump_status]{}".format(status))

    def load_status(self):
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'],
                            'scheduler')
        status = load_pickle_data(root, 'status_{}'.format(self.name))
        self.spider_paused = status['spider_paused']
        self.spider_stopped = status['spider_stopped']
        self.spider_started = status['spider_started']
        self.spider_todo = status['spider_todo']
        self.spider_ready_start = status['spider_ready_start']
        self.all_spider_set = [
            self.spider_todo, self.spider_paused, self.spider_started,
            self.spider_ready_start, self.spider_stopped
        ]
        self.logger.log_it("[load_status]{}".format(status))

    async def clean_requests_queue(self, spider_name: str):
        """Clean the spider's requests queue."""
        request_q = self.requests_queue_conn.setdefault(
            "{}:requests".format(spider_name),
            AsyncRedisPriorityQueue("{}:requests".format(spider_name),
                                    loop=self.loop))
        if not request_q.redis_conn:
            await request_q.conn()
        await request_q.clear()
        self.logger.log_it(
            "[clean_requests_queue]Clean spider {}'s requests queue".format(
                spider_name))

    async def get_requests_queue_size(self, spider_name: str):
        request_q = self.requests_queue_conn.setdefault(
            "{}:requests".format(spider_name),
            AsyncRedisPriorityQueue("{}:requests".format(spider_name),
                                    loop=self.loop))
        return await request_q.qsize()

    async def clean_dupefilter(self, spider_name: str):
        """Clean the spider's DupeFilter."""
        bloomfilter = self.bloom_filter.get(spider_name)
        if bloomfilter:
            await bloomfilter.conn()
            await bloomfilter.clean()
        try:
            self.bloom_filter.pop(spider_name)
        except KeyError:
            self.logger.log_it(
                "[clean_dupefilter]Can't find bloomfilter. Spidername:{}"
                .format(spider_name))
        self.logger.log_it(
            "[clean_dupefilter]Clean bloomfilter:{}".format(spider_name))

    def on_begin(self):
        """Run before starting, to do things like loading tasks or config."""
        self.load_status()
        for started_spider in self.spider_started:
            # load every started spider's requests
            self.loop.create_task(
                self.load_tasks(started_spider, SCHEDULER_DOWNLOADER))
        self.instantiate_spider()
        self.init_spider_set()
        self.instantiate_selector()
        self.load_speed()

    async def check_end(self):
        done = []
        while not done or False in done:
            done = []
            for spider_set in self.all_spider_set:
                for spider_name in spider_set:
                    q = await self.get_requests_queue_size(spider_name)
                    if q == 0 or q is None:
                        await asyncio.sleep(0.5, loop=self.loop)
                        done.append(True)
                    else:
                        done.append(False)
        self.done_all_things = True

    async def on_end(self):
        """Run on exit, to do things like dumping the queues.
        It sets self.done_all_things=True at the end.
        """
        # for task in asyncio.Task.all_tasks():
        #     task.cancel()
        self.dump_speed()
        self.dump_status()
        if catty.config.PERSISTENCE['PERSIST_BEFORE_EXIT']:
            for spider_set in self.all_spider_set:
                for spider_name in spider_set:
                    self.loop.create_task(self.dump_tasks(spider_name))
            self.loop.create_task(self.check_end())
        else:
            self.done_all_things = True

    async def push_requests(self, task, spider_ins, spider_name):
        """Filter the request & push it to the requests queue."""
        # DupeFilter
        f = True
        if task['meta']['dupe_filter']:
            seeds = get_default(spider_ins, 'seeds',
                                catty.config.SPIDER_DEFAULT['SEEDS'])
            blocknum = get_default(spider_ins, 'blocknum',
                                   catty.config.SPIDER_DEFAULT['BLOCKNUM'])
            bloom_filter = self.bloom_filter.setdefault(
                task['spider_name'],
                RedisBloomFilter(self.loop,
                                 task['spider_name'] + ':DupeFilter',
                                 seeds,
                                 blockNum=blocknum))
            if not bloom_filter.redis_conn:
                await bloom_filter.conn()
            if not await bloom_filter.is_contain(task['tid']):
                await bloom_filter.add(task['tid'])
            else:
                self.logger.log_it(
                    "[run_ins_func]Filtered tid:{} url:{} data:{} params:{}"
                    .format(task['tid'], task['request'].url,
                            task['request'].data, task['request'].params),
                    level='INFO')
                f = False

        if f:
            self.logger.log_it(
                "[run_ins_func]New request tid:{} url:{} data:{} params:{}"
                .format(task['tid'], task['request'].url,
                        task['request'].data, task['request'].params),
                level='INFO')
            request_q = self.requests_queue_conn.setdefault(
                "{}:requests".format(spider_name),
                AsyncRedisPriorityQueue("{}:requests".format(spider_name),
                                        loop=self.loop))
            await push_task(request_q, task, self.loop)

    def get_spider_method(self, spider_name: str, method_name: str):
        """Return a bound method if the spider has it, or None if not."""
        try:
            # get the instantiated spider from the dict
            spider_ins = self.spider_module_handle.spider_instantiation[
                spider_name]
        except KeyError:
            self.logger.log_it(
                "[_run_ins_func]No such Spider, or it is not instantiated yet.",
                'WARN')
            # try to reload it
            self.spider_module_handle.update_spider(spider_name)
            return None, None

        # get the spider's method by name
        method = spider_ins.__getattribute__(method_name)
        return method, spider_ins

    async def _run_ins_func(self, spider_name: str, method_name: str,
                            task: dict = None):
        """Run the spider instance's bound method to build tasks
        & push them to the request queue."""
        # get the method from the instance
        method, spider_ins = self.get_spider_method(spider_name, method_name)
        if method is None:
            return

        try:
            if task:
                func_return_task = method(task=task)
            else:
                # without a task param: "start" etc...
                func_return_task = method()
        except Exception:
            # the exception comes from the user's spider
            traceback.print_exc()
            return

        if not isinstance(func_return_task, list):
            func_return_task = [func_return_task]

        # each returned request becomes one task
        for each_task in func_return_task:
            if not isinstance(each_task, Task):
                self.logger.log_it(
                    "[run_ins_func]Not a Task returned in {}".format(
                        spider_name), 'WARN')
                continue
            self.loop.create_task(
                self.push_requests(each_task, spider_ins, spider_name))

    async def make_tasks(self):
        """Start the ready_start spiders, process the finished tasks
        & push them."""
        # start the "ready_start" spiders
        had_started_ = set()
        for spider_name in self.spider_ready_start:
            # run the spider's start method
            self.logger.log_it(
                '[make_tasks]Starting spider:{}'.format(spider_name), 'INFO')
            self.loop.create_task(self._run_ins_func(spider_name, 'start'))
            self.spider_started.add(spider_name)
            had_started_.add(spider_name)
        self.spider_ready_start -= had_started_

        # from the finished tasks
        task = await get_task(self.parser_scheduler_queue)
        if task:
            self.loop.create_task(self.make_tasks())
            spider_name = task['spider_name']
            if task['spider_name'] in self.spider_started:
                callback = task['callback']
                for callback_method_name in callback:
                    fetcher_method_name = callback_method_name.get(
                        'fetcher', None)
                    if not fetcher_method_name:
                        continue
                    if not isinstance(fetcher_method_name, list):
                        fetcher_method_name = [fetcher_method_name]

                    # a task can have many fetcher callbacks
                    for each_fetcher_method_name in fetcher_method_name:
                        # make a new task; if the user needs to keep data
                        # from the last task (meta etc.), they must handle it
                        self.logger.log_it(
                            '[make_tasks]{}.{} making task'.format(
                                spider_name, each_fetcher_method_name))
                        self.loop.create_task(
                            self._run_ins_func(spider_name,
                                               each_fetcher_method_name,
                                               task))
            elif task['spider_name'] in self.spider_paused:
                # persist
                await dump_task(task, catty.config.PERSISTENCE['DUMP_PATH'],
                                'scheduler', task['spider_name'])
                self.loop.create_task(self.dump_tasks(spider_name))
            elif task['spider_name'] in self.spider_stopped:
                pass
            elif task['spider_name'] in self.spider_todo:
                pass
        else:
            self.loop.call_later(
                catty.config.LOAD_QUEUE_INTERVAL,
                lambda: self.loop.create_task(self.make_tasks()))

    def quit(self):
        self.logger.log_it("[Ending]Doing the last thing...")
        self.loop.create_task(self.on_end())
        while True:
            if self.done_all_things:
                self.logger.log_it("Bye!")
                os._exit(0)
            else:
                # don't block the thread
                time.sleep(1)

    def run_scheduler(self):
        self.loop.create_task(self.selector.select_task())
        for i in range(catty.config.NUM_OF_SCHEDULER_MAKE_TASK):
            self.loop.create_task(self.make_tasks())
        self.loop.run_forever()

    def run(self):
        try:
            self.on_begin()
            xmlrpc_partial_func = partial(self.xmlrpc_run, name=self.name)
            handler_server_thread = threading.Thread(
                target=xmlrpc_partial_func)
            handler_server_thread.start()
            scheduler_thread = threading.Thread(target=self.run_scheduler)
            scheduler_thread.start()
            # scheduler_thread.join()
            while True:
                r = input()
                if r == 'Q':
                    self.quit()
        except KeyboardInterrupt:
            self.quit()
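
# How the pieces are typically wired together. Hedged sketch: the queue names
# and the AsyncRedisPriorityQueue constructor signature are inferred from the
# calls above, and catty's real entry point may differ.
#
#     loop = asyncio.get_event_loop()
#     scheduler_downloader_queue = AsyncRedisPriorityQueue(
#         'scheduler:downloader', loop=loop)
#     parser_scheduler_queue = AsyncRedisPriorityQueue(
#         'parser:scheduler', loop=loop)
#     Scheduler(scheduler_downloader_queue, parser_scheduler_queue,
#               loop, 'master_scheduler').run()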