Ejemplo n.º 1
0
    def __init__(self):
        """Create the action-name -> bound-handler lookup tables.

        ``scheduler_handler`` and ``parser_handler`` hold the actions offered
        under each role; ``all_handler`` is the union of both tables.
        """
        self._logger = Log('HandlerMixin')

        # Actions that appear in both the scheduler and the parser table.
        common_handler = {
            'pause': self.handle_pause_spider,
            'start': self.handle_start_spider,
            'run': self.handle_run_spider,
            'stop': self.handle_stop_spider,
            'update_spider': self.handle_update_spider,
            'delete_spider': self.handle_delete_spider,
        }

        self.scheduler_handler = dict(common_handler)
        self.scheduler_handler.update({
            'list_spiders': self.handle_list_spiders,
            'list_speed': self.handle_list_speed,
            'set_speed': self.handle_set_speed,
            'clean_request_queue': self.handle_clean_request_queue,
            'clean_dupe_filter': self.handle_clean_dupe_filter,
        })

        self.parser_handler = {'list_count': self.handle_count}
        self.parser_handler.update(common_handler)

        # Union of both tables; parser entries are applied last.
        self.all_handler = dict(self.scheduler_handler)
        self.all_handler.update(self.parser_handler)
Ejemplo n.º 2
0
    def __init__(self, downloader_parser_queue: AsyncRedisPriorityQueue,
                 parser_scheduler_queue: AsyncRedisPriorityQueue,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 loop: BaseEventLoop, name: str):
        """Store the queues, create the spider-state sets and load spiders.

        :param downloader_parser_queue: The redis queue
        :param parser_scheduler_queue: The redis queue
        :param scheduler_downloader_queue: The redis queue
        :param loop: EventLoop
        :param name: stored as ``self.name``
        """
        super().__init__()

        self.name = name
        self.loop = loop

        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.parser_scheduler_queue = parser_scheduler_queue
        self.downloader_parser_queue = downloader_parser_queue

        # One set per spider state.
        self.spider_started, self.spider_stopped, self.spider_paused = (
            set(), set(), set())
        self.all_spider_set = [
            self.spider_started,
            self.spider_stopped,
            self.spider_paused,
        ]

        # Spiders are loaded from catty.config.SPIDER_PATH right away.
        self.spider_module_handle = SpiderModuleHandle(
            catty.config.SPIDER_PATH)
        self.spider_module_handle.load_all_spider()

        self.logger = Log('Parser')
        self.ready_to_exit = False
        self.counter = Counter(loop)
Ejemplo n.º 3
0
class BaseHandleClient:
    """Client side of the WebUI-Server <-> Scheduler & Parser connection.

    Holds one XMLRPCClient per entry of ``catty.config.PORT['SCHEDULER']``
    and ``catty.config.PORT['PARSER']`` and fans an action out to them.
    """
    logger = Log('HandlerClient')

    # Action names understood by the schedulers.
    scheduler_handler_name = {
        'pause',
        'start',
        'run',
        'stop',
        'update_spider',
        'delete_spider',
        'list_spiders',
        'list_speed',
        'set_speed',
        'clean_request_queue',
        'clean_dupe_filter',
    }
    # Action names understood by the parsers.
    parser_handler_name = {
        'list_count',
        'pause',
        'start',
        'run',
        'stop',
        'update_spider',
        'delete_spider',
    }
    # Action names that go to both kinds of client.
    scheduler_parser_handler_name = scheduler_handler_name & parser_handler_name

    def __init__(self):
        """Create the scheduler and parser client tables from the config."""

        def build_clients(ports):
            # ports maps a client key to the localhost port it is reached on.
            clients = {}
            for key, port in ports.items():
                clients[key] = XMLRPCClient(
                    "http://localhost:{}".format(port))
            return clients

        self.scheduler_clients = build_clients(catty.config.PORT['SCHEDULER'])
        self.parser_clients = build_clients(catty.config.PORT['PARSER'])

    def action(self, action_type: str, **kwargs) -> list:
        """Send ``action_type`` to every client that supports it.

        The remote method named ``action_type`` is called with one dict that
        holds ``'type'`` plus all keyword arguments.

        :param action_type: name of the action, e.g. 'run', 'start'
        :param kwargs: extra entries merged into the dict sent to the clients
        :return: one result per contacted client (parsers first when both
            kinds are contacted), or a single USER_ERROR entry when the
            action name is unknown
        """
        context = dict({'type': action_type}, **kwargs)

        if action_type in self.scheduler_parser_handler_name:
            targets = list(self.parser_clients.values())
            targets.extend(self.scheduler_clients.values())
        elif action_type in self.scheduler_handler_name:
            targets = list(self.scheduler_clients.values())
        elif action_type in self.parser_handler_name:
            targets = list(self.parser_clients.values())
        else:
            return [{
                'code': STATUS_CODE.USER_ERROR,
                'msg': 'Not a vaild action type.'
            }]

        return [getattr(client, action_type)(context) for client in targets]
Ejemplo n.º 4
0
class SpiderModuleHandle(ModuleHandle):
    """Module handle that also keeps one instance of each loaded ``Spider``."""

    def __init__(self, path):
        """Initialise the base handle for ``path`` and the instance table."""
        super().__init__(path)

        self.logger = Log('SpiderModuleHandle')
        # spider_name:spider_instantiation
        self.spider_instantiation = dict()

    def _instance_spider(self, name):
        """Instantiate the ``Spider`` class of the module stored under ``name``.

        Any exception is logged at WARN level and swallowed.
        """
        try:
            _, spider_module = self.namespace[name]
            spider_cls = spider_module.Spider
            key = spider_cls.name
            self.spider_instantiation[key] = spider_cls()
            message = "[load_spider]Load spider name:{}".format(
                spider_cls.name)
            self.logger.log_it(message, 'INFO')
        except Exception as e:
            self.logger.log_it("[load_spider]ErrInfo:{}".format(e), 'WARN')

    def _instance_all_spider(self):
        """Instantiate the spider of every entry in ``self.namespace``."""
        for module_key in self.namespace:
            self._instance_spider(module_key)

    def load_all_spider(self):
        """Load all modules, then instantiate every spider."""
        self.load_all_module()
        self._instance_all_spider()

    def load_new_spider(self):
        """Load new modules, then instantiate every spider."""
        self.load_new_module()
        self._instance_all_spider()

    def update_spider(self, spider_file_name):
        """Update one spider module and re-instantiate its spider.

        '.py' is appended when the given name does not contain it.
        """
        if '.py' in spider_file_name:
            target = spider_file_name
        else:
            target = spider_file_name + '.py'
        self.update_module(target)
        self._instance_spider(target)

    def delete_spider(self, spider_file_name):
        """Drop the spider instance and its module.

        Raises KeyError when no instance is stored under ``spider_file_name``.
        """
        # TODO spider_file_name == Spider.name
        del self.spider_instantiation[spider_file_name]
        self.delete_module(spider_file_name)
Ejemplo n.º 5
0
    def __init__(self,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 downloader_parser_queue: AsyncRedisPriorityQueue,
                 loop: BaseEventLoop,
                 conn_limit: int):
        """Store the queues, the event loop and the connection limit.

        :param scheduler_downloader_queue: The redis queue
        :param downloader_parser_queue: The redis queue
        :param loop: EventLoop
        :param conn_limit: Limit of the total number of simultaneous connections.
        """
        self.loop = loop

        self.downloader_parser_queue = downloader_parser_queue
        self.scheduler_downloader_queue = scheduler_downloader_queue

        # ``count`` is the counter checked against ``conn_limit``.
        self.conn_limit = conn_limit
        self.count = 0

        self.logger = Log('Downloader')
Ejemplo n.º 6
0
    def __init__(self, spider_speed: dict,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 requests_queue: dict, spider_stopped: set, spider_paused: set,
                 spider_started: set, spider_ready_start: set,
                 spider_todo: set, loop: asyncio.BaseEventLoop):
        """Keep references to the shared state and initialise the speeds.

        Every argument is stored as given (no copies are made here);
        ``init_speed()`` runs last, once all attributes are in place.
        """
        self.logger = Log('Selector')

        self.loop = loop
        self.requests_queue = requests_queue
        self.scheduler_downloader_queue = scheduler_downloader_queue

        # Wall-clock second at construction, and time from begin to now.
        self.run_at = int(time.time())
        self.running_time = 0

        # Spider-state sets handed in by the caller.
        self.spider_todo = spider_todo
        self.spider_ready_start = spider_ready_start
        self.spider_started = spider_started
        self.spider_paused = spider_paused
        self.spider_stopped = spider_stopped

        self.spider_speed_reciprocal = dict()
        self.spider_speed = spider_speed

        self.init_speed()
Ejemplo n.º 7
0
    def __init__(self, scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 parser_scheduler_queue: AsyncRedisPriorityQueue,
                 loop: asyncio.BaseEventLoop, name: str):
        """Store the queues and create the empty spider-state sets.

        :param scheduler_downloader_queue: The redis queue
        :param parser_scheduler_queue: The redis queue
        :param loop: EventLoop
        :param name: stored as ``self.name``
        """
        super().__init__()
        self.name = name
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.parser_scheduler_queue = parser_scheduler_queue
        # connection of all requests-queue
        self.requests_queue_conn = {}
        self.bloom_filter = {}
        self.loop = loop

        # spider_ready_start: means that you start a spider, it will run from
        # the beginning.
        # spider_todo: presumably the spiders that have not run yet (the
        # original note called this "spider_new").
        self.spider_stopped = set()
        self.spider_paused = set()
        self.spider_started = set()
        self.spider_ready_start = set()
        self.spider_todo = set()

        # Every state set exactly once, so code that walks this list (for
        # example to purge a spider) also reaches ``spider_stopped``.
        self.all_spider_set = [
            self.spider_todo, self.spider_paused, self.spider_started,
            self.spider_ready_start, self.spider_stopped
        ]

        self.spider_module_handle = SpiderModuleHandle(
            catty.config.SPIDER_PATH)

        self.logger = Log('Scheduler')
        self.done_all_things = False
        # No selector yet; it is assigned outside this constructor.
        self.selector = None
Ejemplo n.º 8
0
class HandlerMixin:
    """A handler mixin shared by Scheduler and Parser.

    The handlers read attributes such as ``name``, ``loop``, the ``spider_*``
    sets, ``spider_module_handle``, ``selector`` and ``counter``; none of
    those are created here, so the host class must provide them. Role checks
    are substring tests on ``self.name`` ('scheduler' / 'parser').
    """

    def __init__(self):
        """Create the action-name -> bound-handler lookup tables."""
        self._logger = Log('HandlerMixin')
        self.scheduler_handler = {
            'pause': self.handle_pause_spider,
            'start': self.handle_start_spider,
            'run': self.handle_run_spider,
            'stop': self.handle_stop_spider,
            'update_spider': self.handle_update_spider,
            'delete_spider': self.handle_delete_spider,
            'list_spiders': self.handle_list_spiders,
            'list_speed': self.handle_list_speed,
            'set_speed': self.handle_set_speed,
            'clean_request_queue': self.handle_clean_request_queue,
            'clean_dupe_filter': self.handle_clean_dupe_filter
        }
        self.parser_handler = {
            'list_count': self.handle_count,
            'pause': self.handle_pause_spider,
            'start': self.handle_start_spider,
            'run': self.handle_run_spider,
            'stop': self.handle_stop_spider,
            'update_spider': self.handle_update_spider,
            'delete_spider': self.handle_delete_spider
        }

        # Union of both tables; parser entries are applied last.
        self.all_handler = {}
        self.all_handler.update(self.scheduler_handler)
        self.all_handler.update(self.parser_handler)

    def handle_pause_spider(self: "Scheduler", msg) -> tuple:
        """Pause the spider.

        Scheduler & Parser:
            Started->-spider_started & +spider_paused
            Paused->PASS
            Stop->PASS

        :param msg: dict carrying 'spider_name'
        :return: (status code, payload dict); ARGS_ERROR without a name
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_started:
            set_safe_remove(self.spider_started, spider_name)
            self.spider_paused.add(spider_name)

        # Dump request queue
        if 'scheduler' in self.name:
            self.loop.create_task(self.dump_tasks(spider_name))
        elif 'parser' in self.name:
            self.loop.create_task(self.dump_tasks(DOWNLOADER_PARSER))
            self.loop.create_task(self.dump_tasks(PARSER_SCHEDULER))

        self._logger.log_it(
            "[pause_spider]Success pause spider spider:{}".format(spider_name))
        return STATUS_CODE.OK, {}

    def handle_stop_spider(self: "Scheduler", msg) -> tuple:
        """Stop the spider.

        Scheduler & Parser:
            Started->-spider_started & +spider_stopped
            Paused->-spider_paused & +spider_stopped
            Stopped->PASS

        :param msg: dict carrying 'spider_name'
        :return: (status code, payload dict); ARGS_ERROR without a name
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_started:
            set_safe_remove(self.spider_started, spider_name)
            self.spider_stopped.add(spider_name)
        elif spider_name in self.spider_paused:
            set_safe_remove(self.spider_paused, spider_name)
            self.spider_stopped.add(spider_name)

        # The message names the action that actually ran (stop, not pause).
        self._logger.log_it(
            "[stop_spider]Success stop spider spider:{}".format(spider_name))
        return STATUS_CODE.OK, {}

    def handle_run_spider(self: "Scheduler", msg):
        """Continue a paused spider, or start a stopped one from the beginning.

        Scheduler:
            Started->PASS
            PAUSED->+spider_started
            STOPPED->+spider_ready_start
            TODO->+spider_started
        Parser:
            Started->PASS
            PAUSED->+spider_started
            STOPPED->+spider_started

        :param msg: dict carrying 'spider_name'
        :return: (status code, payload dict); ARGS_ERROR without a name
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_paused:
            set_safe_remove(self.spider_paused, spider_name)
            self.spider_started.add(spider_name)
        elif spider_name in self.spider_stopped:
            set_safe_remove(self.spider_stopped, spider_name)
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
            else:
                self.spider_started.add(spider_name)
        elif 'scheduler' in self.name and spider_name in self.spider_todo:
            set_safe_remove(self.spider_todo, spider_name)
            self.spider_started.add(spider_name)
        elif 'parser' in self.name:
            self.spider_started.add(spider_name)

        # load the persist file
        if 'scheduler' in self.name:
            self.loop.create_task(self.load_tasks(spider_name))
        else:
            self.loop.create_task(
                self.load_tasks(spider_name, PARSER_SCHEDULER))
            self.loop.create_task(
                self.load_tasks(spider_name, DOWNLOADER_PARSER))
        self._logger.log_it(
            "[run_spider]Success spider:{}".format(spider_name))

        return STATUS_CODE.OK, {}

    def handle_start_spider(self: "Scheduler", msg):
        """Start a spider from the beginning, whatever state it is in.

        Scheduler:
            Started->+spider_ready_start
            PAUSED->+spider_started & +spider_ready_start
            STOPPED->+spider_started & +spider_ready_start
            TODO->+spider_ready_start
        Parser:
            Started->PASS
            PAUSED->+spider_started
            STOPPED->+spider_started

        :param msg: dict carrying 'spider_name'
        :return: (status code, payload dict); ARGS_ERROR without a name
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if 'scheduler' in self.name and spider_name in self.spider_todo:
            set_safe_remove(self.spider_todo, spider_name)
            self.spider_ready_start.add(spider_name)
        elif spider_name in self.spider_paused:
            set_safe_remove(self.spider_paused, spider_name)
            self.spider_started.add(spider_name)
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
        elif spider_name in self.spider_stopped:
            set_safe_remove(self.spider_stopped, spider_name)
            self.spider_started.add(spider_name)
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
        elif spider_name in self.spider_started:
            if 'scheduler' in self.name:
                self.spider_ready_start.add(spider_name)
        elif 'parser' in self.name:
            self.spider_started.add(spider_name)
        self._logger.log_it(
            "[start_spider]Success spider:{}".format(spider_name))
        return STATUS_CODE.OK, {}

    # ------------------------SCHEDULER_ONLY----------------------------------

    def handle_list_spiders(self: "Scheduler", msg) -> tuple:
        """Return the spider names of every state set as lists.

        A state whose set is missing or empty is reported as ``[]``.
        """

        def names_in(attr_name):
            # One lookup per set; falsy results collapse to an empty list.
            members = get_default(self, attr_name)
            return list(members) if members else []

        return STATUS_CODE.OK, {
            'Started': names_in('spider_started'),
            'Todo': names_in('spider_todo'),
            'Paused': names_in('spider_paused'),
            'Stopped': names_in('spider_stopped'),
            'Ready start': names_in('spider_ready_start'),
        }

    def handle_list_speed(self: "Scheduler", msg) -> tuple:
        """Return the selector's ``spider_speed`` table."""
        return STATUS_CODE.OK, self.selector.spider_speed

    def handle_set_speed(self: "Scheduler", msg) -> tuple:
        """Set the speed of one spider on the selector.

        :param msg: dict carrying 'spider_name' and 'spider_speed'
        :return: (status code, payload dict); ARGS_ERROR when the name is
            missing or the speed is missing / not convertible to int
        """
        spider_name = msg.get('spider_name')
        speed = msg.get('spider_speed')
        # ``speed is None`` rather than ``not speed`` so that 0 stays valid.
        if not spider_name or speed is None:
            return STATUS_CODE.ARGS_ERROR, {}
        try:
            speed = int(speed)
        except (TypeError, ValueError):
            return STATUS_CODE.ARGS_ERROR, {}

        if 'scheduler' in self.name:
            self.selector.update_speed(spider_name, speed)
        return STATUS_CODE.OK, {}

    def handle_clean_request_queue(self: "Scheduler", msg) -> tuple:
        """Schedule a request-queue cleanup; only 'master_scheduler' acts.

        :param msg: dict carrying 'spider_name'
        :return: OK when scheduled, NOT_MY_BUSINESS on any other instance,
            ARGS_ERROR without a name
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if self.name == 'master_scheduler':
            self.loop.create_task(self.clean_requests_queue(spider_name))
            return STATUS_CODE.OK, {}
        else:
            return STATUS_CODE.NOT_MY_BUSINESS, {}

    def handle_clean_dupe_filter(self: "Scheduler", msg) -> tuple:
        """Schedule a dupe-filter cleanup on the instance named below.

        :param msg: dict carrying 'spider_name'
        :return: OK when scheduled, NOT_MY_BUSINESS on any other instance,
            ARGS_ERROR without a name
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        # NOTE(review): this handler is only registered in scheduler_handler,
        # yet it matches the name 'master_parser' -- confirm whether
        # 'master_scheduler' was intended.
        if self.name == 'master_parser':
            self.loop.create_task(self.clean_dupefilter(spider_name))
            return STATUS_CODE.OK, {}
        else:
            return STATUS_CODE.NOT_MY_BUSINESS, {}

    # ---------------------------------------------------------------------

    def handle_update_spider(self: "Scheduler", msg) -> tuple:
        """Reload one spider module and, on a scheduler, queue it as todo.

        :param msg: dict carrying 'spider_name' (the spider file name)
        :return: (status code, payload dict); ARGS_ERROR without a name
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        self.spider_module_handle.update_spider(spider_name)
        # only can upload .py files
        # ``spider_todo`` and ``selector`` are only touched behind a
        # scheduler check elsewhere in this class, so do the same here
        # instead of failing on an instance that lacks them.
        if '.py' in spider_name and 'scheduler' in self.name:
            spider_module = self.spider_module_handle.namespace[spider_name][1]
            loaded_name = getattr(spider_module, 'Spider').name
            self.spider_todo.add(loaded_name)
            self.selector.update_speed(loaded_name, 1)
        return STATUS_CODE.OK, {}

    def handle_delete_spider(self: "Scheduler", msg) -> tuple:
        """Delete a spider that is not running and schedule its cleanup.

        :param msg: dict carrying 'spider_name'
        :return: ARGS_ERROR without a name, USER_ERROR while the spider is
            started (or, on a scheduler, ready to start), otherwise OK
        """
        spider_name = msg.get('spider_name')
        if not spider_name:
            return STATUS_CODE.ARGS_ERROR, {}

        if spider_name in self.spider_started or ('scheduler' in self.name
                                                  and spider_name
                                                  in self.spider_ready_start):
            return STATUS_CODE.USER_ERROR, {
                'msg': "Please stop the spider first."
            }

        # The clean_* handlers are not called from here: they expect the
        # message dict, and handing them the bare name raised AttributeError.
        # The two tasks scheduled at the end cover the same cleanup.
        self.spider_module_handle.delete_spider(spider_name)

        for spider_set in self.all_spider_set:
            spider_set.discard(spider_name)

        self.loop.create_task(self.clean_requests_queue(spider_name))
        self.loop.create_task(self.clean_dupefilter(spider_name))
        return STATUS_CODE.OK, {}

    # -------------------------PARSER ONLY---------------------------------

    def handle_count(self: "Parser", msg) -> tuple:
        """Return whatever ``self.counter.count_all()`` reports."""
        return STATUS_CODE.OK, self.counter.count_all()

    # ---------------------------------------------------------------------

    def xmlrpc_run(self: "Scheduler", name):
        """Serve this instance's handler table over XML-RPC until stopped.

        :param name: key into ``catty.config.PORT['SCHEDULER']`` or
            ``catty.config.PORT['PARSER']`` selecting the localhost port
        """
        if 'scheduler' in self.name:
            port = catty.config.PORT['SCHEDULER'][name]
            handlers = self.scheduler_handler
        else:
            port = catty.config.PORT['PARSER'][name]
            handlers = self.parser_handler

        application = ThreadXMLRPCServer(('localhost', port))
        for action_name, handler in handlers.items():
            application.register_function(handler, action_name)

        application.serve_forever()
Ejemplo n.º 9
0
    def __init__(self, path):
        """Initialise the base handle for ``path`` and the instance table."""
        super().__init__(path)

        self.logger = Log('SpiderModuleHandle')
        # spider_name:spider_instantiation
        self.spider_instantiation = dict()
Ejemplo n.º 10
0
class DownLoader:
    """Take tasks from the scheduler->downloader queue, fetch them with
    aiohttp and push the outcome to the downloader->parser queue."""

    # HTTP verbs that have a same-named (lower-case) ClientSession method.
    _SUPPORTED_METHODS = frozenset(
        ('GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH'))
    # Status set on the Response when the request raised an exception.
    _STATUS_REQUEST_FAILED = 99999
    # Status set on the Response when the request must be ignored.
    _STATUS_IGNORED = -1

    def __init__(self,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 downloader_parser_queue: AsyncRedisPriorityQueue,
                 loop: BaseEventLoop,
                 conn_limit: int):
        """
        :param scheduler_downloader_queue:The redis queue
        :param downloader_parser_queue:The redis queue
        :param loop:EventLoop
        :param conn_limit:Limit of The total number for simultaneous connections.
        """
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.downloader_parser_queue = downloader_parser_queue

        self.loop = loop
        self.conn_limit = conn_limit

        # using in conn_limit
        self.count = 0
        self.logger = Log('Downloader')

    async def _request(self, aio_request: Request, loop: BaseEventLoop) -> Response:
        """Perform the real request and wrap the outcome in a Response.

        :return: the populated Response on success, a Response with status
            99999 when anything raised, or one with status -1 when the
            request method is not supported
        """
        t_ = time.time()
        self.logger.log_it("Downloading url:{} data:{}".format(aio_request.url, aio_request.data))
        try:
            if aio_request.method not in self._SUPPORTED_METHODS:
                self.logger.log_it("Not a vaild method.Request:{}".format(aio_request), level='INFO')
                return Response(status=self._STATUS_IGNORED,
                                body=str("Not a vaild method.Request:{}".format(aio_request)), )

            async with aiohttp.ClientSession(loop=loop) as session:
                # 'PATCH' -> session.patch, 'GET' -> session.get, ...
                send = getattr(session, aio_request.method.lower())
                async with send(**aio_request.dump_request()) as client:
                    body = await client.read()

                response = Response(
                    # TODO text accept encoding param to encode the body
                    # text= await client.text(),
                    method=client.method,
                    status=client.status,
                    cookies=client.cookies,
                    headers=client.raw_headers,
                    charset=client.charset,
                    content_type=client.content_type,
                    # history= client.history,
                    body=body,
                    use_time=time.time() - t_,
                    url=client.url,
                )

        except Exception as e:
            self.logger.log_it("Fail to download url:{} data:{} ErrInfo:{}".format(aio_request.url, aio_request.data,
                                                                                   traceback.format_exc()))
            response = Response(status=self._STATUS_REQUEST_FAILED, body=str(e), )
        finally:
            # Give the slot taken in start_crawler back on every exit path,
            # including the early return for an unsupported method.
            self.count -= 1

        return response

    async def fail_callback(self, task: dict, aio_request: Request):
        """Re-queue a failed task while it still has retries left.

        Reads ``task['meta']['retry']`` and ``task['meta']['retry_wait']``
        and tracks the attempts made so far in ``task['retried']``.
        """
        retry = task['meta']['retry']
        retried = task.get('retried', 0)
        if retry != 0 and retried < retry:
            task.update({'retried': retried + 1})
            self.logger.log_it("Retry url:{} body:{} retried:{}".format(aio_request.url, aio_request.data, retried))
            # retry wait. The second positional parameter of asyncio.sleep is
            # ``result``, not the loop, so only the delay is passed.
            await asyncio.sleep(task['meta']['retry_wait'])
            await push_task(self.scheduler_downloader_queue, task, self.loop)

    async def success_callback(self, task: dict, response: Response):
        """Attach the response to the task and push it to the parser queue."""
        task.update({'response': response})
        await push_task(self.downloader_parser_queue, task, self.loop)

    async def request(self, aio_request: Request, task: dict):
        """request,update the task and put it in the queue"""
        response = await self._request(aio_request, self.loop)
        # TODO:99999 means catch exception during request(or we should uniform the status code and write a doc)
        status = response['status']
        if status == self._STATUS_IGNORED:
            # -1 means ignore this status. It has to be tested first: it is
            # also "not 99999" and would otherwise count as a success.
            return
        if status == self._STATUS_REQUEST_FAILED:
            # fail
            await self.fail_callback(task, aio_request)
        else:
            # success
            await self.success_callback(task, response)

    async def start_crawler(self, connector):
        """get item from queue and crawl it & push it to queue at last"""
        task = await get_task(self.scheduler_downloader_queue)
        if task is not None:
            self.count += 1
            aio_request = task['request']
            self.loop.create_task(self.request(aio_request=aio_request, task=task))

            # The limit of concurrent request
            while self.count > self.conn_limit:
                await asyncio.sleep(0.5, loop=self.loop)

            self.loop.create_task(self.start_crawler(connector))
        else:
            # If the queue is empty,wait and try again.
            await asyncio.sleep(catty.config.LOAD_QUEUE_INTERVAL, loop=self.loop)
            self.loop.create_task(self.start_crawler(connector))

    def run(self):
        """Schedule the crawl loop and run the event loop until interrupted."""
        try:
            crawler = partial(self.start_crawler, connector=aiohttp.TCPConnector(verify_ssl=False))
            self.loop.create_task(crawler())
            self.loop.run_forever()
        except KeyboardInterrupt:
            self.logger.log_it("Bye!", level='INFO')
Ejemplo n.º 11
0
class Parser(HandlerMixin):
    """Run spider parser callbacks on downloaded tasks and route the results.

    Tasks are read from ``downloader_parser_queue``. Parsed tasks are pushed to
    ``parser_scheduler_queue``; task lists returned by a parser and retry tasks
    are pushed to ``scheduler_downloader_queue``.
    """

    def __init__(self, downloader_parser_queue: AsyncRedisPriorityQueue,
                 parser_scheduler_queue: AsyncRedisPriorityQueue,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 loop: BaseEventLoop, name: str):
        """
        :param downloader_parser_queue:The redis queue
        :param parser_scheduler_queue:The redis queue
        :param scheduler_downloader_queue:The redis queue
        :param loop:EventLoop
        :param name:Name of this parser, used in the persistence file names
        """
        super(Parser, self).__init__()
        self.name = name

        self.downloader_parser_queue = downloader_parser_queue
        self.parser_scheduler_queue = parser_scheduler_queue
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.loop = loop

        self.spider_started = set()
        self.spider_stopped = set()
        self.spider_paused = set()

        self.all_spider_set = [
            self.spider_started, self.spider_stopped, self.spider_paused
        ]

        self.spider_module_handle = SpiderModuleHandle(
            catty.config.SPIDER_PATH)
        self.spider_module_handle.load_all_spider()

        self.logger = Log('Parser')
        # Set by check_end(); quit() polls it before exiting the process.
        self.ready_to_exit = False
        self.counter = Counter(loop)

    async def load_tasks(self, spider_name: str, which_q: str = ''):
        """Load the persisted tasks of a spider and push them to a queue.

        :param spider_name: Spider whose persisted tasks are loaded.
        :param which_q: PARSER_SCHEDULER or DOWNLOADER_PARSER; selects both the
            dump name and the queue the tasks are pushed to. Any other value
            loads the tasks but pushes nothing.
        """
        tasks = await load_task(catty.config.PERSISTENCE['DUMP_PATH'],
                                '{}_{}'.format(self.name, which_q),
                                spider_name)
        if not tasks:
            return

        self.logger.log_it("[load_tasks]Load tasks:{}".format(tasks))
        for each_task in tasks:
            if which_q == PARSER_SCHEDULER:
                await push_task(self.parser_scheduler_queue, each_task,
                                self.loop)
            elif which_q == DOWNLOADER_PARSER:
                await push_task(self.downloader_parser_queue, each_task,
                                self.loop)

    async def dump_tasks(self, which_q: str):
        """Drain one queue and persist every task taken from it.

        :param which_q: PARSER_SCHEDULER or DOWNLOADER_PARSER; any other value
            is a no-op.
        """
        if which_q == PARSER_SCHEDULER:
            queue = self.parser_scheduler_queue
        elif which_q == DOWNLOADER_PARSER:
            queue = self.downloader_parser_queue
        else:
            return

        while await queue.qsize():
            task = await get_task(queue)
            if task is not None:
                await dump_task(task,
                                catty.config.PERSISTENCE['DUMP_PATH'],
                                "{}_{}".format(self.name, which_q),
                                task['spider_name'])
                self.logger.log_it("[dump_task]Dump task:{}".format(task))

    def dump_count(self):
        """Persist the counter's ``value_d`` and ``cache_value``."""
        counter_date = {
            'value_d': self.counter.value_d,
            'cache_value': self.counter.cache_value
        }
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')
        dump_pickle_data(root, '{}_counter'.format(self.name), counter_date)
        self.logger.log_it("[dump_count]{}".format(counter_date))

    def load_count(self):
        """Restore the counter values written by ``dump_count``."""
        # NOTE(review): assumes load_pickle_data returns the dumped dict;
        # behaviour when no dump exists is not visible here -- confirm.
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')
        counter_date = load_pickle_data(root, '{}_counter'.format(self.name))
        self.counter.value_d = counter_date['value_d']
        self.counter.cache_value = counter_date['cache_value']
        self.logger.log_it("[load_count]{}".format(counter_date))

    def dump_status(self):
        """Persist the started/paused/stopped spider sets."""
        status = {
            'spider_started': self.spider_started,
            'spider_paused': self.spider_paused,
            'spider_stopped': self.spider_stopped
        }
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')
        dump_pickle_data(root, '{}_status'.format(self.name), status)
        self.logger.log_it("[dump_status]{}".format(status))

    def load_status(self):
        """Restore the spider sets written by ``dump_status``."""
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'parser')

        status = load_pickle_data(root, '{}_status'.format(self.name))
        self.spider_paused = status['spider_paused']
        self.spider_stopped = status['spider_stopped']
        self.spider_started = status['spider_started']
        # The sets were rebound above, so the list must be rebuilt too.
        self.all_spider_set = [
            self.spider_started, self.spider_stopped, self.spider_paused
        ]
        self.logger.log_it("[load_status]{}".format(status))

    def on_begin(self):
        """Run before begin to do something like load tasks,or load config"""
        self.load_status()
        self.load_count()
        for started_spider in self.spider_started:
            # load_tasks takes (spider_name, which_q), in that order.
            self.loop.create_task(
                self.load_tasks(started_spider, DOWNLOADER_PARSER))
            self.loop.create_task(
                self.load_tasks(started_spider, PARSER_SCHEDULER))

    async def check_end(self):
        """Poll both queues until they are empty, then set ``ready_to_exit``.

        A queue counts as empty when its size is 0 or None. Each empty queue
        costs a 0.5 second sleep per round.
        """
        while True:
            psq = await self.parser_scheduler_queue.qsize()
            dpq = await self.downloader_parser_queue.qsize()

            drained = True
            for size in (psq, dpq):
                if size == 0 or size is None:
                    # asyncio.sleep's second positional parameter is the
                    # result value, not the loop, so only the delay is passed.
                    await asyncio.sleep(0.5)
                else:
                    drained = False

            if drained:
                break

        self.ready_to_exit = True

    async def on_end(self):
        """Run when exiting: persist queues, counter and status, then start check_end()."""
        if catty.config.PERSISTENCE['PERSIST_BEFORE_EXIT']:
            self.loop.create_task(self.dump_tasks(PARSER_SCHEDULER))
            self.loop.create_task(self.dump_tasks(DOWNLOADER_PARSER))

        self.dump_count()
        self.dump_status()

        self.loop.create_task(self.check_end())

    def get_spider_method(self, spider_name: str, method_name: str):
        """Look up a bound method on an instantiated spider.

        :param spider_name: Name of the spider instance.
        :param method_name: Name of the attribute to fetch from the instance.
        :return: ``(method, spider_instance)``, or None when no instance is
            registered under ``spider_name``.
        :raises AttributeError: if the spider has no such attribute.
        """
        try:
            # get the instantiation spider from dict
            spider_ins = self.spider_module_handle.spider_instantiation[
                spider_name]
        except (KeyError, IndexError):
            # A lookup by name fails with KeyError; IndexError is kept for
            # backward compatibility with the previous handler.
            self.logger.log_it(
                "[_run_ins_func]No this Spider or had not instance yet.",
                'WARN')
            return None

        # get the spider's method from name
        method = getattr(spider_ins, method_name)

        return method, spider_ins

    async def _run_ins_func(self, spider_name: str, method_name: str,
                            task: dict):
        """Run one spider parser method on a task and push the outcome.

        The method is awaited when it declares a ``loop`` parameter and called
        synchronously otherwise; ``task`` is passed only when declared.

        * dict result: stored as the task's parser item unless it contains
          'tid', then the task is pushed to the parser-scheduler queue.
        * list result: every element is pushed to the scheduler-downloader
          queue.
        * ``Retry_current_task``: the task is pushed to the parser-scheduler
          queue again.
        * any other exception: the traceback is printed and the task dropped.
        """
        self.logger.log_it("[_run_ins_func]Parser the {}.{}".format(
            spider_name, method_name))
        _response = task['response']

        found = self.get_spider_method(spider_name, method_name)
        if found is None:
            # Already logged by get_spider_method; nothing to run.
            return
        method, _ = found

        # get the method's params
        _signature = inspect.signature(method).parameters
        extra_kwargs = {'task': task} if 'task' in _signature else {}

        try:
            # the async method define by user must have a loop param even never use it.
            if 'loop' in _signature:
                parser_return = await method(_response,
                                             loop=self.loop,
                                             **extra_kwargs)
            else:
                parser_return = method(_response, **extra_kwargs)
        except Retry_current_task:
            # handle it like a new task
            self.loop.create_task(
                push_task(self.parser_scheduler_queue, task, self.loop))
            return
        except Exception:
            # The except from user spiders. ``Exception`` instead of a bare
            # except so cancellation and interpreter exit are not swallowed.
            traceback.print_exc()
            return

        if isinstance(parser_return, dict):
            if 'tid' not in parser_return:
                # not a task
                task['parser'].update({'item': parser_return})
            await push_task(self.parser_scheduler_queue, task, self.loop)
        elif isinstance(parser_return, list):
            # task_list
            for each_return_task in parser_return:
                await push_task(self.scheduler_downloader_queue,
                                each_return_task, self.loop)

    async def _handle_success(self, task: dict):
        """Dispatch a successfully downloaded task according to its spider's state."""
        spider_name = task['spider_name']
        if spider_name in self.spider_started:
            for callback_method_name in task['callback']:
                parser_method_name = callback_method_name.get('parser', None)
                if not parser_method_name:
                    continue
                # number of task that parser return depend on the number of callbacks
                self.loop.create_task(
                    self._run_ins_func(spider_name, parser_method_name,
                                       deepcopy(task)))
        elif spider_name in self.spider_paused:
            # persist
            await dump_task(task, catty.config.PERSISTENCE['DUMP_PATH'],
                            'parser', spider_name)
        # tasks of stopped (or unknown) spiders are dropped

    async def _handle_failure(self, task: dict):
        """Re-queue a failed task through the spider's ``retry`` method if retries remain."""
        retry = task['meta']['retry']
        retried = task.get('retried', 0)
        if retry == 0 or retried >= retry:
            return

        task.update({'retried': retried + 1})
        found = self.get_spider_method(task['spider_name'], 'retry')
        if found is None:
            return
        retry_method, _ = found

        # it could be return a list
        retry_tasks = retry_method(task)
        if not isinstance(retry_tasks, list):
            retry_tasks = [retry_tasks]
        for each_retry_task in retry_tasks:
            await asyncio.sleep(task['meta']['retry_wait'])
            await push_task(self.scheduler_downloader_queue,
                            each_retry_task, self.loop)

    async def make_tasks(self):
        """Take one downloaded task, process it and keep the consumer chain alive.

        A response counts as a success when its status is in [200, 400) and in
        the task's ``handle_status_code``; otherwise the retry path is taken
        and the failure is counted. Responses without a 'status' are ignored.
        """
        task = await get_task(self.downloader_parser_queue)
        if not task:
            # if no task in downloader_parser queue,wait it
            self.loop.call_later(
                catty.config.LOAD_QUEUE_INTERVAL,
                lambda: self.loop.create_task(self.make_tasks()))
            return

        # Schedule the next consumer first so a slow task does not stall the queue.
        self.loop.create_task(self.make_tasks())

        response = task['response']
        if 'status' not in response:
            return

        status = response['status']
        if 200 <= status < 400 and status in task['handle_status_code']:
            self.counter.add_success(task['spider_name'])
            await self._handle_success(task)
        else:
            await self._handle_failure(task)
            self.counter.add_fail(task['spider_name'])

    def quit(self):
        """Run on_end() on the event loop, wait for ``ready_to_exit`` and exit the process."""
        self.logger.log_it("[Ending]Doing the last thing...", level='INFO')
        # quit() runs outside the event-loop thread (see run()), and
        # loop.create_task is not thread-safe, so hand the coroutine over
        # with the thread-safe API.
        asyncio.run_coroutine_threadsafe(self.on_end(), self.loop)
        while not self.ready_to_exit:
            time.sleep(1)
        self.logger.log_it("Bye!", level='INFO')
        os._exit(0)

    def run_parser(self):
        """Schedule the make_tasks consumers and the counter update, then run the loop."""
        for _ in range(catty.config.NUM_OF_PARSER_MAKE_TASK):
            self.loop.create_task(self.make_tasks())
        self.loop.create_task(self.counter.update())
        self.loop.run_forever()

    def run(self):
        """Start the parser and handler-server threads, then wait for 'Q' on stdin."""
        try:
            self.on_begin()
            xmlrpc_partial_func = partial(self.xmlrpc_run, name=self.name)
            handler_server_thread = threading.Thread(
                target=xmlrpc_partial_func)
            parser_thread = threading.Thread(target=self.run_parser)
            parser_thread.start()
            handler_server_thread.start()

            # KeyboardInterrupt cannot be caught reliably on Windows, so a
            # 'Q' typed on stdin is accepted as the quit command as well.
            while True:
                r = input()
                if r == 'Q':
                    self.quit()
        except KeyboardInterrupt:
            self.quit()
Ejemplo n.º 12
0
class Selector:
    """Move tasks from the per-spider requests queues to the downloader queue at each spider's speed."""

    def __init__(self, spider_speed: dict,
                 scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 requests_queue: dict, spider_stopped: set, spider_paused: set,
                 spider_started: set, spider_ready_start: set,
                 spider_todo: set, loop: asyncio.BaseEventLoop):
        """
        :param spider_speed: Mapping of spider name to its speed
        :param scheduler_downloader_queue: Queue the selected tasks are pushed to
        :param requests_queue: Mapping of "<spider>:requests" to its queue object
        :param spider_stopped: Set of stopped spider names
        :param spider_paused: Set of paused spider names
        :param spider_started: Set of started spider names
        :param spider_ready_start: Set of spider names about to start
        :param spider_todo: Set of spider names not run yet
        :param loop: EventLoop
        """
        self.logger = Log('Selector')
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.requests_queue = requests_queue
        self.loop = loop

        # whole seconds elapsed since this selector was created
        self.running_time = 0
        self.run_at = int(time.time())

        self.spider_stopped = spider_stopped
        self.spider_paused = spider_paused
        self.spider_started = spider_started
        self.spider_ready_start = spider_ready_start
        self.spider_todo = spider_todo
        self.spider_speed = spider_speed
        self.spider_speed_reciprocal = {}

        self.init_speed()

    def init_speed(self):
        """Rebuild ``spider_speed_reciprocal`` as ceil(1 / speed) for every spider."""
        self.spider_speed_reciprocal = {
            name: math.ceil(1 / speed)
            for name, speed in self.spider_speed.items()
        }

    def update_speed(self, spider_name: str, speed: int):
        """Set one spider's speed and its ceil(1 / speed) reciprocal."""
        self.spider_speed[spider_name] = speed
        self.spider_speed_reciprocal[spider_name] = math.ceil(1 / speed)

    async def _select_task(self, requests_q, spider_name):
        """Take one task from ``requests_q`` and route it by its spider's state.

        Started: pushed to the scheduler-downloader queue. Paused: persisted.
        Stopped (or unknown): dropped.
        """
        task = await get_task(requests_q)
        if not task:
            return

        task_spider = task['spider_name']
        if task_spider in self.spider_started:
            await push_task(self.scheduler_downloader_queue, task, self.loop)
            self.logger.log_it('[select_task]{} tid:{}'.format(
                spider_name, task['tid']))
        elif task_spider in self.spider_paused:
            # dump_task is awaited at its other call sites in this file;
            # without the await the coroutine is never run and the task is lost.
            await dump_task(task, catty.config.PERSISTENCE['DUMP_PATH'],
                            'scheduler', task_spider)

    async def select_task(self):
        """Schedule the selections due since the last tick, sleep, then reschedule itself.

        For every elapsed whole second that is a multiple of a spider's
        reciprocal, ``speed`` selections are scheduled when speed > 1 and one
        selection otherwise.
        """
        # TODO: time granularity
        last_running_time = self.running_time
        self.running_time = int(time.time()) - self.run_at

        for spider_name in self.spider_started:
            speed_reciprocal = self.spider_speed_reciprocal[spider_name]

            # Reuse the cached queue; only build a new one on first use.
            queue_key = "{}:requests".format(spider_name)
            if queue_key not in self.requests_queue:
                self.requests_queue[queue_key] = AsyncRedisPriorityQueue(
                    queue_key, loop=self.loop)
            requests_q = self.requests_queue[queue_key]

            for each_diff_time in range(last_running_time, self.running_time):
                if each_diff_time % speed_reciprocal != 0:
                    continue
                # time's up: a speed above 1 means several requests per second
                speed = self.spider_speed[spider_name]
                batch = speed if speed > 1 else 1
                for _ in range(batch):
                    self.loop.create_task(
                        self._select_task(requests_q, spider_name))

        # No ``loop=`` argument: it was removed from asyncio.sleep in
        # Python 3.10; the running loop is used implicitly.
        await asyncio.sleep(catty.config.SELECTOR_INTERVAL)
        self.loop.create_task(self.select_task())
Ejemplo n.º 13
0
class Scheduler(HandlerMixin):
    """Turn spider callbacks into request tasks and feed them to per-spider requests queues.

    Finished tasks are read from ``parser_scheduler_queue``; a ``Selector``
    moves queued requests to ``scheduler_downloader_queue``.
    """

    def __init__(self, scheduler_downloader_queue: AsyncRedisPriorityQueue,
                 parser_scheduler_queue: AsyncRedisPriorityQueue,
                 loop: asyncio.BaseEventLoop, name: str):
        """
        :param scheduler_downloader_queue:The redis queue
        :param parser_scheduler_queue:The redis queue
        :param loop:EventLoop
        :param name:Name of this scheduler, used in the persistence file names
        """
        super().__init__()
        self.name = name
        self.scheduler_downloader_queue = scheduler_downloader_queue
        self.parser_scheduler_queue = parser_scheduler_queue
        # connection of all requests-queue, keyed by "<spider>:requests"
        self.requests_queue_conn = {}
        # dupe filters, keyed by spider name
        self.bloom_filter = {}
        self.loop = loop

        # spider_ready_start: means that you start a spider,it will run from begin.
        # spider_todo: spiders that had not run yet.
        self.spider_stopped = set()
        self.spider_paused = set()
        self.spider_started = set()
        self.spider_ready_start = set()
        self.spider_todo = set()

        # One entry per state; previously spider_started was listed twice and
        # spider_stopped was missing.
        self.all_spider_set = [
            self.spider_todo, self.spider_paused, self.spider_started,
            self.spider_ready_start, self.spider_stopped
        ]

        self.spider_module_handle = SpiderModuleHandle(
            catty.config.SPIDER_PATH)

        self.logger = Log('Scheduler')
        # Set when shutdown work is finished; quit() polls it.
        self.done_all_things = False
        self.selector = None

    def _all_spider_names(self) -> set:
        """Return a snapshot (new set) of every spider name in ``all_spider_set``."""
        return set().union(*self.all_spider_set)

    def _get_requests_queue(self, spider_name: str):
        """Return the cached requests queue of a spider, creating it on first use."""
        key = "{}:requests".format(spider_name)
        if key not in self.requests_queue_conn:
            self.requests_queue_conn[key] = AsyncRedisPriorityQueue(
                key, loop=self.loop)
        return self.requests_queue_conn[key]

    def instantiate_spider(self):
        """instantiate all spider"""
        self.spider_module_handle.load_all_spider()

    def init_spider_set(self):
        """Add every loaded spider that is in no other state set to ``spider_todo``."""
        known = (self.spider_paused | self.spider_ready_start |
                 self.spider_started | self.spider_stopped)
        namespace = self.spider_module_handle.namespace
        for module_key in namespace:
            try:
                # find Spider.name
                spider_name = getattr(namespace[module_key][1], 'Spider').name
            except AttributeError:
                self.logger.log_it(
                    "[instantiate_spider]Cant get spider's name.SpiderFile:{}".
                    format(module_key), 'WARN')
                continue

            if spider_name not in known:
                self.spider_todo.add(spider_name)

    def instantiate_selector(self):
        """Create the Selector with each known spider's speed (or the configured default)."""
        instances = self.spider_module_handle.spider_instantiation
        speeds = {
            spider_name: get_default(
                obj=instances[spider_name],
                name_or_index='speed',
                default=catty.config.SPIDER_DEFAULT['SPEED'])
            for spider_name in self._all_spider_names()
        }
        self.selector = Selector(
            speeds, self.scheduler_downloader_queue, self.requests_queue_conn,
            self.spider_stopped, self.spider_paused, self.spider_started,
            self.spider_ready_start, self.spider_todo, self.loop)

    async def load_tasks(self, spider_name: str, which_q: str = ''):
        """Load a spider's persisted tasks and push them to the scheduler-downloader queue.

        :param spider_name: Spider whose persisted tasks are loaded.
        :param which_q: Unused; kept for interface compatibility.
        """
        tasks = await load_task(catty.config.PERSISTENCE['DUMP_PATH'],
                                'request_queue_{}'.format(self.name),
                                spider_name)
        if not tasks:
            return

        self.logger.log_it("[load_tasks]Load tasks:{}".format(tasks))
        for each_task in tasks:
            await push_task(self.scheduler_downloader_queue, each_task,
                            self.loop)

    async def dump_tasks(self, spider_name: str):
        """Drain a spider's requests queue and persist every task taken from it."""
        request_q = self._get_requests_queue(spider_name)

        if not request_q.redis_conn:
            await request_q.conn()
        while await request_q.qsize():
            task = await get_task(request_q)
            if task is not None:
                await dump_task(task, catty.config.PERSISTENCE['DUMP_PATH'],
                                'request_queue_{}'.format(self.name),
                                task['spider_name'])
                self.logger.log_it("[dump_task]Dump task:{}".format(task))

    async def dump_all_paused_task(self):
        """Dump the task which spider was paused."""
        # Iterate a snapshot: the set may change while dump_tasks is awaited.
        for paused_spider_name in list(self.spider_paused):
            await self.dump_tasks(paused_spider_name)

    def dump_speed(self):
        """Persist the selector's spider speeds."""
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'scheduler')
        dump_pickle_data(root, 'speed_{}'.format(self.name),
                         self.selector.spider_speed)
        self.logger.log_it("[dump_speed]{}".format(self.selector.spider_speed))

    def load_speed(self):
        """Restore the spider speeds written by ``dump_speed``."""
        # NOTE(review): assumes load_pickle_data returns the dumped dict;
        # behaviour when no dump exists is not visible here -- confirm.
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'scheduler')
        # Same file name as dump_speed ('speed_<name>').
        spider_speed = load_pickle_data(root, 'speed_{}'.format(self.name))
        self.selector.spider_speed = spider_speed
        self.selector.init_speed()
        self.logger.log_it("[load_speed]{}".format(spider_speed))

    def dump_status(self):
        """Persist all five spider state sets."""
        status = {
            'spider_started': self.spider_started,
            'spider_paused': self.spider_paused,
            'spider_stopped': self.spider_stopped,
            'spider_todo': self.spider_todo,
            'spider_ready_start': self.spider_ready_start,
        }

        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'scheduler')
        dump_pickle_data(root, 'status_{}'.format(self.name), status)
        self.logger.log_it("[dump_status]{}".format(status))

    def load_status(self):
        """Restore the spider state sets written by ``dump_status``."""
        root = os.path.join(catty.config.PERSISTENCE['DUMP_PATH'], 'scheduler')

        # Same file name as dump_status ('status_<name>').
        status = load_pickle_data(root, 'status_{}'.format(self.name))
        self.spider_paused = status['spider_paused']
        self.spider_stopped = status['spider_stopped']
        self.spider_started = status['spider_started']
        self.spider_todo = status['spider_todo']
        self.spider_ready_start = status['spider_ready_start']
        # The sets were rebound above, so the list must be rebuilt too.
        self.all_spider_set = [
            self.spider_todo, self.spider_paused, self.spider_started,
            self.spider_ready_start, self.spider_stopped
        ]
        self.logger.log_it("[load_status]{}".format(status))

    async def clean_requests_queue(self, spider_name: str):
        """Clean the spider's requests queue"""
        request_q = self._get_requests_queue(spider_name)
        if not request_q.redis_conn:
            await request_q.conn()
        await request_q.clear()
        self.logger.log_it(
            "[clean_requests_queue]Clean spider {}'s requests queue".format(
                spider_name))

    async def get_requests_queue_size(self, spider_name: str):
        """Return the size reported by the spider's requests queue."""
        request_q = self._get_requests_queue(spider_name)
        if not request_q.redis_conn:
            await request_q.conn()
        return await request_q.qsize()

    async def clean_dupefilter(self, spider_name: str):
        """Clean the spider's dupe filter and forget it; log when none is registered."""
        bloomfilter = self.bloom_filter.get(spider_name)
        if bloomfilter is None:
            self.logger.log_it(
                '[clean_dupefilter]Cant find bloomfilter.Spidername:{}'.format(
                    spider_name))
            return

        await bloomfilter.conn()
        await bloomfilter.clean()
        self.bloom_filter.pop(spider_name, None)
        self.logger.log_it(
            '[clean_dupefilter]Clean bloomfilter:{}'.format(spider_name))

    def on_begin(self):
        """Run before begin to do something like load tasks,or load config"""
        self.load_status()
        for started_spider in self.spider_started:
            # load every started_spider's requests;
            # load_tasks takes (spider_name, which_q), in that order.
            self.loop.create_task(
                self.load_tasks(started_spider, SCHEDULER_DOWNLOADER))
        self.instantiate_spider()
        self.init_spider_set()
        self.instantiate_selector()
        self.load_speed()

    async def check_end(self):
        """Poll every spider's requests queue until all are empty, then set ``done_all_things``.

        A queue counts as empty when its size is 0 or None.
        """
        while True:
            pending = False
            # Snapshot of the names: the sets may change while awaiting.
            for spider_name in self._all_spider_names():
                q = await self.get_requests_queue_size(spider_name)
                if not (q == 0 or q is None):
                    pending = True

            # Sleep every round, including the last one: this paces the
            # polling and always yields to the loop, even with no spiders.
            await asyncio.sleep(0.5)
            if not pending:
                break

        self.done_all_things = True

    async def on_end(self):
        """Run when exiting: persist speed and status, dump the queues if configured.

        ``done_all_things`` is set by check_end() when persisting, or
        immediately otherwise.
        """
        self.dump_speed()
        self.dump_status()

        if catty.config.PERSISTENCE['PERSIST_BEFORE_EXIT']:
            for spider_name in self._all_spider_names():
                self.loop.create_task(self.dump_tasks(spider_name))
            # Exactly one watcher, scheduled even when there are no spiders,
            # so quit() can always finish.
            self.loop.create_task(self.check_end())
        else:
            self.done_all_things = True

    async def push_requests(self, task, spider_ins, spider_name):
        """Filter request & push it in requests queue

        When the task's meta enables the dupe filter, a task whose tid is
        already in the spider's bloom filter is logged and dropped.
        """
        is_new = True
        if task['meta']['dupe_filter']:
            filter_key = task['spider_name']
            if filter_key not in self.bloom_filter:
                # Build the filter only on first use for this spider.
                seeds = get_default(spider_ins, 'seeds',
                                    catty.config.SPIDER_DEFAULT['SEEDS'])
                blocknum = get_default(
                    spider_ins, 'blocknum',
                    catty.config.SPIDER_DEFAULT['BLOCKNUM'])
                self.bloom_filter[filter_key] = RedisBloomFilter(
                    self.loop, filter_key + ':DupeFilter', seeds,
                    blockNum=blocknum)
            bloom_filter = self.bloom_filter[filter_key]

            if not bloom_filter.redis_conn:
                await bloom_filter.conn()

            if not await bloom_filter.is_contain(task['tid']):
                await bloom_filter.add(task['tid'])
            else:
                self.logger.log_it(
                    "[run_ins_func]Filtered tid:{} url:{} data:{} params:{}".
                    format(task['tid'], task['request'].url,
                           task['request'].data, task['request'].params),
                    level='INFO')
                is_new = False

        if is_new:
            self.logger.log_it(
                "[run_ins_func]New request tid:{} url:{} data:{} params:{}".
                format(task['tid'], task['request'].url, task['request'].data,
                       task['request'].params),
                level='INFO')
            request_q = self._get_requests_queue(spider_name)
            await push_task(request_q, task, self.loop)

    def get_spider_method(self, spider_name: str, method_name: str):
        """Look up a bound method on an instantiated spider.

        :param spider_name: Name of the spider instance.
        :param method_name: Name of the attribute to fetch from the instance.
        :return: ``(method, spider_instance)``, or None when no instance is
            registered under ``spider_name`` (a reload of the spider is then
            requested).
        :raises AttributeError: if the spider has no such attribute.
        """
        try:
            # get the instantiation spider from dict
            spider_ins = self.spider_module_handle.spider_instantiation[
                spider_name]
        except (KeyError, IndexError):
            # A lookup by name fails with KeyError; IndexError is kept for
            # backward compatibility with the previous handler.
            self.logger.log_it(
                "[_run_ins_func]No this Spider or had not instance yet.",
                'WARN')
            # try to reload it
            self.spider_module_handle.update_spider(spider_name)
            return None

        # get the spider's method from name
        method = getattr(spider_ins, method_name)

        return method, spider_ins

    async def _run_ins_func(self,
                            spider_name: str,
                            method_name: str,
                            task: dict = None):
        """Run a spider method and push every Task it returns to the requests queue.

        :param spider_name: Spider owning the method.
        :param method_name: Method to call; called with ``task=task`` when a
            task is given, without arguments otherwise ("start" etc.).
        :param task: The finished task that triggered this call, if any.
        """
        found = self.get_spider_method(spider_name, method_name)
        if found is None:
            # Already logged by get_spider_method; nothing to run.
            return
        method, spider_ins = found

        try:
            func_return_task = method(task=task) if task else method()
        except Exception:
            # The except from user spiders. ``Exception`` instead of a bare
            # except so cancellation and interpreter exit are not swallowed.
            traceback.print_exc()
            return

        if not isinstance(func_return_task, list):
            func_return_task = [func_return_task]

        # return how many request mean it make how many task
        for each_task in func_return_task:
            if not isinstance(each_task, Task):
                self.logger.log_it(
                    "[run_ins_func]Not return a Task in {}".format(
                        spider_name), 'WARN')
                continue
            self.loop.create_task(
                self.push_requests(each_task, spider_ins, spider_name))

    async def make_tasks(self):
        """run the ready_start spider & run the done task & push them"""
        # start the "ready_start" spiders
        ready = set(self.spider_ready_start)
        for spider_name in ready:
            self.logger.log_it(
                '[make_tasks]Starting spider:{}'.format(spider_name), 'INFO')
            self.loop.create_task(self._run_ins_func(spider_name, 'start'))
            self.spider_started.add(spider_name)

        # In-place update: the Selector holds a reference to this same set.
        self.spider_ready_start -= ready

        # from done task
        task = await get_task(self.parser_scheduler_queue)
        if not task:
            self.loop.call_later(
                catty.config.LOAD_QUEUE_INTERVAL,
                lambda: self.loop.create_task(self.make_tasks()))
            return

        self.loop.create_task(self.make_tasks())
        spider_name = task['spider_name']

        if spider_name in self.spider_started:
            for callback_method_name in task['callback']:
                fetcher_method_name = callback_method_name.get(
                    'fetcher', None)
                if not fetcher_method_name:
                    continue

                if not isinstance(fetcher_method_name, list):
                    fetcher_method_name = [fetcher_method_name]

                # a task can have many fetcher callbacks
                for each_fetcher_method_name in fetcher_method_name:
                    # make a new task,if use need to save the data from last task(meta etc..),must handle it.
                    self.logger.log_it(
                        '[make_tasks]{}.{} making task'.format(
                            spider_name, each_fetcher_method_name))
                    self.loop.create_task(
                        self._run_ins_func(spider_name,
                                           each_fetcher_method_name, task))

        elif spider_name in self.spider_paused:
            # persist; dump_task is awaited at its other call sites in this
            # file -- without the await the coroutine never runs.
            await dump_task(task, catty.config.PERSISTENCE['DUMP_PATH'],
                            'scheduler', spider_name)
            self.loop.create_task(self.dump_tasks(spider_name))
        # tasks of stopped / todo (or unknown) spiders are dropped

    def quit(self):
        """Run on_end() on the event loop, wait for ``done_all_things`` and exit the process."""
        self.logger.log_it("[Ending]Doing the last thing...")
        # quit() runs outside the event-loop thread (see run()), and
        # loop.create_task is not thread-safe, so hand the coroutine over
        # with the thread-safe API.
        asyncio.run_coroutine_threadsafe(self.on_end(), self.loop)
        while not self.done_all_things:
            time.sleep(1)
        self.logger.log_it("Bye!")
        os._exit(0)

    def run_scheduler(self):
        """Schedule the selector and the make_tasks consumers, then run the loop."""
        self.loop.create_task(self.selector.select_task())
        for _ in range(catty.config.NUM_OF_SCHEDULER_MAKE_TASK):
            self.loop.create_task(self.make_tasks())
        self.loop.run_forever()

    def run(self):
        """Start the handler-server and scheduler threads, then wait for 'Q' on stdin."""
        try:
            self.on_begin()
            xmlrpc_partial_func = partial(self.xmlrpc_run, name=self.name)
            handler_server_thread = threading.Thread(
                target=xmlrpc_partial_func)
            handler_server_thread.start()
            scheduler_thread = threading.Thread(target=self.run_scheduler)
            scheduler_thread.start()
            while True:
                r = input()
                if r == 'Q':
                    self.quit()
        except KeyboardInterrupt:
            self.quit()