Beispiel #1
0
    def __init__(self, **kwargs):
        '''Create the spider's helper objects and install the task generator.

        All keyword arguments are forwarded unchanged to ``Dispatcher``,
        ``Tasks`` and ``CacheExtension``; ``SpiderStatistics`` is created
        without arguments.
        '''
        # Helper objects are built in a fixed order: statistics first, then
        # the three components configured from the same kwargs.
        self.statistics = SpiderStatistics()
        self.dispatcher = Dispatcher(**kwargs)
        self.tasks = Tasks(**kwargs)
        self.cache_extension = CacheExtension(**kwargs)

        # Hand the generator produced by tasks_generator() over to
        # restart_tasks_generator().
        initial_generator = self.tasks_generator()
        self.restart_tasks_generator(generator=initial_generator)
Beispiel #2
0
class MultiFetcher(SpiderBase):
    '''Manager of asynchronous work.

    Tasks are taken from ``self.tasks``, handed to ``self.dispatcher`` and,
    once finished, routed to their handlers.  ``self.cache_extension`` may
    answer a task before it is dispatched and may store finished tasks.
    '''

    name = None

    def __init__(self, **kwargs):
        '''Create the helper objects and install the task generator.

        All keyword arguments are forwarded unchanged to ``Dispatcher``,
        ``Tasks`` and ``CacheExtension``.
        '''
        self.statistics = SpiderStatistics()

        self.dispatcher = Dispatcher(**kwargs)
        self.tasks = Tasks(**kwargs)
        self.cache_extension = CacheExtension(**kwargs)

        self.restart_tasks_generator(generator=self.tasks_generator())

    def start(self):
        '''Start the manager and run its main loop.

        The loop ends when ``stop()`` has been called, when
        ``self.dispatcher.is_empty()`` is true after the fill step, or on
        ``KeyboardInterrupt`` (which is swallowed).  After the loop the
        statistics are put on hold and ``on_stop()`` is called.
        '''
        self.statistics.reset()

        self._process_for_items(self.on_start())

        try:
            # NOTE(review): the flag is reset only after on_start() has run,
            # so a stop() issued from on_start() is discarded -- confirm
            # that this is intended.
            self._should_stop = False

            self._process_for_items(self._process_tasks_generator, limit=True)

            while not self._should_stop:
                # Fill step: move queued tasks to the dispatcher until it is
                # full or the queue is empty.
                while not self.dispatcher.is_full() and not self.tasks.empty():
                    _, task = self.tasks.get_task()
                    # A task answered by the cache extension is not
                    # dispatched.
                    if task and not self.cache_process(task):
                        self.dispatcher.process_task(task)

                if self.dispatcher.is_empty():
                    break

                self.dispatcher.wait_available()

                for finished_task, error in self.dispatcher.finished_tasks():
                    self.cache_store(finished_task, error)
                    self._process_finished_task(finished_task, error)

                # Refill the queue from the task generator.
                self._process_for_items(self._process_tasks_generator, limit=True)

                self.on_loop()

        except KeyboardInterrupt:
            # Deliberate: Ctrl+C ends the loop and falls through to the
            # shutdown steps below.
            pass

        self.statistics.hold()

        self.on_stop()

    def stop(self):
        '''Ask the main loop in ``start()`` to stop.

        The flag is checked at the top of each loop iteration.
        '''
        self._should_stop = True

    def cache_process(self, task):
        '''Return True if *task* was handled by the cache extension.

        When it was, the restored task is passed to the finished-task
        handler and the task generator is polled.  Returns False otherwise.
        '''
        if not self.cache_extension.is_process_tasks or task.no_cache_restore:
            return False

        without_process, task, error = self.cache_extension.process_task(task)
        if without_process and self.cache_extension.is_good_for_restore(task, error):
            self._process_finished_task(task, error)
            self._process_for_items(self._process_tasks_generator, limit=True)
            return True

        # Explicit negative result instead of an implicit None.
        return False

    def cache_store(self, task, error=None):
        '''Store *task* in the cache if that is permitted.

        Nothing is stored when the cache extension is not processing tasks,
        when the task has ``no_cache_store`` set, or when the extension
        rejects the task/error pair.
        '''
        if not self.cache_extension.is_process_tasks or task.no_cache_store:
            return

        if self.cache_extension.is_good_for_store(task, error):
            self.cache_extension.store_task(task, error)

    def tasks_generator(self):
        '''Task generator polled each time at least one task has completed.

        This default implementation yields a single ``None``.
        '''
        yield None

    def _process_tasks_generator(self):
        '''Yield items from the task generator while it is enabled.

        Stops as soon as ``self.tasks.full()`` is true.  When the generator
        is exhausted it is marked as disabled.
        '''
        if not self.tasks_generator_enabled:
            return

        try:
            while not self.tasks.full():
                # The next() builtin works on both Python 2 and Python 3
                # iterators; the .next() method exists only on Python 2.
                yield next(self.tasks_generator_object)
        except StopIteration:
            self.tasks_generator_enabled = False

    def restart_tasks_generator(self, generator):
        '''Replace the task generator with *generator* and re-enable it.

        The new generator is polled immediately.
        '''
        self.tasks_generator_object = generator
        self.tasks_generator_enabled = True
        self._process_for_items(self._process_tasks_generator, limit=True)

    def _process_finished_task(self, task, error=None):
        '''Pass a finished *task* (and its *error*) to its handler.

        The handler is ``task.handler`` when the task has that attribute,
        otherwise ``self.tasks_collector``.  A falsy *task* is ignored.
        '''
        if not task:
            return

        self._process_item(
            ProcessItem(
                handler=getattr(task, 'handler', self.tasks_collector),
                task=task,
                error=error
            )
        )

    def _process_for_items(self, generator, limit=None):
        '''Consume items from *generator* and route each one by its type.

        *generator* may be an iterable or a callable returning one; a falsy
        value is ignored.  ``Task`` and ``TasksGroup`` items are added to
        ``self.tasks``, ``DataItem`` and ``ProcessItem`` items are passed to
        ``_process_item``, and any other item is skipped.  With a truthy
        *limit*, consumption stops as soon as ``self.tasks.full()`` is true.
        '''
        if not generator:
            return

        if limit and self.tasks.full():
            return

        items = generator() if callable(generator) else generator

        for item in items:
            if isinstance(item, Task):
                self.tasks.add_task(item)

            elif isinstance(item, DataItem):
                self._process_item(item, prefix='data')

            elif isinstance(item, ProcessItem):
                self._process_item(item)

            elif isinstance(item, TasksGroup):
                item.spider = self
                self.tasks.add_group(item)

            # Re-check after every item: the item just handled may have
            # filled the queue.
            if limit and self.tasks.full():
                return

    def _process_item(self, process_item, prefix='task'):
        '''Call the handler of *process_item* and process what it returns.

        A string handler is resolved to the method ``<prefix>_<handler>`` on
        this object; if there is no such attribute, or the handler is not
        callable, nothing happens.  An exception raised by the handler, or
        while processing the items it produces, is reported through
        ``self._traceback_logger()`` and not re-raised.
        '''
        handler = process_item.handler

        if isinstance(handler, str):
            handler = getattr(self, '%s_%s' % (prefix, handler), None)

        if callable(handler):
            try:
                self._process_for_items(handler(**process_item.kwargs))
            except Exception:
                self._traceback_logger()