Example No. 1
 def cache_writer_service(self, val):
     raise_feature_is_deprecated('Cache feature')
Example No. 2
 def cache_reader_service(self):
     raise_feature_is_deprecated('Cache feature')
Example No. 3
 def setup_cache(self, *args, **kwargs):
     raise_feature_is_deprecated('Cache feature')
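All three stubs above delegate to `raise_feature_is_deprecated`, which aborts any attempt to use the removed cache feature. A minimal sketch of such a helper is shown below; the exact message and exception type used by the library may differ, so treat this as an illustration only.

def raise_feature_is_deprecated(feature_name):
    # Illustrative sketch: fail loudly so callers cannot keep relying
    # on the removed feature.
    raise RuntimeError(
        '%s is no longer supported. Please remove the related '
        'settings and calls from your code.' % feature_name
    )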
Example No. 4
    def __init__(
            self,
            thread_number=None,
            network_try_limit=None, task_try_limit=None,
            request_pause=NULL,
            priority_mode='random',
            meta=None,
            config=None,
            args=None,
            parser_requests_per_process=10000,
            parser_pool_size=1,
            http_api_port=None,
            network_service='threaded',
            grab_transport='pycurl',
            # Deprecated
            transport=None,
            only_cache=False,
        ):
        """
        Arguments:
        * thread_number - number of concurrent network streams
        * network_try_limit - how many times to retry a request
            after a network error occurred; use 0 to disable retries
        * task_try_limit - limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of a network timeout
            or some other physical error,
            while task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate a new random user-agent for each
            network request which is retried due to a network error
        * args - command line arguments parsed with the `setup_arg_parser` method
        """

        self.fatal_error_queue = Queue()
        self.task_queue_parameters = None
        self.http_api_port = http_api_port
        self._started = None
        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport
        self.parser_requests_per_process = parser_requests_per_process
        self.stat = Stat()
        self.task_queue = None
        if args is None:
            self.args = {}
        else:
            self.args = args
        if config is not None:
            self.config = config
        else:
            self.config = {}
        if meta:
            self.meta = meta
        else:
            self.meta = {}
        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))
        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode
        if only_cache:
            raise_feature_is_deprecated('Cache feature')
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')
        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
        self.parser_pool_size = parser_pool_size
        self.parser_service = ParserService(
            spider=self,
            pool_size=self.parser_pool_size,
        )
        if transport is not None:
            warn('The "transport" argument of Spider constructor is'
                 ' deprecated. Use "network_service" argument.')
            network_service = transport
        assert network_service in ('threaded',)
        if network_service == 'threaded':
            # pylint: disable=no-name-in-module, import-error
            from grab.spider.network_service.threaded import (
                NetworkServiceThreaded
            )
            self.network_service = NetworkServiceThreaded(
                self, self.thread_number
            )
        self.task_dispatcher = TaskDispatcherService(self)
        if self.http_api_port:
            self.http_api_service = HttpApiService(self)
        else:
            self.http_api_service = None
        self.task_generator_service = TaskGeneratorService(
            self.task_generator(), self,
        )
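The constructor above wires together the task queues, the parser pool, and the network service of a spider. Below is a minimal usage sketch; the `ExampleSpider` class, its handler, and the URL are illustrative assumptions, and the bare `setup_queue()` call assumes the default in-memory queue backend.

from grab.spider import Spider, Task

class ExampleSpider(Spider):
    def task_generator(self):
        # Seed the spider with one initial task.
        yield Task('page', url='https://example.com/')

    def task_page(self, grab, task):
        # Called for each successfully fetched "page" task.
        self.stat.inc('pages-fetched')

bot = ExampleSpider(
    thread_number=4,          # concurrent network streams
    network_try_limit=5,      # automatic retries on network errors
    task_try_limit=3,         # retries scheduled by business logic
    priority_mode='const',    # or 'random'
    grab_transport='pycurl',  # must be 'pycurl' or 'urllib3'
)
bot.setup_queue()             # assumes the default in-memory backend
bot.run()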
Example No. 5
    def __init__(
            self,
            name=None,
            url=None,
            grab=None,
            grab_config=None,
            priority=None,
            priority_set_explicitly=True,
            network_try_count=0,
            task_try_count=1,
            valid_status=None,
            use_proxylist=True,
            delay=None,
            raw=False,
            callback=None,
            fallback_name=None,
            # deprecated
            disable_cache=False,
            refresh_cache=False,
            cache_timeout=None,
            # kwargs
            **kwargs):
        """
        Create `Task` object.

        If more than one of the url, grab and grab_config options is
        non-empty, they are processed in the following order:
        * grab overwrites grab_config
        * grab_config overwrites url

        Args:
            :param name: name of the task. After a successful network
                operation the task's result will be passed to the
                `task_<name>` method.
            :param url: URL of the network document. Any task requires the
                `url` or `grab` option to be specified.
            :param grab: a configured `Grab` instance. You can use this option
                when the `url` option is not enough. Do not forget to
                configure the `url` option of the `Grab` instance, because in
                this case the `url` option of the `Task` constructor will be
                overwritten with `grab.config['url']`.
            :param priority: priority of the Task. Tasks with lower priority
                will be processed earlier. By default each new task is
                assigned a random priority from the (80, 100) range.
            :param priority_set_explicitly: internal flag which tells whether
                the task priority was assigned manually or generated by the
                spider according to priority generation rules.
            :param network_try_count: you will probably not need to use it.
                It is used internally to control how many times this task was
                restarted due to network errors. The `Spider` instance has a
                `network_try_limit` option. When the `network_try_count`
                attribute of the task exceeds the `network_try_limit`
                attribute, processing of the task is abandoned.
            :param task_try_count: the same as `network_try_count`, but it is
                increased only when you use the `clone` method. You can also
                set it manually. It is useful if you want to restart the task
                after it was cancelled due to multiple network errors. As you
                might have guessed, there is a `task_try_limit` option in the
                `Spider` instance. Both the `network_try_count` and
                `network_try_limit` options guarantee that you will not get
                an infinite loop of restarting some task.
            :param valid_status: extra status codes which count as valid
            :param use_proxylist: whether to use the proxy list which was
                configured via the `setup_proxylist` method of the spider
            :param delay: if specified, tells the spider to schedule the task
                and execute it after `delay` seconds
            :param raw: if `raw` is True, the network response is forwarded
                to the corresponding handler without any check of the HTTP
                status code or network error; if `raw` is False (the
                default), a failed response is put back into the task queue,
                or, if the tries limit is reached, the processing of this
                request is finished.
            :param callback: if you pass a function in the `callback` option,
                the network response will be passed to this callback, the
                usual 'task_*' handler will be ignored, and no error will be
                raised if such a 'task_*' handler does not exist.
            :param fallback_name: the name of the method that is called when
                the spider gives up on the task (due to multiple network
                errors)

            Any non-standard named arguments passed to the `Task` constructor
            will be saved as attributes of the object. You can access their
            values later as attributes or with the `get` method, which allows
            using a default value if the attribute does not exist.
        """

        if disable_cache or refresh_cache or cache_timeout:
            raise_feature_is_deprecated('Cache feature')

        if name == 'generator':
            # The name "generator" is restricted because
            # `task_generator` handler could not be created because
            # this name is already used for special method which
            # generates new tasks
            raise SpiderMisuseError('Task name could not be "generator"')

        self.name = name

        if url is None and grab is None and grab_config is None:
            raise SpiderMisuseError('Either url, grab or grab_config argument '
                                    'of Task constructor should not be None')

        if url is not None and grab is not None:
            raise SpiderMisuseError('Options url and grab could not be used '
                                    'together')

        if url is not None and grab_config is not None:
            raise SpiderMisuseError('Options url and grab_config could not be '
                                    'used together')

        if grab is not None and grab_config is not None:
            raise SpiderMisuseError(
                'Options grab and grab_config could not be used together')

        if grab:
            self.setup_grab_config(grab.dump_config())
        elif grab_config:
            self.setup_grab_config(grab_config)
        else:
            self.grab_config = None
            self.url = url

        if valid_status is None:
            self.valid_status = []
        else:
            self.valid_status = valid_status

        self.process_delay_option(delay)

        self.fallback_name = fallback_name
        self.priority_set_explicitly = priority_set_explicitly
        self.priority = priority
        self.network_try_count = network_try_count
        self.task_try_count = task_try_count
        self.use_proxylist = use_proxylist
        self.raw = raw
        self.callback = callback
        self.coroutines_stack = []
        for key, value in kwargs.items():
            setattr(self, key, value)
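A short usage sketch of the `Task` constructor above; the handler name, URL, and the extra `page_type` keyword are illustrative only.

from grab.spider import Task

# A task bound to the `task_page` handler; the non-standard keyword
# `page_type` is stored as an attribute of the task object.
task = Task('page', url='https://example.com/catalog', page_type='catalog')

print(task.url)                    # 'https://example.com/catalog'
print(task.page_type)              # 'catalog'
print(task.get('missing', 'n/a'))  # default value for an absent attribute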
Example No. 6
def main(spider_name,
         thread_number=None,
         settings_module='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    if spider_config.get('cache'):
        raise_feature_is_deprecated('Cache feature')

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
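The `main` function above is typically driven from a command-line entry point. A minimal invocation sketch is shown below; the spider name and the presence of a `settings` module are assumptions about the surrounding project.

if __name__ == '__main__':
    main(
        spider_name='example',       # resolved via load_spider_class
        thread_number=4,
        settings_module='settings',  # module holding the root config
        disable_report=True,         # skip writing var/<pid>/report.txt
    )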