def _set_response(self, val):
    warn('The `Grab.response` attribute is deprecated. '
         'Use `Grab.doc` instead.', stacklevel=3)
    # pylint: disable=assigning-non-slot, attribute-defined-outside-init
    self.doc = val
def setup_cache(self, backend='mongodb', database=None, **kwargs):
    """
    Setup cache.

    :param backend: backend name, should be one of the following:
        'mongodb', 'mysql' or 'postgresql'
    :param database: database name
    :param kwargs: additional credentials for the backend
    """
    if database is None:
        raise SpiderMisuseError('setup_cache method requires database '
                                'option')
    if backend == 'mongo':
        warn('Backend name "mongo" is deprecated. Use "mongodb" instead.')
        backend = 'mongodb'
    mod = __import__('grab.spider.cache_backend.%s' % backend,
                     globals(), locals(), ['foo'])
    backend = mod.CacheBackend(
        database=database, spider=self, **kwargs
    )
    self.cache_reader_service = CacheReaderService(self, backend)
    backend = mod.CacheBackend(
        database=database, spider=self, **kwargs
    )
    self.cache_writer_service = CacheWriterService(self, backend)
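# Usage sketch for `setup_cache`, for Grab versions that still ship the
# cache subsystem; assumes a reachable MongoDB server, and the spider
# class and database name are hypothetical.
from grab.spider import Spider

class CachedSpider(Spider):  # hypothetical example spider
    def task_page(self, grab, task):
        pass

bot = CachedSpider()
bot.setup_cache(backend='mongodb', database='grab_cache_demo')
# The legacy backend alias still works but emits a deprecation warning:
# bot.setup_cache(backend='mongo', database='grab_cache_demo')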
def submit(self, *args, **kwargs):
    warn('Method `Document.submit` is deprecated. '
         'Use `Grab.submit` method instead.', stacklevel=3)
    self.grab.submit(*args, **kwargs)
def xml_tree(self):
    """
    Return DOM-tree of the document built with XML DOM builder.
    """
    warn('Attribute `grab.xml_tree` is deprecated. '
         'Use `Grab.doc.tree` attribute '
         'and content_type="xml" option instead.')
    return self.build_xml_tree()
@contextmanager  # requires `from contextlib import contextmanager`
def save_timer(self, key):
    warn('Method `Spider::save_timer` is deprecated. '
         'Use `Spider::timer.log_time` method instead.')
    self.timer.start(key)
    try:
        yield
    finally:
        self.timer.stop(key)
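# Migration sketch for the deprecated `save_timer`: the replacement
# named in the warning is `Spider::timer.log_time`, which this sketch
# assumes is also usable as a context manager. `bot` is a Spider
# instance; the timer key and work function are hypothetical.
with bot.save_timer('parse-page'):       # deprecated, emits a warning
    parse_page()

with bot.timer.log_time('parse-page'):   # preferred replacement
    parse_page()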
def load_module(self, name):
    """
    This method is called by Python if CustomImporter.find_module
    does not return None.
    """
    # Catch only import failures; a bare `except:` would also mask
    # unrelated errors such as KeyboardInterrupt.
    try:
        module = import_module(self.name, 'weblib')
    except ImportError:
        raise ImportError(name)
    sys.modules[name] = module
    warn('Module `grab.tools%s` is deprecated. '
         'Use `weblib%s` module.' % (self.name, self.name))
    return module
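# Effect sketch: assuming the package registers CustomImporter on
# `sys.meta_path` at import time, the old import path keeps working
# but emits a deprecation warning (`rex` stands in for any module
# that moved from grab.tools to weblib):
from grab.tools import rex   # deprecated path, warns
from weblib import rex       # preferred path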
def add_task(self, task, raise_error=False):
    """
    Add task to the task queue.
    """
    # MP:
    # ***
    if self.parser_mode:
        self.parser_result_queue.put((task, None))
        return
    if self.task_queue is None:
        raise SpiderMisuseError('You should configure task queue before '
                                'adding tasks. Use `setup_queue` method.')
    if task.priority is None or not task.priority_is_custom:
        task.priority = self.generate_task_priority()
        task.priority_is_custom = False
    else:
        task.priority_is_custom = True
    try:
        if not task.url.startswith(('http://', 'https://', 'ftp://',
                                    'file://', 'feed://')):
            if self.base_url is None:
                msg = ('Could not resolve relative URL because base_url '
                       'is not specified. Task: %s, URL: %s'
                       % (task.name, task.url))
                raise SpiderError(msg)
            else:
                warn('Class attribute `Spider::base_url` is deprecated. '
                     'Use Task objects with absolute URLs')
                task.url = urljoin(self.base_url, task.url)
                # If the task has a grab_config object then update it too
                if task.grab_config:
                    task.grab_config['url'] = task.url
    except Exception as ex:
        self.stat.collect('task-with-invalid-url', task.url)
        if raise_error:
            raise
        else:
            logger.error('', exc_info=ex)
            return False
    # TODO: keep original task priority if it was set explicitly
    self.task_queue.put(task, task.priority,
                        schedule_time=task.schedule_time)
    return True
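# Usage sketch for `add_task` (spider class, handler and URLs are
# hypothetical; uses the default in-memory queue):
from grab.spider import Spider, Task

class DemoSpider(Spider):  # hypothetical
    def task_page(self, grab, task):
        pass

bot = DemoSpider()
bot.setup_queue()  # without a queue, add_task raises SpiderMisuseError
assert bot.add_task(Task('page', url='https://example.com/'))
# A relative URL with no `base_url` set is recorded under the
# 'task-with-invalid-url' stat counter and add_task returns False:
assert not bot.add_task(Task('page', url='/relative/path'))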
def setup_queue(self, backend='memory', **kwargs):
    """
    Setup queue.

    :param backend: backend name, should be one of the following:
        'memory', 'redis' or 'mongodb'
    :param kwargs: additional credentials for the backend
    """
    if backend == 'mongo':
        warn('Backend name "mongo" is deprecated. Use "mongodb" instead.')
        backend = 'mongodb'
    logger.debug('Using %s backend for task queue', backend)
    mod = __import__('grab.spider.queue_backend.%s' % backend,
                     globals(), locals(), ['foo'])
    self.task_queue = mod.QueueBackend(
        spider_name=self.get_spider_name(), **kwargs)
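# Usage sketch for `setup_queue` (`bot` is a Spider instance): the
# default in-memory backend needs no credentials; external backends
# take theirs via kwargs (the connection parameters shown are
# hypothetical):
bot.setup_queue()  # in-memory queue
# bot.setup_queue(backend='mongodb', database='queue_db')
# bot.setup_queue(backend='redis', host='localhost', port=6379)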
def render_stats(self, timing=None):
    if timing is not None:
        warn('Option timing of method render_stats is deprecated.'
             ' There is no more timing feature.')
    out = ['------------ Stats: ------------']
    out.append('Counters:')

    # Process counters
    items = sorted(self.stat.counters.items(),
                   key=lambda x: x[0], reverse=True)
    for item in items:
        out.append(' %s: %s' % item)
    out.append('')

    out.append('Lists:')
    # Process collections sorted by size desc
    col_sizes = [(x, len(y)) for x, y in self.stat.collections.items()]
    col_sizes = sorted(col_sizes, key=lambda x: x[1], reverse=True)
    for col_size in col_sizes:
        out.append(' %s: %d' % col_size)
    out.append('')

    # Process extra metrics
    if 'download-size' in self.stat.counters:
        out.append('Network download: %s' % metric.format_traffic_value(
            self.stat.counters['download-size']))
    # Parenthesize the conditional so the "Queue size" label is kept
    # even when no task queue is configured
    out.append('Queue size: %s' % (self.task_queue.size()
                                   if self.task_queue else 'NA'))
    out.append('Network streams: %d' % self.thread_number)
    if self._started:
        elapsed = time.time() - self._started
    else:
        elapsed = 0
    hours, seconds = divmod(elapsed, 3600)
    minutes, seconds = divmod(seconds, 60)
    out.append('Time elapsed: %d:%d:%d (H:M:S)' % (
        hours, minutes, seconds))
    out.append('End time: %s'
               % datetime.utcnow().strftime('%d %b %Y, %H:%M:%S UTC'))
    return '\n'.join(out) + '\n'
def setup(self, **kwargs):
    """
    Set up Grab instance configuration.
    """
    if 'hammer_mode' in kwargs:
        warn('Option `hammer_mode` is deprecated. Grab does not '
             'support hammer mode anymore.')
        del kwargs['hammer_mode']
    if 'hammer_timeouts' in kwargs:
        warn('Option `hammer_timeouts` is deprecated. Grab does not '
             'support hammer mode anymore.')
        del kwargs['hammer_timeouts']
    for key in kwargs:
        if key not in self.config.keys():
            raise error.GrabMisuseError('Unknown option: %s' % key)
    if 'url' in kwargs:
        if self.config.get('url'):
            kwargs['url'] = self.make_url_absolute(kwargs['url'])
    self.config.update(kwargs)
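# Usage sketch for `Grab.setup`: option names are validated against
# the config dictionary, so a typo raises GrabMisuseError.
from grab import Grab

g = Grab()
g.setup(url='https://example.com/', timeout=10)
# g.setup(timeuot=10)  # misspelled option -> GrabMisuseError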
def __init__(
        self,
        thread_number=None,
        network_try_limit=None,
        task_try_limit=None,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        only_cache=False,
        config=None,
        args=None,
        parser_requests_per_process=10000,
        parser_pool_size=1,
        http_api_port=None,
        network_service='threaded',
        grab_transport='pycurl',
        # Deprecated
        transport=None):
    """
    Arguments:

    * thread_number - number of concurrent network streams
    * network_try_limit - how many times to send a request again
      if a network error occurred, use 0 to disable
    * task_try_limit - limit of tries to execute some task;
      this is not the same as network_try_limit:
      network_try_limit limits the number of tries which are
      performed automatically in case of a network timeout or
      some other physical error, while task_try_limit limits
      the number of attempts which are scheduled manually in
      the spider business logic
    * priority_mode - could be "random" or "const"
    * meta - arbitrary user data
    * retry_rebuild_user_agent - generate a new random user-agent
      for each network request which is performed again due to
      a network error
    * args - command line arguments parsed with the
      `setup_arg_parser` method
    """
    self.fatal_error_queue = Queue()
    self.task_queue_parameters = None
    self.http_api_port = http_api_port
    self._started = None
    assert grab_transport in ('pycurl', 'urllib3')
    self.grab_transport_name = grab_transport
    self.parser_requests_per_process = parser_requests_per_process
    self.stat = Stat()
    self.task_queue = None
    if args is None:
        self.args = {}
    else:
        self.args = args
    if config is not None:
        self.config = config
    else:
        self.config = {}
    if meta:
        self.meta = meta
    else:
        self.meta = {}
    self.thread_number = (
        thread_number or
        int(self.config.get('thread_number',
                            DEFAULT_NETWORK_STREAM_NUMBER)))
    self.task_try_limit = (
        task_try_limit or
        int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
    self.network_try_limit = (
        network_try_limit or
        int(self.config.get('network_try_limit',
                            DEFAULT_NETWORK_TRY_LIMIT)))
    self._grab_config = {}
    if priority_mode not in ['random', 'const']:
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    else:
        self.priority_mode = priority_mode
    self.only_cache = only_cache
    self.work_allowed = True
    if request_pause is not NULL:
        warn('Option `request_pause` is deprecated and is not '
             'supported anymore')
    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False
    self.interrupted = False
    self.cache_reader_service = None
    self.cache_writer_service = None
    self.parser_pool_size = parser_pool_size
    self.parser_service = ParserService(
        spider=self,
        pool_size=self.parser_pool_size,
    )
    if transport is not None:
        warn('The "transport" argument of Spider constructor is'
             ' deprecated. Use "network_service" argument.')
        network_service = transport
    assert network_service in ('threaded',)
    if network_service == 'threaded':
        # pylint: disable=no-name-in-module, import-error
        from grab.spider.network_service.threaded import (
            NetworkServiceThreaded
        )
        self.network_service = NetworkServiceThreaded(
            self, self.thread_number
        )
    self.task_dispatcher = TaskDispatcherService(self)
    if self.http_api_port:
        self.http_api_service = HttpApiService(self)
    else:
        self.http_api_service = None
    self.task_generator_service = TaskGeneratorService(
        self.task_generator(), self,
    )
def __init__(
        self,
        thread_number=None,
        network_try_limit=None,
        task_try_limit=None,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        config=None,
        args=None,
        parser_requests_per_process=10000,
        parser_pool_size=1,
        http_api_port=None,
        network_service='threaded',
        grab_transport='pycurl',
        # Deprecated
        transport=None,
        only_cache=False,
):
    """
    Arguments:

    * thread_number - number of concurrent network streams
    * network_try_limit - how many times to send a request again
      if a network error occurred, use 0 to disable
    * task_try_limit - limit of tries to execute some task;
      this is not the same as network_try_limit:
      network_try_limit limits the number of tries which are
      performed automatically in case of a network timeout or
      some other physical error, while task_try_limit limits
      the number of attempts which are scheduled manually in
      the spider business logic
    * priority_mode - could be "random" or "const"
    * meta - arbitrary user data
    * retry_rebuild_user_agent - generate a new random user-agent
      for each network request which is performed again due to
      a network error
    * args - command line arguments parsed with the
      `setup_arg_parser` method
    """
    self.fatal_error_queue = Queue()
    self.task_queue_parameters = None
    self.http_api_port = http_api_port
    self._started = None
    assert grab_transport in ('pycurl', 'urllib3')
    self.grab_transport_name = grab_transport
    self.parser_requests_per_process = parser_requests_per_process
    self.stat = Stat()
    self.task_queue = None
    if args is None:
        self.args = {}
    else:
        self.args = args
    if config is not None:
        self.config = config
    else:
        self.config = {}
    if meta:
        self.meta = meta
    else:
        self.meta = {}
    self.thread_number = (
        thread_number or
        int(self.config.get('thread_number',
                            DEFAULT_NETWORK_STREAM_NUMBER)))
    self.task_try_limit = (
        task_try_limit or
        int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
    self.network_try_limit = (
        network_try_limit or
        int(self.config.get('network_try_limit',
                            DEFAULT_NETWORK_TRY_LIMIT)))
    self._grab_config = {}
    if priority_mode not in ['random', 'const']:
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    else:
        self.priority_mode = priority_mode
    if only_cache:
        raise_feature_is_deprecated('Cache feature')
    self.work_allowed = True
    if request_pause is not NULL:
        warn('Option `request_pause` is deprecated and is not '
             'supported anymore')
    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False
    self.interrupted = False
    self.parser_pool_size = parser_pool_size
    self.parser_service = ParserService(
        spider=self,
        pool_size=self.parser_pool_size,
    )
    if transport is not None:
        warn('The "transport" argument of Spider constructor is'
             ' deprecated. Use "network_service" argument.')
        network_service = transport
    assert network_service in ('threaded',)
    if network_service == 'threaded':
        # pylint: disable=no-name-in-module, import-error
        from grab.spider.network_service.threaded import (
            NetworkServiceThreaded
        )
        self.network_service = NetworkServiceThreaded(
            self, self.thread_number
        )
    self.task_dispatcher = TaskDispatcherService(self)
    if self.http_api_port:
        self.http_api_service = HttpApiService(self)
    else:
        self.http_api_service = None
    self.task_generator_service = TaskGeneratorService(
        self.task_generator(), self,
    )
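# Minimal end-to-end sketch for this constructor (the handler body and
# URL are hypothetical):
from grab.spider import Spider, Task

class TitleSpider(Spider):
    def task_generator(self):
        yield Task('page', url='https://example.com/')

    def task_page(self, grab, task):
        self.stat.collect('titles', grab.doc.select('//title').text())

bot = TitleSpider(thread_number=2, priority_mode='const')
bot.run()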
def __init__(self,
             thread_number=None,
             network_try_limit=None, task_try_limit=None,
             request_pause=NULL,
             priority_mode='random',
             meta=None,
             only_cache=False,
             config=None,
             slave=None,
             args=None,
             # New options start here
             taskq=None,
             # MP:
             network_result_queue=None,
             parser_result_queue=None,
             is_parser_idle=None,
             shutdown_event=None,
             mp_mode=False,
             parser_pool_size=None,
             parser_mode=False,
             parser_requests_per_process=10000,
             # http api
             http_api_port=None,
             transport='multicurl',
             grab_transport='pycurl',
             ):
    """
    Arguments:

    * thread_number - number of concurrent network streams
    * network_try_limit - how many times to send a request again
      if a network error occurred, use 0 to disable
    * task_try_limit - limit of tries to execute some task;
      this is not the same as network_try_limit:
      network_try_limit limits the number of tries which are
      performed automatically in case of a network timeout or
      some other physical error, while task_try_limit limits
      the number of attempts which are scheduled manually in
      the spider business logic
    * priority_mode - could be "random" or "const"
    * meta - arbitrary user data
    * retry_rebuild_user_agent - generate a new random user-agent
      for each network request which is performed again due to
      a network error
    * args - command line arguments parsed with the
      `setup_arg_parser` method

    New options:

    * taskq=None,
    * network_result_queue=None,
    """
    if slave is not None:
        raise SpiderConfigurationError(
            'Slave mode is not supported anymore. '
            'Use `mp_mode=True` option to run multiple HTML'
            ' parser processes.')
    # API:
    self.http_api_port = http_api_port
    assert transport in ('multicurl', 'threaded')
    self.transport_name = transport
    assert grab_transport in ('pycurl', 'urllib3')
    self.grab_transport_name = grab_transport
    # MP:
    self.mp_mode = mp_mode
    if self.mp_mode:
        from multiprocessing import Process, Event, Queue
    else:
        from multiprocessing.dummy import Process, Event, Queue
    if network_result_queue is not None:
        self.network_result_queue = network_result_queue
    else:
        self.network_result_queue = Queue()
    self.parser_result_queue = parser_result_queue
    self.is_parser_idle = is_parser_idle
    if shutdown_event is not None:
        self.shutdown_event = shutdown_event
    else:
        self.shutdown_event = Event()
    if not self.mp_mode and parser_pool_size and parser_pool_size > 1:
        raise SpiderConfigurationError(
            'Parser pool size could be only 1 in '
            'non-multiprocess mode')
    self.parser_pool_size = parser_pool_size
    self.parser_mode = parser_mode
    self.parser_requests_per_process = parser_requests_per_process
    self.stat = Stat()
    self.timer = Timer()
    self.task_queue = taskq
    if args is None:
        self.args = {}
    else:
        self.args = args
    if config is not None:
        self.config = config
    else:
        self.config = {}
    if meta:
        self.meta = meta
    else:
        self.meta = {}
    self.thread_number = (
        thread_number or
        int(self.config.get('thread_number',
                            DEFAULT_NETWORK_STREAM_NUMBER)))
    self.task_try_limit = (
        task_try_limit or
        int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
    self.network_try_limit = (
        network_try_limit or
        int(self.config.get('network_try_limit',
                            DEFAULT_NETWORK_TRY_LIMIT)))
    self._grab_config = {}
    if priority_mode not in ['random', 'const']:
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    else:
        self.priority_mode = priority_mode
    self.only_cache = only_cache
    self.cache_pipeline = None
    self.work_allowed = True
    if request_pause is not NULL:
        warn('Option `request_pause` is deprecated and is not '
             'supported anymore')
    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False
    self.interrupted = False
def valid_response_code(self, code, task):
    warn('Method `Spider::valid_response_code` is deprecated. '
         'Use `Spider::is_valid_network_response_code` method or '
         '`Spider::is_valid_network_result` method.')
    return self.is_valid_network_response_code(code, task)
def set_grab_config(self, val):
    warn('Using `grab_config` attribute is deprecated. Override '
         '`create_grab_instance` method instead.')
    self._grab_config = val
def __init__(self, *args, **kwargs):
    warn('You are using XpathSelector from deprecated `grab.selector` '
         'package. Please, switch to `selection` package.')
    super(XpathSelector, self).__init__(*args, **kwargs)
def items(self):
    warn('Attribute `Spider::items` is deprecated. '
         'Use `Spider::stat.collections` attribute instead.')
    return self.stat.collections
def inc_count(self, key, count=1):
    warn('Method `Spider::inc_count` is deprecated. '
         'Use `Spider::stat.inc` method instead.')
    self.stat.inc(key, count)
def test_warn(self):
    out = StringIO()
    with mock.patch('sys.stderr', out):
        warn('abc')
    self.assertTrue('GrabDeprecationWarning: abc' in out.getvalue())
def form(self):
    warn('The `Grab.form` attribute is deprecated. '
         'Use `Grab.doc.form` instead.')
    return self.doc.form
from grab.selector.selector import *  # noqa
from grab.util.warning import warn

warn("Module `grab.selector` is deprecated. Use `selection` package.")
def __init__(self,
             thread_number=None,
             network_try_limit=None, task_try_limit=None,
             request_pause=NULL,
             priority_mode='random',
             meta=None,
             only_cache=False,
             config=None,
             slave=None,
             args=None,
             # New options start here
             taskq=None,
             # MP:
             network_result_queue=None,
             parser_result_queue=None,
             is_parser_idle=None,
             shutdown_event=None,
             mp_mode=False,
             parser_pool_size=None,
             parser_mode=False,
             parser_requests_per_process=10000,
             # http api
             http_api_port=None,
             ):
    """
    Arguments:

    * thread_number - number of concurrent network streams
    * network_try_limit - how many times to send a request again
      if a network error occurred, use 0 to disable
    * task_try_limit - limit of tries to execute some task;
      this is not the same as network_try_limit:
      network_try_limit limits the number of tries which are
      performed automatically in case of a network timeout or
      some other physical error, while task_try_limit limits
      the number of attempts which are scheduled manually in
      the spider business logic
    * priority_mode - could be "random" or "const"
    * meta - arbitrary user data
    * retry_rebuild_user_agent - generate a new random user-agent
      for each network request which is performed again due to
      a network error
    * args - command line arguments parsed with the
      `setup_arg_parser` method

    New options:

    * taskq=None,
    * network_result_queue=None,
    """
    if slave is not None:
        raise SpiderConfigurationError(
            'Slave mode is not supported anymore. '
            'Use `mp_mode=True` option to run multiple HTML'
            ' parser processes.')
    # API:
    self.http_api_port = http_api_port
    # MP:
    self.mp_mode = mp_mode
    if self.mp_mode:
        from multiprocessing import Process, Event, Queue
    else:
        from multiprocessing.dummy import Process, Event, Queue
    if network_result_queue is not None:
        self.network_result_queue = network_result_queue
    else:
        self.network_result_queue = Queue()
    self.parser_result_queue = parser_result_queue
    self.is_parser_idle = is_parser_idle
    if shutdown_event is not None:
        self.shutdown_event = shutdown_event
    else:
        self.shutdown_event = Event()
    if not self.mp_mode and parser_pool_size and parser_pool_size > 1:
        raise SpiderConfigurationError(
            'Parser pool size could be only 1 in '
            'non-multiprocess mode')
    self.parser_pool_size = parser_pool_size
    self.parser_mode = parser_mode
    self.parser_requests_per_process = parser_requests_per_process
    self.stat = Stat()
    self.timer = Timer()
    self.task_queue = taskq
    if args is None:
        self.args = {}
    else:
        self.args = args
    if config is not None:
        self.config = config
    else:
        self.config = {}
    if meta:
        self.meta = meta
    else:
        self.meta = {}
    self.thread_number = (
        thread_number or
        int(self.config.get('thread_number',
                            DEFAULT_NETWORK_STREAM_NUMBER)))
    self.task_try_limit = (
        task_try_limit or
        int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
    self.network_try_limit = (
        network_try_limit or
        int(self.config.get('network_try_limit',
                            DEFAULT_NETWORK_TRY_LIMIT)))
    self._grab_config = {}
    if priority_mode not in ['random', 'const']:
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    else:
        self.priority_mode = priority_mode
    self.only_cache = only_cache
    self.cache_pipeline = None
    self.work_allowed = True
    if request_pause is not NULL:
        warn('Option `request_pause` is deprecated and is not '
             'supported anymore')
    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False
    self.interrupted = False
def add_item(self, list_name, item):
    warn('Method `Spider::add_item` is deprecated. '
         'Use `Spider::stat.collect` method instead.')
    self.stat.collect(list_name, item)
def stop_timer(self, key):
    warn('Method `Spider::stop_timer` is deprecated. '
         'Use `Spider::timer.stop` method instead.')
    self.timer.stop(key)
def setup_spider_config(cls, config):
    warn('Method `Spider::setup_spider_config` is deprecated. '
         'Use `Spider::update_spider_config` method.')
    cls.update_spider_config(config)
def counters(self):
    warn('Attribute `Spider::counters` is deprecated. '
         'Use `Spider::stat.counters` attribute instead.')
    return self.stat.counters
def __init__(self, name=None, url=None, grab=None, grab_config=None,
             priority=None, priority_set_explicitly=True,
             network_try_count=0, task_try_count=1,
             disable_cache=False, refresh_cache=False,
             valid_status=None, use_proxylist=True, cache_timeout=None,
             delay=None, raw=False, callback=None,
             fallback_name=None,
             **kwargs):
    """
    Create a `Task` object.

    If more than one of the url, grab and grab_config options is
    non-empty then they are processed in the following order:

    * grab overwrites grab_config
    * grab_config overwrites url

    Args:
        :param name: name of the task. After a successful network
            operation the task's result will be passed to the
            `task_<name>` method.
        :param url: URL of the network document. Any task requires the
            `url` or `grab` option to be specified.
        :param grab: configured `Grab` instance. You can use this
            option when the `url` option is not enough. Do not forget
            to configure the `url` option of the `Grab` instance,
            because in this case the `url` option of the `Task`
            constructor will be overwritten with `grab.config['url']`.
        :param priority: priority of the Task. Tasks with lower
            priority will be processed earlier. By default each new
            task is assigned a random priority from the (80, 100)
            range.
        :param priority_set_explicitly: internal flag which tells
            whether the task priority was assigned manually or
            generated by the spider according to priority generation
            rules.
        :param network_try_count: you will probably not need to use
            it. It is used internally to control how many times this
            task was restarted due to network errors. The `Spider`
            instance has a `network_try_limit` option. When the
            `network_try_count` attribute of the task exceeds the
            `network_try_limit` attribute, processing of the task is
            abandoned.
        :param task_try_count: the same as `network_try_count`, but it
            is increased only when you use the `clone` method. You can
            also set it manually. It is useful if you want to restart
            the task after it was cancelled due to multiple network
            errors. As you might have guessed, there is a
            `task_try_limit` option in the `Spider` instance. Both
            options `network_try_count` and `network_try_limit`
            guarantee that you will not get an infinite loop of
            restarting some task.
        :param disable_cache: if `True`, disable the cache subsystem.
            The document will be fetched from the network and will not
            be saved to the cache.
        :param refresh_cache: if `True`, the document will be fetched
            from the network and saved to the cache.
        :param valid_status: extra status codes which count as valid
        :param use_proxylist: whether to use the proxylist which was
            configured via the `setup_proxylist` method of the spider
        :param delay: if specified, tells the spider to schedule the
            task and execute it after `delay` seconds
        :param raw: if `raw` is True then the network response is
            forwarded to the corresponding handler without any check
            of the HTTP status code or network error; if `raw` is
            False (the default) then a failed response is put back
            into the task queue, or, if the tries limit is reached,
            the processing of this request is finished.
        :param callback: if you pass some function in the `callback`
            option then the network response will be passed to this
            callback; the usual 'task_*' handler will be ignored and
            no error will be raised if such a 'task_*' handler does
            not exist.
        :param fallback_name: the name of the method that is called
            when the spider gives up on the task (due to multiple
            network errors)

    Any non-standard named arguments passed to the `Task` constructor
    will be saved as attributes of the object. You can get their
    values later as attributes or with the `get` method, which allows
    you to use a default value if the attribute does not exist.
    """
    if name == 'generator':
        # The name "generator" is restricted because a
        # `task_generator` handler could not be created: this name is
        # already used for the special method which generates new
        # tasks
        raise SpiderMisuseError('Task name could not be "generator"')
    self.name = name
    if url is None and grab is None and grab_config is None:
        raise SpiderMisuseError('Either url, grab or grab_config argument '
                                'of Task constructor should not be None')
    if url is not None and grab is not None:
        raise SpiderMisuseError('Options url and grab could not be used '
                                'together')
    if url is not None and grab_config is not None:
        raise SpiderMisuseError('Options url and grab_config could not be '
                                'used together')
    if grab is not None and grab_config is not None:
        raise SpiderMisuseError(
            'Options grab and grab_config could not be used together')
    if grab:
        self.setup_grab_config(grab.dump_config())
    elif grab_config:
        self.setup_grab_config(grab_config)
    else:
        self.grab_config = None
        self.url = url
    if valid_status is None:
        self.valid_status = []
    else:
        self.valid_status = valid_status
    self.process_delay_option(delay)
    self.cache_timeout = cache_timeout
    if cache_timeout is not None:
        warn('Option `cache_timeout` is deprecated and'
             ' is not supported anymore')
    self.fallback_name = fallback_name
    self.priority_set_explicitly = priority_set_explicitly
    self.priority = priority
    self.network_try_count = network_try_count
    self.task_try_count = task_try_count
    self.disable_cache = disable_cache
    self.refresh_cache = refresh_cache
    self.use_proxylist = use_proxylist
    self.raw = raw
    self.callback = callback
    self.coroutines_stack = []
    for key, value in kwargs.items():
        setattr(self, key, value)
def get_grab_config(self):
    warn('Using `grab_config` attribute is deprecated. Override '
         '`create_grab_instance` method instead.')
    return self._grab_config
def setup_grab(self, **kwargs):
    warn('Method `Spider::setup_grab` is deprecated. '
         'Define `Spider::create_grab_instance` or '
         '`Spider::update_grab_instance` methods in your '
         'Spider sub-class.')
    self.grab_config.update(**kwargs)
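# Migration sketch for the deprecated `setup_grab`: override
# `create_grab_instance` in your Spider subclass instead (the option
# values below are hypothetical).
from grab.spider import Spider

class ConfiguredSpider(Spider):
    def create_grab_instance(self, **kwargs):
        grab = super(ConfiguredSpider, self).create_grab_instance(**kwargs)
        grab.setup(timeout=30, user_agent='demo-bot/1.0')
        return grab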
def _get_response(self):
    warn('The `Grab.response` attribute is deprecated. '
         'Use `Grab.doc` instead.')
    return self.doc
def taskq(self):
    warn('Attribute `Spider::taskq` is deprecated. '
         'Use `Spider::task_queue` attribute.')
    return self.task_queue
def load_cookies(self, path, file_required=None):
    if file_required is not None:
        warn('The option `file_required` is no longer supported')
    self.cookies.load_from_file(path)  # pylint: disable=no-member
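# Usage sketch for `load_cookies` (the path is hypothetical and must
# point to a cookies file saved earlier by the matching save call on
# `Grab.cookies`):
from grab import Grab

g = Grab()
g.load_cookies('/tmp/session-cookies.json')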
def append(self, key, val):
    warn('Method `Stat::append` is deprecated. '
         'Use `Stat::collect` method instead.')
    self.collect(key, val)
def time(self):
    warn('Attribute `Document.time` is deprecated. '
         'Use `Document.total_time` instead.')
    return self.total_time