Example #1
    def load_proxylist(self, source, source_type=None, proxy_type='http',
                       auto_init=True, auto_change=True,
                       **kwargs):
        self.proxylist = ProxyList()
        if isinstance(source, BaseProxySource):
            self.proxylist.set_source(source)
        elif isinstance(source, six.string_types):
            if source_type == 'text_file':
                self.proxylist.load_file(source, proxy_type=proxy_type)
            elif source_type == 'url':
                self.proxylist.load_url(source, proxy_type=proxy_type)
            else:
                raise SpiderMisuseError('Method `load_proxylist` received '
                                        'invalid `source_type` argument: %s'
                                        % source_type)
        else:
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source` argument: %s'
                                    % source)

        self.proxylist_enabled = True
        self.proxy = None
        if not auto_change and auto_init:
            self.proxy = self.proxylist.get_random_proxy()
        self.proxy_auto_change = auto_change
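
A minimal usage sketch, assuming a Spider subclass that provides the method above; `MySpider` and the proxies.txt path are hypothetical:

    bot = MySpider()  # hypothetical Spider subclass
    bot.load_proxylist('proxies.txt', source_type='text_file',
                       proxy_type='http', auto_change=True)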
Example #2
    def clone(self, **kwargs):
        """
        Clone Task instance.

        Reset network_try_count, increase task_try_count.
        """

        # First, create exact copy of the current Task object
        attr_copy = self.__dict__.copy()
        if attr_copy.get('grab_config') is not None:
            del attr_copy['url']
        task = Task(**attr_copy)

        # Reset some task properties if they have not
        # been set explicitly in kwargs
        if 'network_try_count' not in kwargs:
            task.network_try_count = 0
        if 'task_try_count' not in kwargs:
            task.task_try_count = self.task_try_count + 1
        if 'refresh_cache' not in kwargs:
            task.refresh_cache = False
        if 'disable_cache' not in kwargs:
            task.disable_cache = False

        if kwargs.get('url') is not None and kwargs.get('grab') is not None:
            raise SpiderMisuseError('Options url and grab could not be '
                                    'used together')

        if kwargs.get('url') is not None and kwargs.get(
                'grab_config') is not None:
            raise SpiderMisuseError('Options url and grab_config could not '
                                    'be used together')

        if kwargs.get('grab') is not None and kwargs.get(
                'grab_config') is not None:
            raise SpiderMisuseError('Options grab and grab_config could not '
                                    'be used together')

        if kwargs.get('grab'):
            task.setup_grab_config(kwargs['grab'].dump_config())
            del kwargs['grab']
        elif kwargs.get('grab_config'):
            task.setup_grab_config(kwargs['grab_config'])
            del kwargs['grab_config']
        elif kwargs.get('url'):
            task.url = kwargs['url']
            if task.grab_config:
                task.grab_config['url'] = kwargs['url']
            del kwargs['url']

        for key, value in kwargs.items():
            setattr(task, key, value)

        # WTF?
        # The `Task` object can't have a `delay` attribute.
        # I think on the next line the `process_delay_option` method
        # always gets None as its input argument.
        task.process_delay_option(task.get('delay', None))

        return task
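
A usage sketch of `clone`; the task name and URLs are illustrative:

    task = Task('page', url='http://example.com/')
    # The clone gets network_try_count reset to 0 and
    # task_try_count increased by one
    retry = task.clone(url='http://example.com/?attempt=2')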
Example #3
    def clone(self, **kwargs):
        """
        Clone Task instance.

        Reset network_try_count, increase task_try_count.
        Reset priority attribute if it was not set explicitly.
        """

        # First, create exact copy of the current Task object
        attr_copy = self.__dict__.copy()
        if attr_copy.get('grab_config') is not None:
            del attr_copy['url']
        if not attr_copy['priority_set_explicitly']:
            attr_copy['priority'] = None
        task = Task(**attr_copy)

        # Reset some task properties if they have not
        # been set explicitly in kwargs
        if 'network_try_count' not in kwargs:
            task.network_try_count = 0
        if 'task_try_count' not in kwargs:
            task.task_try_count = self.task_try_count + 1
        if 'refresh_cache' not in kwargs:
            task.refresh_cache = False
        if 'disable_cache' not in kwargs:
            task.disable_cache = False

        if kwargs.get('url') is not None and kwargs.get('grab') is not None:
            raise SpiderMisuseError('Options url and grab could not be '
                                    'used together')

        if (kwargs.get('url') is not None
                and kwargs.get('grab_config') is not None):
            raise SpiderMisuseError('Options url and grab_config could not '
                                    'be used together')

        if (kwargs.get('grab') is not None
                and kwargs.get('grab_config') is not None):
            raise SpiderMisuseError('Options grab and grab_config could not '
                                    'be used together')

        if kwargs.get('grab'):
            task.setup_grab_config(kwargs['grab'].dump_config())
            del kwargs['grab']
        elif kwargs.get('grab_config'):
            task.setup_grab_config(kwargs['grab_config'])
            del kwargs['grab_config']
        elif kwargs.get('url'):
            task.url = kwargs['url']
            if task.grab_config:
                task.grab_config['url'] = kwargs['url']
            del kwargs['url']

        for key, value in kwargs.items():
            setattr(task, key, value)

        task.process_delay_option(None)

        return task
Example #4
    def add_task(self, task, queue=None, raise_error=False):
        """
        Add task to the task queue.
        """

        if queue is None:
            queue = self.task_queue
        if queue is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')
        if task.priority is None or not task.priority_set_explicitly:
            task.priority = self.generate_task_priority()
            task.priority_set_explicitly = False
        else:
            task.priority_set_explicitly = True

        if not task.url.startswith(('http://', 'https://', 'ftp://',
                                    'file://', 'feed://')):
            self.stat.collect('task-with-invalid-url', task.url)
            msg = 'Invalid task URL: %s' % task.url
            if raise_error:
                raise SpiderError(msg)
            else:
                logger.error(
                    '%s\nTraceback:\n%s', msg, ''.join(format_stack()),
                )
                return False
        else:
            # TODO: keep original task priority if it was set explicitly
            # WTF does the previous comment mean?
            queue.put(
                task, priority=task.priority, schedule_time=task.schedule_time
            )
            return True
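
A scheduling sketch, assuming the queue was configured first with the `setup_queue` method referenced by the error message (its arguments depend on the chosen backend):

    bot.setup_queue()  # assumption: in-memory backend by default
    ok = bot.add_task(Task('page', url='http://example.com/'))
    # ok is False when the URL was rejected as invalid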
Example #5
    def setup_cache(self,
                    backend='mongo',
                    database=None,
                    use_compression=True,
                    **kwargs):
        """
        Setup cache.

        :param backend: Backend name
            Should be one of the following: 'mongo', 'mysql' or 'postgresql'.
        :param database: Database name.
        :param kwargs: Additional credentials for backend.

        """
        if database is None:
            raise SpiderMisuseError('setup_cache method requires database '
                                    'option')
        self.cache_enabled = True
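        # A non-empty fromlist ('foo' is just a dummy name) makes
        # __import__ return the submodule itself rather than the
        # top-level grab package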
        mod = __import__('grab.spider.cache_backend.%s' % backend, globals(),
                         locals(), ['foo'])
        cache = mod.CacheBackend(database=database,
                                 use_compression=use_compression,
                                 spider=self,
                                 **kwargs)
        self.cache_pipeline = CachePipeline(self, cache)
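
A usage sketch; the database name is illustrative and the backend module is assumed to exist under grab.spider.cache_backend:

    bot.setup_cache(backend='mongo', database='spider_cache',
                    use_compression=True)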
Example #6
    def setup_cache(self, backend='mongodb', database=None,
                    **kwargs):
        """
        Setup cache.

        :param backend: Backend name
            Should be one of the following: 'mongodb', 'mysql'
            or 'postgresql'.
        :param database: Database name.
        :param kwargs: Additional credentials for backend.

        """
        if database is None:
            raise SpiderMisuseError('setup_cache method requires database '
                                    'option')
        if backend == 'mongo':
            warn('Backend name "mongo" is deprecated. Use "mongodb" instead.')
            backend = 'mongodb'
        mod = __import__('grab.spider.cache_backend.%s' % backend,
                         globals(), locals(), ['foo'])
        backend = mod.CacheBackend(
            database=database, spider=self, **kwargs
        )
        self.cache_reader_service = CacheReaderService(self, backend)
        backend = mod.CacheBackend(
            database=database, spider=self, **kwargs
        )
        self.cache_writer_service = CacheWriterService(self, backend)
Example #7
    def add_task(self, task):
        """
        Add task to the task queue.
        """

        if self.taskq is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')
        if task.priority is None or not task.priority_is_custom:
            task.priority = self.generate_task_priority()
            task.priority_is_custom = False
        else:
            task.priority_is_custom = True

        if not isinstance(task, NullTask):
            if not task.url.startswith(
                    ('http://', 'https://', 'ftp://', 'file://', 'feed://')):
                if self.base_url is None:
                    msg = 'Could not resolve relative URL because base_url ' \
                          'is not specified. Task: %s, URL: %s'\
                          % (task.name, task.url)
                    logger.error(msg)
                    self.add_item('task-with-invalid-url', task.url)
                    return False
                else:
                    task.url = urljoin(self.base_url, task.url)
                    # If task has grab_config object then update it too
                    if task.grab_config:
                        task.grab_config['url'] = task.url

        # TODO: keep original task priority if it was set explicitly
        self.taskq.put(task, task.priority, schedule_time=task.schedule_time)
        return True
Example #8
 def setup_cache(self, backend='mongo', database=None,
                 use_compression=True, **kwargs):
     if database is None:
         raise SpiderMisuseError(
             'setup_cache method requires database option')
     self.cache_enabled = True
     mod = __import__('grab.spider.cache_backend.%s' % backend,
                      globals(), locals(), ['foo'])
     self.cache = mod.CacheBackend(database=database,
                                   use_compression=use_compression,
                                   spider=self, **kwargs)
Example #9
File: mongo.py Project: subeax/grab
 def put(self, task, priority, schedule_time=None):
     if schedule_time is not None:
         raise SpiderMisuseError(
             'Mongo task queue does not support delayed tasks')
     item = {
         'task': Binary(pickle.dumps(task)),
         'priority': priority,
     }
     self.collection.save(item)
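
A usage sketch against a constructed queue instance; the priority value is illustrative. Passing any non-None schedule_time raises SpiderMisuseError because this backend cannot delay tasks:

    queue.put(task, priority=5)                    # stored as pickled Binary
    queue.put(task, priority=5, schedule_time=60)  # raises SpiderMisuseError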
Example #10
    def load_proxylist(self,
                       source,
                       source_type=None,
                       proxy_type='http',
                       auto_init=True,
                       auto_change=True):
        """
        Load proxy list.

        :param source: Proxy source.
            Accepts string (file path, url) or ``BaseProxySource`` instance.
        :param source_type: The type of the specified source.
            Should be one of the following: 'text_file' or 'url'.
        :param proxy_type:
            Should be one of the following: 'socks4', 'socks5' or 'http'.
        :param auto_init:
            If set to `True` and `auto_change` is `False` then a random
            proxy is selected once while loading the list.
        :param auto_change:
            If set to `True` then automatic random proxy rotation
            will be used.

        Proxy source format should be one of the following (for each line):
            - ip:port
            - ip:port:login:password

        """
        self.proxylist = ProxyList()
        if isinstance(source, BaseProxySource):
            self.proxylist.set_source(source)
        elif isinstance(source, six.string_types):
            if source_type == 'text_file':
                self.proxylist.load_file(source, proxy_type=proxy_type)
            elif source_type == 'url':
                self.proxylist.load_url(source, proxy_type=proxy_type)
            else:
                raise SpiderMisuseError('Method `load_proxylist` received '
                                        'invalid `source_type` argument: %s' %
                                        source_type)
        else:
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source` argument: %s' % source)

        self.proxylist_enabled = True
        self.proxy = None
        if not auto_change and auto_init:
            self.proxy = self.proxylist.get_random_proxy()
        self.proxy_auto_change = auto_change
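
A sample proxy list file in the format described by the docstring; the addresses and credentials are illustrative:

    # proxies.txt
    192.0.2.10:3128
    192.0.2.11:3128:login:password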
Example #11
 def add_interface(self, backend=None, **kwargs):
     if backend == 'redis':
         iface = RedisCommandInterface(self.spider.get_name(), **kwargs)
         self.ifaces[backend] = iface
         self.enabled = True
         return iface
     else:
         raise SpiderMisuseError('Unknown command interface: %s' % backend)
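
A usage sketch; the owning object is shown as a hypothetical `controller`, and the keyword arguments forwarded to RedisCommandInterface are an assumption (Redis connection options):

    iface = controller.add_interface('redis', host='localhost', port=6379)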
Example #12
    def put(self, task, priority, schedule_time=None):
        if schedule_time is not None:
            raise SpiderMisuseError('Redis task queue does not support '
                                    'delayed task')
        # Add an attribute with a random value.
        # This is required because the qr library
        # does not allow storing multiple values with the same hash
        # in the PriorityQueue.
        task.redis_qr_rnd = random.random()
        self.queue_object.push(task, priority)
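
A usage sketch; without the random `redis_qr_rnd` attribute two tasks that serialize identically would collide in the underlying qr PriorityQueue:

    queue.put(task, priority=5)
    queue.put(task.clone(), priority=5)  # still stored as a separate entry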
Example #13
    def add_task(self, task, raise_error=False):
        """
        Add task to the task queue.
        """

        # MP:
        # ***
        if self.parser_mode:
            self.parser_result_queue.put((task, None))
            return

        if self.task_queue is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')
        if task.priority is None or not task.priority_is_custom:
            task.priority = self.generate_task_priority()
            task.priority_is_custom = False
        else:
            task.priority_is_custom = True

        try:
            if not task.url.startswith(
                    ('http://', 'https://', 'ftp://', 'file://', 'feed://')):
                if self.base_url is None:
                    msg = 'Could not resolve relative URL because base_url ' \
                          'is not specified. Task: %s, URL: %s'\
                          % (task.name, task.url)
                    raise SpiderError(msg)
                else:
                    warn('Class attribute `Spider::base_url` is deprecated. '
                         'Use Task objects with absolute URLs')
                    task.url = urljoin(self.base_url, task.url)
                    # If task has grab_config object then update it too
                    if task.grab_config:
                        task.grab_config['url'] = task.url
        except Exception as ex:
            self.stat.collect('task-with-invalid-url', task.url)
            if raise_error:
                raise
            else:
                logger.error('', exc_info=ex)
                return False

        # TODO: keep original task priority if it was set explicitly
        self.task_queue.put(task,
                            task.priority,
                            schedule_time=task.schedule_time)
        return True
Example #14
    def add_task(self, task, raise_error=False):
        """
        Add task to the task queue.
        """

        # MP:
        # ***
        if self.parser_mode:
            self.parser_result_queue.put((task, None))
            return

        if self.task_queue is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')
        if task.priority is None or not task.priority_set_explicitly:
            task.priority = self.generate_task_priority()
            task.priority_set_explicitly = False
        else:
            task.priority_set_explicitly = True

        if not task.url.startswith(('http://', 'https://', 'ftp://',
                                    'file://', 'feed://')):
            self.stat.collect('task-with-invalid-url', task.url)
            msg = ('It is not allowed to build Task object with '
                   'relative URL: %s' % task.url)
            ex = SpiderError(msg)
            if raise_error:
                raise ex
            else:
                # Just want to print traceback
                # Do this to avoid the error
                # http://bugs.python.org/issue23003
                # FIXME: use something less awkward
                try:
                    raise ex
                except SpiderError as ex:
                    logger.error('', exc_info=ex)
                return False

        # TODO: keep original task priority if it was set explicitly
        # WTF does the previous comment mean?
        self.task_queue.put(task, task.priority,
                            schedule_time=task.schedule_time)
        return True
Example #15
    def add_task(self, task):
        """
        Add task to the task queue.

        Abort the task which was restarted too many times.
        """

        if self.taskq is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')
        if task.priority is None or not task.priority_is_custom:
            task.priority = self.generate_task_priority()
            task.priority_is_custom = False
        else:
            task.priority_is_custom = True

        if not isinstance(task, NullTask):
            if not task.url.startswith(('http://', 'https://',
                                        'ftp://', 'file://')):
                if self.base_url is None:
                    msg = ('Could not resolve relative URL because base_url '
                           'is not specified. Task: %s, URL: %s'
                           % (task.name, task.url))
                    logger.error(msg)
                    self.add_item('task-with-invalid-url', task.url)
                    return False
                else:
                    task.url = urljoin(self.base_url, task.url)
                    # If task has grab_config object then update it too
                    if task.grab_config:
                        task.grab_config['url'] = task.url

        if self.config.get('GRAB_TASK_REFRESH_CACHE',
                           {}).get(task.name, False):
            task.refresh_cache = True

        if not self.config.get('TASK_ENABLED', {}).get(task.name, True):
            logger.debug('Task %s disabled via config' % task.name)
            self.inc_count('task-disabled-via-config')
            is_valid = False
        else:
            # TODO: keep original task priority if it was set explicitly
            self.taskq.put(task, task.priority,
                           schedule_time=task.schedule_time)
            is_valid = True
        return is_valid
Example #16
    def __init__(
        self,
        thread_number=3,
        network_try_limit=10,
        task_try_limit=10,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        only_cache=False,
        config=None,
        slave=False,
        max_task_generator_chunk=None,
        args=None,
        # New options start here
        waiting_shutdown_event=None,
        taskq=None,
        result_queue=None,
        network_response_queue=None,
        shutdown_event=None,
        generator_done_event=None,
        ng=False,
    ):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to try sending a request
            again if a network error occurred, use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            but task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        New options:
        * waiting_shutdown_event=None,
        * taskq=None,
        * result_queue=None,
        * network_response_queue=None,
        * shutdown_event=None,
        * generator_done_event=None
        """

        # New options start here
        self.waiting_shutdown_event = waiting_shutdown_event
        self.taskq = taskq
        self.result_queue = result_queue
        self.shutdown_event = shutdown_event
        self.generator_done_event = generator_done_event
        self.network_response_queue = network_response_queue
        self.ng = ng
        # New options end here

        if args is None:
            self.args = {}
        else:
            self.args = args

        self.slave = slave

        self.max_task_generator_chunk = max_task_generator_chunk
        self.timers = {
            'network-name-lookup': 0,
            'network-connect': 0,
            'network-total': 0,
        }
        self.time_points = {}
        self.start_timer('total')
        if config is not None:
            self.config = config
        else:
            # Fix circular import error
            from grab.util.config import Config
            self.config = Config()

        if meta:
            self.meta = meta
        else:
            self.meta = {}

        self.task_generator_enabled = False
        self.only_cache = only_cache
        self.thread_number = thread_number
        self.counters = defaultdict(int)
        self._grab_config = {}
        self.items = {}
        self.task_try_limit = task_try_limit
        self.network_try_limit = network_try_limit
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode

        try:
            signal.signal(signal.SIGUSR1, self.sigusr1_handler)
        except (ValueError, AttributeError):
            pass

        try:
            signal.signal(signal.SIGUSR2, self.sigusr2_handler)
        except (ValueError, AttributeError):
            pass

        # Initial cache-subsystem values
        self.cache_enabled = False
        self.cache = None

        self.work_allowed = True
        if request_pause is not NULL:
            logger.error('Option `request_pause` is deprecated and is not '
                         'supported anymore')

        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False

        # FIXIT: REMOVE
        self.dump_spider_stats = None

        self.controller = CommandController(self)

        # snapshots contains information about spider's state
        # for each 10 seconds interval
        self.snapshots = {}
        self.last_snapshot_values = {
            'timestamp': 0,
            'download-size': 0,
            'upload-size': 0,
            'download-size-with-cache': 0,
            'request-count': 0,
        }
        self.snapshot_timestamps = []
        self.snapshot_interval = self.config.get('GRAB_SNAPSHOT_CONFIG',
                                                 {}).get('interval', 10)
        self.snapshot_file = self.config.get('GRAB_SNAPSHOT_CONFIG',
                                             {}).get('file', None)
        if self.snapshot_file:
            # Truncate the snapshot file and close the handle properly
            with open(self.snapshot_file, 'w') as snapshot_file:
                snapshot_file.write('')
Example #17
    def __init__(
            self,
            thread_number=None,
            network_try_limit=None, task_try_limit=None,
            request_pause=NULL,
            priority_mode='random',
            meta=None,
            config=None,
            args=None,
            parser_requests_per_process=10000,
            parser_pool_size=1,
            http_api_port=None,
            network_service='threaded',
            grab_transport='pycurl',
            # Deprecated
            transport=None,
            only_cache=False,
        ):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to try sending a request
            again if a network error occurred, use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            but task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        """

        self.fatal_error_queue = Queue()
        self.task_queue_parameters = None
        self.http_api_port = http_api_port
        self._started = None
        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport
        self.parser_requests_per_process = parser_requests_per_process
        self.stat = Stat()
        self.task_queue = None
        if args is None:
            self.args = {}
        else:
            self.args = args
        if config is not None:
            self.config = config
        else:
            self.config = {}
        if meta:
            self.meta = meta
        else:
            self.meta = {}
        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))
        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode
        if only_cache:
            raise_feature_is_deprecated('Cache feature')
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')
        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
        self.parser_pool_size = parser_pool_size
        self.parser_service = ParserService(
            spider=self,
            pool_size=self.parser_pool_size,
        )
        if transport is not None:
            warn('The "transport" argument of Spider constructor is'
                 ' deprecated. Use "network_service" argument.')
            network_service = transport
        assert network_service in ('threaded',)
        if network_service == 'threaded':
            # pylint: disable=no-name-in-module, import-error
            from grab.spider.network_service.threaded import (
                NetworkServiceThreaded
            )
            self.network_service = NetworkServiceThreaded(
                self, self.thread_number
            )
        self.task_dispatcher = TaskDispatcherService(self)
        if self.http_api_port:
            self.http_api_service = HttpApiService(self)
        else:
            self.http_api_service = None
        self.task_generator_service = TaskGeneratorService(
            self.task_generator(), self,
        )
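
A construction sketch for this variant, assuming a Spider subclass; the argument values are illustrative, and `run` is assumed from the wider Spider API rather than shown above:

    bot = MySpider(
        thread_number=4,
        network_service='threaded',
        grab_transport='pycurl',
    )
    bot.run()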
Example #18
    def __init__(self, thread_number=None,
                 network_try_limit=None, task_try_limit=None,
                 request_pause=NULL,
                 priority_mode='random',
                 meta=None,
                 only_cache=False,
                 config=None,
                 slave=None,
                 args=None,
                 # New options start here
                 taskq=None,
                 # MP:
                 network_result_queue=None,
                 parser_result_queue=None,
                 is_parser_idle=None,
                 shutdown_event=None,
                 mp_mode=False,
                 parser_pool_size=None,
                 parser_mode=False,
                 parser_requests_per_process=10000,
                 # http api
                 http_api_port=None,
                 transport='multicurl',
                 grab_transport='pycurl',
                 ):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to try sending a request
            again if a network error occurred, use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            but task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        New options:
        * taskq=None,
        * network_response_queue=None,
        """

        if slave is not None:
            raise SpiderConfigurationError(
                'Slave mode is not supported anymore. '
                'Use `mp_mode=True` option to run multiple HTML'
                ' parser processes.')

        # API:
        self.http_api_port = http_api_port

        assert transport in ('multicurl', 'threaded')
        self.transport_name = transport

        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport

        # MP:
        self.mp_mode = mp_mode
        if self.mp_mode:
            from multiprocessing import Process, Event, Queue
        else:
            from multiprocessing.dummy import Process, Event, Queue

        if network_result_queue is not None:
            self.network_result_queue = network_result_queue
        else:
            self.network_result_queue = Queue()
        self.parser_result_queue = parser_result_queue
        self.is_parser_idle = is_parser_idle
        if shutdown_event is not None:
            self.shutdown_event = shutdown_event
        else:
            self.shutdown_event = Event()
        if not self.mp_mode and parser_pool_size and parser_pool_size > 1:
            raise SpiderConfigurationError(
                'Parser pool size could be only 1 in '
                'non-multiprocess mode')
        self.parser_pool_size = parser_pool_size
        self.parser_mode = parser_mode
        self.parser_requests_per_process = parser_requests_per_process

        self.stat = Stat()
        self.timer = Timer()
        self.task_queue = taskq

        if args is None:
            self.args = {}
        else:
            self.args = args

        if config is not None:
            self.config = config
        else:
            self.config = {}

        if meta:
            self.meta = meta
        else:
            self.meta = {}

        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))

        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode

        self.only_cache = only_cache
        self.cache_pipeline = None
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')

        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
Example #19
    def __init__(
        self,
        thread_number=None,
        network_try_limit=None,
        task_try_limit=None,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        only_cache=False,
        config=None,
        slave=False,
        max_task_generator_chunk=None,
        args=None,
        # New options start here
        taskq=None,
    ):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to try sending a request
            again if a network error occurred, use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            but task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        New options:
        * taskq=None,
        * network_response_queue=None,
        """

        self.taskq = taskq

        if args is None:
            self.args = {}
        else:
            self.args = args

        self.slave = slave

        self.max_task_generator_chunk = max_task_generator_chunk
        self.timers = {
            'network-name-lookup': 0,
            'network-connect': 0,
            'network-total': 0,
        }
        self.time_points = {}
        self.start_timer('total')
        if config is not None:
            self.config = config
        else:
            self.config = {}

        if meta:
            self.meta = meta
        else:
            self.meta = {}

        self.task_generator_enabled = False
        self.only_cache = only_cache

        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number', 3)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', 10)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit', 10)))

        self.counters = defaultdict(int)
        self._grab_config = {}
        self.items = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode

        try:
            signal.signal(signal.SIGUSR1, self.sigusr1_handler)
        except (ValueError, AttributeError):
            pass

        try:
            signal.signal(signal.SIGUSR2, self.sigusr2_handler)
        except (ValueError, AttributeError):
            pass

        # Initial cache-subsystem values
        self.cache_enabled = False
        self.cache = None

        self.work_allowed = True
        if request_pause is not NULL:
            logger.error('Option `request_pause` is deprecated and is not '
                         'supported anymore')

        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
Example #20
    def __init__(
            self,
            name=None,
            url=None,
            grab=None,
            grab_config=None,
            priority=None,
            priority_set_explicitly=True,
            network_try_count=0,
            task_try_count=1,
            valid_status=None,
            use_proxylist=True,
            delay=None,
            raw=False,
            callback=None,
            fallback_name=None,
            # deprecated
            disable_cache=False,
            refresh_cache=False,
            cache_timeout=None,
            # kwargs
            **kwargs):
        """
        Create `Task` object.

        If more than one of the url, grab and grab_config options is
        non-empty, then they are processed in the following order:
        * grab overwrites grab_config
        * grab_config overwrites url

        Args:
            :param name: name of the task. After a successful network
                operation the task's result will be passed to the
                `task_<name>` method.
            :param url: URL of the network document. Any task requires the
                `url` or `grab` option to be specified.
            :param grab: configured `Grab` instance. You can use this option
                when the `url` option is not enough. Do not forget to
                configure the `url` option of the `Grab` instance, because in
                this case the `url` option of the `Task` constructor will be
                overwritten with `grab.config['url']`.
            :param priority: priority of the Task. Tasks with lower priority
                will be processed earlier. By default each new task is
                assigned a random priority from the (80, 100) range.
            :param priority_set_explicitly: internal flag which tells whether
                the task priority was assigned manually or generated by the
                spider according to priority generation rules.
            :param network_try_count: you will probably not need to use it.
                It is used internally to control how many times this task was
                restarted due to network errors. The `Spider` instance has a
                `network_try_limit` option. When the `network_try_count`
                attribute of the task exceeds the `network_try_limit`
                attribute, processing of the task is abandoned.
            :param task_try_count: the same as `network_try_count` but it is
                increased only when you use the `clone` method. You can also
                set it manually. It is useful if you want to restart the task
                after it was cancelled due to multiple network errors. As you
                might have guessed, there is a `task_try_limit` option in the
                `Spider` instance. Together the `task_try_limit` and
                `network_try_limit` options guarantee that you will not get
                an infinite loop of restarting some task.
            :param valid_status: extra status codes which count as valid
            :param use_proxylist: whether to use the proxy list which was
                configured via the `setup_proxylist` method of the spider
            :param delay: if specified, tells the spider to schedule the task
                and execute it after `delay` seconds
            :param raw: if `raw` is True then the network response is
                forwarded to the corresponding handler without any check of
                the HTTP status code or network error; if `raw` is False (the
                default) then a failed response is put back into the task
                queue or, if the tries limit is reached, the processing of
                this request is finished.
            :param callback: if you pass some function in the `callback`
                option then the network response will be passed to this
                callback; the usual 'task_*' handler will be ignored and no
                error will be raised if such a 'task_*' handler does not
                exist.
            :param fallback_name: the name of the method that is called when
                the spider gives up on the task (due to multiple network
                errors)

            Any non-standard named arguments passed to the `Task` constructor
            will be saved as attributes of the object. You can get their
            values later as attributes or with the `get` method, which allows
            using a default value if the attribute does not exist.
        """

        if disable_cache or refresh_cache or cache_timeout:
            raise_feature_is_deprecated('Cache feature')

        if name == 'generator':
            # The name "generator" is restricted because
            # `task_generator` handler could not be created because
            # this name is already used for special method which
            # generates new tasks
            raise SpiderMisuseError('Task name could not be "generator"')

        self.name = name

        if url is None and grab is None and grab_config is None:
            raise SpiderMisuseError('Either url, grab or grab_config argument '
                                    'of Task constructor should not be None')

        if url is not None and grab is not None:
            raise SpiderMisuseError('Options url and grab could not be used '
                                    'together')

        if url is not None and grab_config is not None:
            raise SpiderMisuseError('Options url and grab_config could not be '
                                    'used together')

        if grab is not None and grab_config is not None:
            raise SpiderMisuseError(
                'Options grab and grab_config could not be used together')

        if grab:
            self.setup_grab_config(grab.dump_config())
        elif grab_config:
            self.setup_grab_config(grab_config)
        else:
            self.grab_config = None
            self.url = url

        if valid_status is None:
            self.valid_status = []
        else:
            self.valid_status = valid_status

        self.process_delay_option(delay)

        self.fallback_name = fallback_name
        self.priority_set_explicitly = priority_set_explicitly
        self.priority = priority
        self.network_try_count = network_try_count
        self.task_try_count = task_try_count
        self.use_proxylist = use_proxylist
        self.raw = raw
        self.callback = callback
        self.coroutines_stack = []
        for key, value in kwargs.items():
            setattr(self, key, value)
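
A usage sketch of the extra-kwargs behavior described in the docstring; the `page_type` attribute is illustrative:

    task = Task('page', url='http://example.com/', page_type='listing')
    task.page_type          # 'listing', saved as a plain attribute
    task.get('missing', 0)  # 0: `get` falls back to the default value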