Esempio n. 1
0
 def test_without_none_values(self):
     """Check without_none_values on a list, a tuple and a dict input."""
     # Sequences: the expected result has the same container type as the input.
     list_result = without_none_values([1, None, 3, 4])
     self.assertEqual(list_result, [1, 3, 4])
     tuple_result = without_none_values((1, None, 3, 4))
     self.assertEqual(tuple_result, (1, 3, 4))
     # Mapping: the key whose value is None is expected to be absent.
     given = {'one': 1, 'none': None, 'three': 3, 'four': 4}
     expected = {'one': 1, 'three': 3, 'four': 4}
     self.assertEqual(without_none_values(given), expected)
Esempio n. 2
0
def build_component_list(compdict, convert=update_classpath):
    """Return the component paths of ``compdict`` ordered by their value.

    Every key is passed through ``convert``, the resulting mapping is
    filtered with ``without_none_values`` and its keys are returned as a
    list sorted by value. A list or tuple is accepted as well: its items are
    converted and returned in a container of the same type, unsorted.

    Raises ValueError when the uniqueness checks below detect that several
    paths convert to the same object.
    """
    def _ensure_unique(paths):
        # Converted paths collapse in a set, so a smaller set means a clash.
        converted = {convert(path) for path in paths}
        if len(converted) != len(paths):
            raise ValueError('Some paths in {!r} convert to the same object, '
                             'please update your settings'.format(paths))

    def _convert_keys(components):
        if not isinstance(components, BaseSettings):
            _ensure_unique(components)
            return {convert(path): order
                    for path, order in six.iteritems(components)}
        remapped = BaseSettings()
        for path, order in six.iteritems(components):
            priority = components.getpriority(path)
            # The converted key is already stored with this priority: clash.
            if remapped.getpriority(convert(path)) == priority:
                raise ValueError('Some paths in {!r} convert to the same '
                                 'object, please update your settings'
                                 ''.format(list(components.keys())))
            remapped.set(convert(path), order, priority=priority)
        return remapped

    if isinstance(compdict, (list, tuple)):
        _ensure_unique(compdict)
        return type(compdict)(convert(item) for item in compdict)
    active = without_none_values(_convert_keys(compdict))
    ordered = sorted(six.iteritems(active), key=itemgetter(1))
    return [path for path, _ in ordered]
Esempio n. 3
0
 def process_options(self, args, opts):
     """Parse the spider arguments and apply the output options to settings.

     Raises UsageError for a malformed -a value or for an output format that
     is not among the keys of the filtered FEED_EXPORTERS setting.
     """
     ScrapyCommand.process_options(self, args, opts)
     try:
         opts.spargs = arglist_to_dict(opts.spargs)
     except ValueError:
         raise UsageError("Invalid -a value, use -a NAME=VALUE",
                          print_help=False)
     if not opts.output:
         return
     # An output of '-' is stored as the 'stdout:' feed URI.
     feed_uri = 'stdout:' if opts.output == '-' else opts.output
     self.settings.set('FEED_URI', feed_uri, priority='cmdline')
     exporters = without_none_values(
         self.settings.getwithbase('FEED_EXPORTERS'))
     known_formats = exporters.keys()
     if not opts.output_format:
         # No explicit format: use the file extension with dots removed.
         extension = os.path.splitext(opts.output)[1]
         opts.output_format = extension.replace(".", "")
     if opts.output_format not in known_formats:
         raise UsageError(
             "Unrecognized output format '%s', set one"
             " using the '-t' switch or as a file extension"
             " from the supported list %s" %
             (opts.output_format, tuple(known_formats)))
     self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
Esempio n. 4
0
def build_component_list(compdict, convert=update_classpath):
    """Build the ordered list of components described by ``compdict``.

    Keys are run through ``convert``, the mapping is filtered with
    ``without_none_values`` and the keys are returned sorted by their value.
    A list or tuple input is converted item by item and returned as the
    same container type, unsorted.
    """

    def _reject_clashes(names):
        distinct = {convert(name) for name in names}
        if len(distinct) == len(names):
            return
        raise ValueError('Some paths in {!r} convert to the same object, '
                         'please update your settings'.format(names))

    def _rekey(source):
        if isinstance(source, BaseSettings):
            target = BaseSettings()
            for key, value in six.iteritems(source):
                level = source.getpriority(key)
                # Same priority already recorded for the converted key.
                if target.getpriority(convert(key)) == level:
                    raise ValueError('Some paths in {!r} convert to the same '
                                     'object, please update your settings'
                                     ''.format(list(source.keys())))
                target.set(convert(key), value, priority=level)
            return target
        _reject_clashes(source)
        return {convert(key): value for key, value in six.iteritems(source)}

    if isinstance(compdict, (list, tuple)):
        _reject_clashes(compdict)
        return type(compdict)(convert(entry) for entry in compdict)

    by_order = sorted(
        six.iteritems(without_none_values(_rekey(compdict))),
        key=itemgetter(1))
    return [name for name, _order in by_order]
Esempio n. 5
0
 def test_without_none_values(self):
     """Run without_none_values over list, tuple and dict fixtures."""
     # Each pair is (input, expected output); checked in this order.
     cases = (
         ([1, None, 3, 4], [1, 3, 4]),
         ((1, None, 3, 4), (1, 3, 4)),
         (
             {"one": 1, "none": None, "three": 3, "four": 4},
             {"one": 1, "three": 3, "four": 4},
         ),
     )
     for given, expected in cases:
         self.assertEqual(without_none_values(given), expected)
Esempio n. 6
0
    def __init__(self, crawler):
        """Register the configured download handlers and load each of them.

        Translated original note: the download handler used for a resource
        is picked according to the resource's type; the http and https
        handlers are the most commonly used ones.
        """
        self._crawler = crawler
        # scheme -> handler class path, used later for instantiation
        self._schemes = {}
        # scheme -> instanced handler
        self._handlers = {}
        # remembers failed handlers
        self._notconfigured = {}
        # NOTE(review): the original author states that getwithbase also
        # picks up the DOWNLOAD_HANDLERS_BASE defaults (one handler per
        # resource type) -- confirm against the settings API.
        configured = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
        handlers = without_none_values(configured)
        for scheme, clspath in six.iteritems(handlers):
            # Record the class path for this scheme ...
            self._schemes[scheme] = clspath
            # ... then call _load_handler for it right away.
            self._load_handler(scheme, skip_lazy=True)

        crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 7
0
 def _load_components(self, setting_prefix):
     """Load the objects named by a setting into a dict with the same keys.

     The setting is read with getwithbase and filtered through
     without_none_values; entries whose load_object call raises
     NotConfigured are left out of the result.
     """
     configured = without_none_values(
         self.settings.getwithbase(setting_prefix))
     loaded = {}
     for name, path in configured.items():
         try:
             component = load_object(path)
         except NotConfigured:
             continue
         loaded[name] = component
     return loaded
Esempio n. 8
0
    def __init__(self, crawler):
        """Store the crawler and record the configured handler class paths."""
        self._crawler = crawler
        self._schemes = {}  # scheme -> class path, filled in below
        self._handlers = {}  # stores instanced handlers for schemes
        self._notconfigured = {}  # remembers failed handlers
        configured = crawler.settings._getcomposite('DOWNLOAD_HANDLERS')
        # Only entries that survive without_none_values are recorded.
        self._schemes.update(six.iteritems(without_none_values(configured)))

        crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 9
0
    def __init__(self, crawler):
        """Store the crawler and record the configured handler class paths."""
        self._crawler = crawler
        self._schemes = {}  # scheme -> class path, filled in below
        self._handlers = {}  # stores instanced handlers for schemes
        self._notconfigured = {}  # remembers failed handlers
        configured = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
        # Only entries that survive without_none_values are recorded.
        self._schemes.update(six.iteritems(without_none_values(configured)))

        crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 10
0
    def _build_component_list(self, compdict, custom=None, convert=update_classpath):
        """
        Compose a component list from a { class: order|[orders] } dictionary.

        Adapted from scrapy.utils.conf.build_component_list

        :param compdict: mapping of component path to an order, an iterable
            of orders, or None.
        :param custom: legacy second argument. A list or tuple is converted
            and returned directly; any other non-None value is merged into
            ``compdict`` with ``update`` (which mutates ``compdict``).
        :param convert: callable applied to every component path.
        :return: the keys of the pairs produced by ``dol2lot``, sorted by
            the second element of each pair.
        :raises ValueError: if several paths convert to the same object, or
            if a value is neither None nor a real number (nor an iterable
            of those).
        """

        def _check_components(complist):
            # Converted paths collapse in a set; a smaller set means a clash.
            if len({convert(c) for c in complist}) != len(complist):
                raise ValueError('Some paths in {!r} convert to the same object, '
                                 'please update your settings'.format(complist))

        def _map_keys(compdict):
            if isinstance(compdict, BaseSettings):
                compbs = BaseSettings()
                for k, v in six.iteritems(compdict):
                    prio = compdict.getpriority(k)
                    # Converted key already stored with this priority: clash.
                    if compbs.getpriority(convert(k)) == prio:
                        raise ValueError('Some paths in {!r} convert to the same '
                                         'object, please update your settings'
                                         ''.format(list(compdict.keys())))
                    else:
                        compbs.set(convert(k), v, priority=prio)
                return compbs
            else:
                _check_components(compdict)
                return {convert(k): v for k, v in six.iteritems(compdict)}

        def _validate_values(compdict):
            """Fail if a value in the components dict is not a real number or a list of them or None."""
            for name, value in six.iteritems(compdict):
                try:
                    vals = iter(value)
                except TypeError:
                    # iter() raises TypeError for non-iterables (a plain
                    # number, None): treat those as a single value. Any
                    # other exception is a real error and must propagate
                    # instead of being swallowed by a bare except.
                    vals = [value]
                for val in vals:
                    if val is not None and not isinstance(val, numbers.Real):
                        raise ValueError('Invalid value {} for component {}, please provide '
                                         'a real number or None instead'.format(val, name))

        # BEGIN Backward compatibility for old (base, custom) call signature
        if isinstance(custom, (list, tuple)):
            _check_components(custom)
            return type(custom)(convert(c) for c in custom)

        if custom is not None:
            compdict.update(custom)
        # END Backward compatibility

        _validate_values(compdict)
        compdict = without_none_values(_map_keys(compdict))
        # NOTE(review): dol2lot presumably expands {key: [orders]} into
        # (key, order) pairs -- confirm against its definition.
        comptuples = dol2lot(compdict)
        return [k for k, v in sorted(comptuples, key=itemgetter(1))]
Esempio n. 11
0
def feed_process_params_from_cli(settings, output, output_format=None):
    """
    Turn the feed export params of the 'crawl' or 'runspider' commands into
    a dictionary suitable to be used as the FEEDS setting.

    ``output`` is a sequence of "<URI>" or "<URI>:<FORMAT>" strings and
    ``output_format`` is the deprecated -t value. UsageError is raised for
    an unrecognized format, or when -t is combined with several outputs.
    """
    exporters = without_none_values(settings.getwithbase("FEED_EXPORTERS"))
    valid_output_formats = exporters.keys()

    def check_valid_format(output_format):
        if output_format in valid_output_formats:
            return
        raise UsageError(
            "Unrecognized output format '%s', set one after a"
            " colon using the -o option (i.e. -o <URI>:<FORMAT>)"
            " or as a file extension, from the supported list %s" %
            (output_format, tuple(valid_output_formats)))

    if output_format:
        if len(output) != 1:
            raise UsageError(
                "The -t command line option cannot be used if multiple"
                " output files are specified with the -o option")
        check_valid_format(output_format)
        warnings.warn(
            "The -t command line option is deprecated in favor"
            " of specifying the output format within the -o"
            " option, please check the -o option docs for more details",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        return {output[0]: {"format": output_format}}

    result = {}
    for element in output:
        pieces = element.rsplit(":", 1)
        if len(pieces) == 2:
            feed_uri, feed_format = pieces
            # '-' is only rewritten when a ':<FORMAT>' suffix was given.
            if feed_uri == "-":
                feed_uri = "stdout:"
        else:
            # No colon: the format is the file extension, dots removed.
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace(".", "")
        check_valid_format(feed_format)
        result[feed_uri] = {"format": feed_format}

    # FEEDS setting should take precedence over the -o and -t CLI options
    result.update(settings.getdict("FEEDS"))

    return result
Esempio n. 12
0
    def __init__(self, crawler):
        """Store the crawler and record the configured handler class paths."""
        self._crawler = crawler
        # scheme -> class path, used later for instantiation
        self._schemes = {}
        # scheme -> downloader for that scheme
        self._handlers = {}
        # remembers failed handlers
        self._notconfigured = {}
        # NOTE(review): the original author notes that getwithbase reads the
        # DOWNLOAD_HANDLERS_BASE configuration -- confirm against the
        # settings API.
        configured = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
        # Record the class path of every scheme left after filtering.
        self._schemes.update(six.iteritems(without_none_values(configured)))

        crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 13
0
    def __init__(self, crawler):
        """Register the configured download handlers.

        Translated original note: the download handlers manage the
        downloader for each kind of resource; the matching downloader is
        selected when a network request is actually issued.
        """
        self._crawler = crawler
        # stores acceptable schemes on instancing
        self._schemes = {}
        # stores instanced handlers for schemes
        self._handlers = {}
        # remembers failed handlers
        self._notconfigured = {}
        # Handlers come from the DOWNLOAD_HANDLERS setting (the original
        # author points at DOWNLOAD_HANDLERS_BASE -- TODO confirm).
        configured = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
        usable = without_none_values(configured)
        for scheme, clspath in six.iteritems(usable):
            # Keep the class path of each handler for later instantiation.
            self._schemes[scheme] = clspath
            self._load_handler(scheme, skip_lazy=True)

        crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 14
0
 def __init__(self, crawler):
     """Register the configured download handlers and load each of them."""
     self._crawler = crawler
     # stores acceptable schemes on instancing
     self._schemes = {}
     # stores instanced handlers for schemes
     self._handlers = {}
     # remembers failed handlers
     self._notconfigured = {}
     # Keep only the handler paths that are not None.
     configured = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
     usable = without_none_values(configured)
     for scheme, clspath in usable.items():
         # Record the path, then call _load_handler for this scheme.
         self._schemes[scheme] = clspath
         self._load_handler(scheme, skip_lazy=True)
     # TODO: a handler is attached to signals.engine_stopped here; the
     # original author had not yet worked out its exact purpose.
     crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 15
0
    def __init__(self, crawler):
        """Register the configured download handlers and load each of them."""
        self._crawler = crawler
        # stores acceptable schemes on instancing (scheme -> class path)
        self._schemes = {}
        # stores instanced handlers for schemes
        self._handlers = {}
        # remembers failed handlers
        self._notconfigured = {}
        # NOTE(review): the original author says this fetches the BASE
        # settings -- confirm against the settings API.
        configured = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
        usable = without_none_values(configured)
        # Example entry given by the original author: 'http' and 'https' ->
        # 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler'
        for scheme, clspath in six.iteritems(usable):
            self._schemes[scheme] = clspath
            self._load_handler(scheme, skip_lazy=True)

        crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 16
0
def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    Values are validated, keys are passed through ``convert``, the mapping
    is filtered with ``without_none_values`` and the keys are returned
    sorted by value. With the legacy (base, custom) call signature, a list
    or tuple ``custom`` is converted and returned directly, while any other
    non-None ``custom`` is merged into ``compdict`` in place.
    """

    def _assert_distinct(paths):
        if len({convert(path) for path in paths}) == len(paths):
            return
        raise ValueError(f'Some paths in {paths!r} convert to the same object, '
                         'please update your settings')

    def _convert_keys(components):
        if not isinstance(components, BaseSettings):
            _assert_distinct(components)
            return {convert(path): order for path, order in components.items()}
        converted = BaseSettings()
        for path, order in components.items():
            prio = components.getpriority(path)
            # Converted key already stored with this priority: clash.
            if converted.getpriority(convert(path)) == prio:
                raise ValueError(f'Some paths in {list(components.keys())!r} '
                                 'convert to the same '
                                 'object, please update your settings'
                                 )
            converted.set(convert(path), order, priority=prio)
        return converted

    def _check_orders(components):
        """Fail if a value in the components dict is not a real number or None."""
        for path, order in components.items():
            if order is None or isinstance(order, numbers.Real):
                continue
            raise ValueError(f'Invalid value {order} for component {path}, '
                             'please provide a real number or None instead')

    # BEGIN Backward compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _assert_distinct(custom)
        return type(custom)(convert(item) for item in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backward compatibility

    _check_orders(compdict)
    active = without_none_values(_convert_keys(compdict))
    # Everything above is validation; the key step is sorting by the value.
    ranked = sorted(active.items(), key=itemgetter(1))
    return [path for path, _ in ranked]
Esempio n. 17
0
    def __init__(self, crawler):
        """Register the configured download handlers and load each of them."""
        self._crawler = crawler
        # stores acceptable schemes on instancing
        self._schemes = {}
        # stores instanced handlers for schemes
        self._handlers = {}
        # remembers failed handlers
        self._notconfigured = {}
        # NOTE(review): the original author notes that getwithbase reads the
        # DOWNLOAD_HANDLERS_BASE configuration -- confirm against the
        # settings API.
        configured = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
        usable = without_none_values(configured)
        for scheme, clspath in six.iteritems(usable):
            # Keep the class path of the scheme for later instantiation.
            self._schemes[scheme] = clspath
            self._load_handler(scheme, skip_lazy=True)

        crawler.signals.connect(self._close, signals.engine_stopped)
Esempio n. 18
0
def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    Values are validated, keys are passed through ``convert``, the mapping
    is filtered with ``without_none_values`` and the keys are returned
    sorted by value. With the legacy (base, custom) call signature, a list
    or tuple ``custom`` is converted and returned directly, while any other
    non-None ``custom`` is merged into ``compdict`` in place.
    """

    def _assert_distinct(paths):
        unique = {convert(path) for path in paths}
        if len(unique) != len(paths):
            raise ValueError('Some paths in {!r} convert to the same object, '
                             'please update your settings'.format(paths))

    def _convert_keys(components):
        if not isinstance(components, BaseSettings):
            _assert_distinct(components)
            return {convert(path): order
                    for path, order in six.iteritems(components)}
        converted = BaseSettings()
        for path, order in six.iteritems(components):
            prio = components.getpriority(path)
            # Converted key already stored with this priority: clash.
            if converted.getpriority(convert(path)) == prio:
                raise ValueError('Some paths in {!r} convert to the same '
                                 'object, please update your settings'
                                 ''.format(list(components.keys())))
            converted.set(convert(path), order, priority=prio)
        return converted

    def _check_orders(components):
        """Fail if a value in the components dict is not a real number or None."""
        for path, order in six.iteritems(components):
            if order is None or isinstance(order, numbers.Real):
                continue
            raise ValueError('Invalid value {} for component {}, please provide '
                             'a real number or None instead'.format(order, path))

    # BEGIN Backwards compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _assert_distinct(custom)
        return type(custom)(convert(item) for item in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backwards compatibility

    _check_orders(compdict)
    active = without_none_values(_convert_keys(compdict))
    ranked = sorted(six.iteritems(active), key=itemgetter(1))
    return [path for path, _ in ranked]
Esempio n. 19
0
def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    Values are validated, keys are passed through ``convert``, the mapping
    is filtered with ``without_none_values`` and the keys are returned
    sorted by value. With the legacy (base, custom) call signature, a list
    or tuple ``custom`` is converted and returned directly, while any other
    non-None ``custom`` is merged into ``compdict`` in place.
    """
    def _reject_clashes(names):
        distinct = {convert(name) for name in names}
        if len(distinct) == len(names):
            return
        raise ValueError("Some paths in {!r} convert to the same object, "
                         "please update your settings".format(names))

    def _rekey(source):
        if isinstance(source, BaseSettings):
            target = BaseSettings()
            for key, value in source.items():
                level = source.getpriority(key)
                # Converted key already stored with this priority: clash.
                if target.getpriority(convert(key)) == level:
                    raise ValueError("Some paths in {!r} convert to the same "
                                     "object, please update your settings"
                                     "".format(list(source.keys())))
                target.set(convert(key), value, priority=level)
            return target
        _reject_clashes(source)
        return {convert(key): value for key, value in source.items()}

    def _require_real_or_none(source):
        """Fail if a value in the components dict is not a real number or None."""
        for key, value in source.items():
            acceptable = value is None or isinstance(value, numbers.Real)
            if not acceptable:
                raise ValueError(
                    "Invalid value {} for component {}, please provide "
                    "a real number or None instead".format(value, key))

    # BEGIN Backward compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _reject_clashes(custom)
        return type(custom)(convert(entry) for entry in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backward compatibility

    _require_real_or_none(compdict)
    remaining = without_none_values(_rekey(compdict))
    by_order = sorted(remaining.items(), key=itemgetter(1))
    return [name for name, _order in by_order]
Esempio n. 20
0
 def process_options(self, args, opts):
     """Parse the spider arguments and apply the output options to settings.

     Raises UsageError for a malformed -a value or for an output format that
     is not among the keys of the filtered FEED_EXPORTERS setting.
     """
     ScrapyCommand.process_options(self, args, opts)
     try:
         opts.spargs = arglist_to_dict(opts.spargs)
     except ValueError:
         raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
     if not opts.output:
         return
     # An output of '-' is stored as the 'stdout:' feed URI.
     feed_uri = 'stdout:' if opts.output == '-' else opts.output
     self.settings.set('FEED_URI', feed_uri, priority='cmdline')
     exporters = without_none_values(self.settings._getcomposite('FEED_EXPORTERS'))
     known_formats = exporters.keys()
     if not opts.output_format:
         # No explicit format: use the file extension with dots removed.
         extension = os.path.splitext(opts.output)[1]
         opts.output_format = extension.replace(".", "")
     if opts.output_format not in known_formats:
         details = (opts.output_format, tuple(known_formats))
         raise UsageError("Unrecognized output format '%s', set one"
                          " using the '-t' switch or as a file extension"
                          " from the supported list %s" % details)
     self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
Esempio n. 21
0
 def test_without_none_values(self):
     """Check without_none_values on a list, a tuple and a dict input."""
     list_result = without_none_values([1, None, 3, 4])
     self.assertEqual(list_result, [1, 3, 4])
     tuple_result = without_none_values((1, None, 3, 4))
     self.assertEqual(tuple_result, (1, 3, 4))
     # Mapping: the key whose value is None is expected to be absent.
     given = {"one": 1, "none": None, "three": 3, "four": 4}
     expected = {"one": 1, "three": 3, "four": 4}
     self.assertEqual(without_none_values(given), expected)
Esempio n. 22
0
 def test_without_none_values(self):
     """Run without_none_values over list, tuple and dict fixtures."""
     # Each pair is (input, expected output); checked in this order.
     cases = (
         ([1, None, 3, 4], [1, 3, 4]),
         ((1, None, 3, 4), (1, 3, 4)),
         ({'one': 1, 'none': None, 'three': 3, 'four': 4},
          {'one': 1, 'three': 3, 'four': 4}),
     )
     for given, expected in cases:
         self.assertEqual(without_none_values(given), expected)
Esempio n. 23
0
def feed_process_params_from_cli(settings,
                                 output,
                                 output_format=None,
                                 overwrite_output=None):
    """
    Receives feed export params (from the 'crawl' or 'runspider' commands),
    checks for inconsistencies in their quantities and returns a dictionary
    suitable to be used as the FEEDS setting.

    :param settings: settings object; its FEED_EXPORTERS (via getwithbase)
        and FEEDS (via getdict) values are read.
    :param output: sequence of "<URI>" or "<URI>:<FORMAT>" strings (-o).
    :param output_format: deprecated -t value, only allowed with exactly
        one output URI.
    :param overwrite_output: same shape as ``output`` (-O); when given, each
        resulting feed is flagged with ``'overwrite': True``.
    :return: dict mapping feed URI to its options, updated with the FEEDS
        setting so that the setting wins over the CLI options.
    :raises UsageError: for an unrecognized format, for -o combined with
        -O, or for -t combined with several output URIs.
    """
    valid_output_formats = without_none_values(
        settings.getwithbase('FEED_EXPORTERS')).keys()

    def check_valid_format(output_format):
        if output_format not in valid_output_formats:
            raise UsageError(
                f"Unrecognized output format '{output_format}'. "
                f"Set a supported one ({tuple(valid_output_formats)}) "
                "after a colon at the end of the output URI (i.e. -o/-O "
                "<URI>:<FORMAT>) or as a file extension.")

    overwrite = False
    if overwrite_output:
        if output:
            raise UsageError(
                "Please use only one of -o/--output and -O/--overwrite-output")
        output = overwrite_output
        overwrite = True

    if output_format:
        if len(output) == 1:
            check_valid_format(output_format)
            # Plain string on purpose: a trailing comma inside the
            # parentheses would turn the message into a 1-tuple and the
            # warning text would be rendered as a tuple repr.
            message = (
                'The -t command line option is deprecated in favor of '
                'specifying the output format within the output URI. See the '
                'documentation of the -o and -O options for more information.'
            )
            warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
            return {output[0]: {'format': output_format}}
        else:
            raise UsageError(
                'The -t command-line option cannot be used if multiple output '
                'URIs are specified')

    result = {}
    for element in output:
        try:
            feed_uri, feed_format = element.rsplit(':', 1)
        except ValueError:
            # No colon in the element: the format is taken from the file
            # extension, with dots removed.
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace('.', '')
        else:
            # '-' is only rewritten when a ':<FORMAT>' suffix was given.
            if feed_uri == '-':
                feed_uri = 'stdout:'
        check_valid_format(feed_format)
        result[feed_uri] = {'format': feed_format}
        if overwrite:
            result[feed_uri]['overwrite'] = True

    # FEEDS setting should take precedence over the matching CLI options
    result.update(settings.getdict('FEEDS'))

    return result
Esempio n. 24
0
 def from_crawler(cls, crawler):
     """Build an instance from the crawler's DEFAULT_REQUEST_HEADERS setting.

     The setting is filtered through without_none_values and its items are
     passed to the constructor.
     """
     configured = crawler.settings['DEFAULT_REQUEST_HEADERS']
     usable_headers = without_none_values(configured)
     return cls(usable_headers.items())
Esempio n. 25
0
 def from_crawler(cls, crawler):
     """Create the instance from the DEFAULT_REQUEST_HEADERS setting.

     Only the header items left after without_none_values are handed to
     the constructor.
     """
     header_items = without_none_values(
         crawler.settings['DEFAULT_REQUEST_HEADERS']).items()
     return cls(header_items)