Beispiel #1
0
def test_parse_url_opts():
    # Table of (input URL, expected result) pairs.  The expected result is
    # the tuple returned by parse_url_opts: the URL with its query string
    # removed, and a dict of the query parameters with string values.
    cases = (
        # single query parameter on an http URL
        ('http://map.org/api/download/?id=157',
         ('http://map.org/api/download/', {'id': '157'})),
        # non-http scheme is handled the same way
        ('s3://bucket/save/?key=891',
         ('s3://bucket/save/', {'key': '891'})),
        # several parameters separated by '&'
        ('http://map.org/api/download/?id=98&code=13',
         ('http://map.org/api/download/', {'id': '98', 'code': '13'})),
    )
    for url, expected in cases:
        eq_(parse_url_opts(url), expected)
Beispiel #2
0
def load_pipeline_from_config(path):
    """Instantiate a pipeline described by the configuration file at `path`

    The file is read with `SafeConfigParserWithIncludes`.  The section named
    by CRAWLER_PIPELINE_SECTION is looked up first, then the one named by
    CRAWLER_PIPELINE_SECTION_DEPRECATED (a warning is logged if that one is
    used).  Only the first section found is considered.

    The section must provide a `template` option.  Example (assuming the
    section is called `crawl:pipeline`)

        [crawl:pipeline]
        template = standard
        func = pipeline1
        _kwarg1 = 1

    Here `standard` is the template name, `func` is forwarded as the `func`
    argument and every option starting with `_` is forwarded, with the
    underscore stripped, in `kwargs` (so `kwarg1='1'`) to
    `load_pipeline_from_template`.  Options may also be given URL-style in
    the template value itself, which is parsed with `parse_url_opts`

        [crawl:pipeline]
        template = standard?func=pipeline1&_kwarg1=1

    Options from the URL and from the section are combined via `updated`
    (presumably section values win on conflict -- confirm against
    `updated`).

    Parameters
    ----------
    path : str
      Path to the pipeline configuration file

    Returns
    -------
    Whatever `load_pipeline_from_template` returns for the parsed spec

    Raises
    ------
    PipelineNotSpecifiedError
      If the section found has no `template` option
    AssertionError
      If a non-underscore option other than `template` or `func` is present
    IOError
      If neither section is present in the file (or if the loader returned
      None)
    """
    parser = SafeConfigParserWithIncludes()
    parser.read([path])

    # First matching section wins; the current name is tried before the
    # deprecated one.
    candidates = (CRAWLER_PIPELINE_SECTION, CRAWLER_PIPELINE_SECTION_DEPRECATED)
    section = next(
        (name for name in candidates if parser.has_section(name)), None)

    pipeline = None
    if section is not None:
        if section == CRAWLER_PIPELINE_SECTION_DEPRECATED:
            lgr.warning(
                "Crawler section was renamed from %s to %s and format has changed"
                " please adjust",
                CRAWLER_PIPELINE_SECTION_DEPRECATED,
                CRAWLER_PIPELINE_SECTION)

        option_names = parser.options(section)
        # the template is mandatory
        if 'template' not in option_names:
            raise PipelineNotSpecifiedError(
                "%s lacks %r field within %s section" %
                (path, 'template', section))

        # the template value may carry options URL-style: name?opt=val&...
        template_name, url_opts = parse_url_opts(
            parser.get(section, 'template'))

        # options may come both from the URL and from the section itself
        section_opts = {}
        for name in option_names:
            section_opts[name] = parser.get(section, name)
        merged = updated(url_opts, section_opts)

        # '_'-prefixed options are for the pipeline function (underscore
        # stripped), the remaining ones describe the template itself
        template_opts = {}
        pipeline_kwargs = {}
        for key, value in merged.items():
            if key.startswith('_'):
                pipeline_kwargs[key[1:]] = value
            else:
                template_opts[key] = value

        # NOTE: being an assert, this check is skipped when Python runs
        # with -O
        assert set(template_opts) <= {'template', 'func'}, \
            "ATM we understand only 'func'"

        pipeline = load_pipeline_from_template(
            template_name,
            func=template_opts.get('func'),
            kwargs=pipeline_kwargs)

    if pipeline is None:
        raise IOError("Did not find section %r within %s" %
                      (CRAWLER_PIPELINE_SECTION, path))
    return pipeline