Ejemplo n.º 1
0
    def test_single_file_combined(self):
        """Check allow/block/exclude rules from list1.aclj with default 'block'."""
        aggregator = SimpleAggregator(
            {'source': FileAccessIndexSource(TEST_EXCL_PATH + 'list1.aclj')})
        checker = AccessChecker(aggregator, default_access='block')

        # (url, expected matched urlkey, expected access decision)
        cases = [
            ('http://example.com/abc/page.html', 'com,example)/abc/page.html', 'allow'),
            ('http://example.com/abc/page.htm', 'com,example)/abc', 'block'),
            ('http://example.com/abc/', 'com,example)/abc', 'block'),
            ('http://foo.example.com/', 'com,example,', 'exclude'),
            ('http://example.com/', 'com,', 'allow'),
            ('foo.net', '', 'block'),
            ('https://example.net/abc/path/other', '', 'block'),
        ]

        for url, urlkey, access in cases:
            rule = checker.find_access_rule(url)
            assert rule['urlkey'] == urlkey
            assert rule['access'] == access
Ejemplo n.º 2
0
    def load_auto_colls(self):
        """Build the auto-collection resource handler, or None if no root dir is set."""
        if not self.root_dir:
            print('No Root Dir, Skip Auto Colls!')
            return

        directory_index = CacheDirectoryIndexSource(base_prefix=self.root_dir,
                                                    base_dir=self.index_paths,
                                                    config=self.config)

        checker = AccessChecker(
            CacheDirectoryAccessSource(self.acl_paths), self.default_access)

        # Layer a redis-backed dedup index over the directory index when configured.
        if self.dedup_index_url:
            source = SimpleAggregator({
                'dedup': RedisMultiKeyIndexSource(self.dedup_index_url),
                'dir': directory_index,
            })
        else:
            source = directory_index

        return DefaultResourceHandler(source,
                                      self.archive_paths,
                                      rules_file=self.rules_file,
                                      access_checker=checker)
Ejemplo n.º 3
0
    def test_blocks_only(self):
        """Check block/exclude rules from blocks.aclj with the default access."""
        aggregator = SimpleAggregator(
            {'source': FileAccessIndexSource(TEST_EXCL_PATH + 'blocks.aclj')})
        checker = AccessChecker(aggregator)

        # (url, expected matched urlkey, expected access decision)
        cases = [
            ('https://example.com/foo', 'com,example)/foo', 'exclude'),
            ('https://example.com/food', 'com,example)/foo', 'exclude'),
            ('https://example.com/foo/path', 'com,example)/foo', 'exclude'),
            ('https://example.net/abc/path', 'net,example)/abc/path', 'block'),
            ('https://example.net/abc/path/other', 'net,example)/abc/path', 'block'),
            ('https://example.net/fo', '', 'allow'),
        ]

        for url, urlkey, access in cases:
            rule = checker.find_access_rule(url)
            assert rule['urlkey'] == urlkey
            assert rule['access'] == access
Ejemplo n.º 4
0
 def make_live_app():
     """Return a BaseWarcServer exposing a single live-web index route at /live."""
     live_handler = DefaultResourceHandler(
         SimpleAggregator({'live': LiveIndexSource()}))
     server = BaseWarcServer()
     server.add_route('/live', live_handler)
     return server
Ejemplo n.º 5
0
    def test_extra_agg_collB(self):
        """A 'param.coll': 'B' query should resolve against collection B's index."""
        aggregator = SimpleAggregator({'dir': self.dir_loader})
        results, errors = aggregator({'url': 'iana.org/', 'param.coll': 'B'})

        expected = [{'source': to_path('dir:colls:B/indexes/iana.cdxj'),
                     'timestamp': '20140126200624',
                     'filename': 'iana.warc.gz'}]

        assert to_json_list(results) == expected
        assert errors == {}
Ejemplo n.º 6
0
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None):
    """Create an aggregator over the given index-source configurations.

    :param dict source_configs: mapping of source name -> source config;
        each config is passed through ``init_index_source()``
    :param bool use_gevent: if True, wrap the sources in a
        GeventTimeoutAggregator instead of a SimpleAggregator
    :param timeout: per-source timeout, only used with ``use_gevent``
    :param source_list: optional source registry forwarded to
        ``init_index_source()``
    :return: a SimpleAggregator, or a GeventTimeoutAggregator when
        ``use_gevent`` is set
    """
    # Dict comprehension replaces the manual loop-and-assign idiom.
    sources = {name: init_index_source(config, source_list=source_list)
               for name, config in iteritems(source_configs)}

    if use_gevent:
        return GeventTimeoutAggregator(sources, timeout=timeout)

    return SimpleAggregator(sources)
Ejemplo n.º 7
0
    def setup(self):
        """Create a test app with /upstream and /upstream_opt routes."""
        base_url = 'http://localhost:{0}'.format(self.server.port)

        upstream_source = SimpleAggregator(
            {'upstream': UpstreamAggIndexSource(base_url + '/live')})

        upstream_opt_source = SimpleAggregator({
            'upstream_opt':
            UpstreamMementoIndexSource.upstream_resource(base_url + '/live')
        })

        app = BaseWarcServer()
        app.add_route('/upstream', DefaultResourceHandler(upstream_source))
        app.add_route('/upstream_opt',
                      DefaultResourceHandler(upstream_opt_source))

        self.base_url = base_url
        self.testapp = webtest.TestApp(app)
Ejemplo n.º 8
0
    def __init__(self, *args, **kwargs):
        """Redis-backed index writer that can also look up its own entries.

        Recognized kwargs: redis_url, redis, cdx_key_template, name,
        rel_path_template, file_key_template, full_warc_prefix, dupe_policy.
        """
        super(WritableRedisIndexer, self).__init__(kwargs.get('redis_url'),
                                                   kwargs.get('redis'),
                                                   kwargs.get('cdx_key_template'))

        # Expose this indexer as a single-source aggregator for CDX lookups.
        self.cdx_lookup = SimpleAggregator({kwargs.get('name', 'recorder'): self})

        self.rel_path_template = kwargs.get('rel_path_template', '')
        self.file_key_template = kwargs.get('file_key_template', '')
        self.full_warc_prefix = kwargs.get('full_warc_prefix', '')
        self.dupe_policy = kwargs.get('dupe_policy', WriteRevisitDupePolicy())
Ejemplo n.º 9
0
    def test_agg_dir_and_memento(self):
        """Merged results from a remote memento source and the local directory."""
        aggregator = SimpleAggregator({
            'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
            'local': self.dir_loader,
        })

        results, errors = aggregator({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})

        expected = [
            {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
            {'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
            {'source': to_path('local:colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Ejemplo n.º 10
0
    def setup_class(cls):
        """Register every test route on one shared BaseWarcServer app."""
        super(TestBaseWarcServer, cls).setup_class()

        live_handler = DefaultResourceHandler(
            SimpleAggregator({'live': LiveIndexSource()}))
        app = BaseWarcServer()
        app.add_route('/live', live_handler)

        many_handler = DefaultResourceHandler(
            GeventTimeoutAggregator(sources), TEST_WARC_PATH)
        app.add_route('/many', many_handler)

        app.add_route(
            '/cdx_api',
            DefaultResourceHandler(SimpleAggregator(ia_cdx), TEST_WARC_PATH))

        post_handler = DefaultResourceHandler(
            SimpleAggregator(
                {'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')}),
            TEST_WARC_PATH)
        app.add_route('/posttest', post_handler)

        example_source = SimpleAggregator(
            {'example': FileIndexSource(TEST_CDX_PATH + 'example2.cdxj')})
        example_handler = DefaultResourceHandler(example_source, TEST_WARC_PATH)

        # Fallback chains: try each handler in order until one succeeds.
        app.add_route('/fallback',
                      HandlerSeq([example_handler, post_handler, live_handler]))

        app.add_route('/seq', HandlerSeq([example_handler, post_handler]))

        app.add_route(
            '/allredis',
            DefaultResourceHandler(example_source,
                                   'redis://localhost/2/test:warc'))

        app.add_route('/empty', HandlerSeq([]))
        # Deliberately broken route: the aggregator's source is not callable.
        app.add_route(
            '/invalid',
            DefaultResourceHandler(
                [SimpleAggregator({'invalid': 'should not be a callable'})]))

        url_agnost = SimpleAggregator({
            'url-agnost':
            FileIndexSource(TEST_CDX_PATH + 'url-agnost-example.cdxj')
        })
        app.add_route(
            '/urlagnost',
            DefaultResourceHandler(url_agnost,
                                   'redis://localhost/2/test:{arg}:warc'))

        cls.testapp = webtest.TestApp(app)
Ejemplo n.º 11
0
    def test_allows_only_default_block(self):
        """Check allows.aclj matching when the default access is 'block'."""
        aggregator = SimpleAggregator(
            {'source': FileAccessIndexSource(TEST_EXCL_PATH + 'allows.aclj')})
        checker = AccessChecker(aggregator, default_access='block')

        # (url, expected urlkey, expected access or None when not asserted)
        cases = [
            ('http://example.net', 'net,', None),
            ('http://foo.example.net/abc', 'net,', None),
            ('https://example.net/test/', 'net,example)/test', None),
            ('https://example.org/', '', 'block'),
            ('https://abc.domain.net/path', 'net,domain,', None),
            ('https://domain.neta/path', '', 'block'),
        ]

        for url, urlkey, access in cases:
            rule = checker.find_access_rule(url)
            assert rule['urlkey'] == urlkey
            if access is not None:
                assert rule['access'] == access
    def __init__(self):
        """Wire up the warcserver app: live, extract, replay, replay-coll and
        patch routes, backed by redis indexes and remote archive sources
        built from the WR config.
        """
        init_logging()

        config = load_wr_config()

        app = BaseWarcServer(debug=True)

        # All redis key templates are namespaced under the configured base URL.
        redis_base = os.environ['REDIS_BASE_URL'] + '/'

        rec_url = redis_base + config['cdxj_key_templ']
        coll_url = redis_base + config['coll_cdxj_key_templ']
        warc_url = redis_base + config['warc_key_templ']
        rec_list_key = config['rec_list_key_templ']

        redis_resolver = RedisResolver(redis_url=warc_url,
                                       member_key_templ=rec_list_key)
        # Reuse the resolver's redis connection for the index sources below.
        redis = redis_resolver.redis
        warc_resolvers = [redis_resolver]

        cache_proxy_url = os.environ.get('CACHE_PROXY_URL', '')
        # Publish the proxy URL as a module-level value (read elsewhere in this module).
        global PROXY_PREFIX
        PROXY_PREFIX = cache_proxy_url

        # Per-source timeout (seconds) for all redis/gevent aggregators below.
        timeout = 20.0

        # Per-recording CDXJ index.
        rec_redis_source = RedisIndexSource(timeout=timeout,
                                            redis_url=rec_url,
                                            redis=redis)

        # Per-collection CDXJ index.
        coll_redis_source = RedisIndexSource(timeout=timeout,
                                             redis_url=coll_url,
                                             redis=redis)

        # Live web recording handler.
        live_rec = DefaultResourceHandler(
            SimpleAggregator({'live': LiveIndexSource()}, ), warc_resolvers,
            cache_proxy_url)

        # Extractable archives (all available), loaded via the WAM source loader.
        wam_loader = WAMSourceLoader(memento_cls=ProxyMementoIndexSource,
                                     remote_cls=ProxyRemoteIndexSource,
                                     wb_memento_cls=ProxyWBMementoIndexSource)

        extractable_archives = wam_loader.sources

        # Extract Source: primary extraction over all extractable archives.
        extractor = GeventTimeoutAggregator(extractable_archives,
                                            timeout=timeout)
        extract_primary = DefaultResourceHandler(extractor, warc_resolvers,
                                                 cache_proxy_url)

        # Patch fallback archives: subset allowed by the patch_archives_index config.
        fallback_archives = self.filter_archives(
            extractable_archives, config['patch_archives_index'])

        # patch + live
        #patch_archives = fallback_archives.copy()
        # NOTE(review): fallback_archives is aliased (copy() deliberately
        # commented out) and then mutated in place by adding 'live' -- confirm
        # the shared mutation is intended.
        patch_archives = fallback_archives
        patch_archives['live'] = LiveIndexSource()

        # Secondary extractor with invert_sources=True -- presumably queries the
        # sources NOT named under the 'inv_sources' param; verify in aggregator docs.
        extractor2 = GeventTimeoutAggregator(patch_archives,
                                             timeout=timeout,
                                             sources_key='inv_sources',
                                             invert_sources=True)

        extract_other = DefaultResourceHandler(extractor2, warc_resolvers,
                                               cache_proxy_url)

        patcher = GeventTimeoutAggregator(patch_archives, timeout=timeout)
        patch_rec = DefaultResourceHandler(patcher, warc_resolvers,
                                           cache_proxy_url)

        # Single Rec Replay: serve from the per-recording index.
        replay_rec = DefaultResourceHandler(
            SimpleAggregator({'local': rec_redis_source}), warc_resolvers,
            cache_proxy_url)

        # Coll Replay: serve from the per-collection index.
        replay_coll = DefaultResourceHandler(
            SimpleAggregator({'local': coll_redis_source}), warc_resolvers,
            cache_proxy_url)

        app.add_route('/live', live_rec)
        # /extract tries primary archives, then the inverted set, then the
        # recording's own index as a last resort.
        app.add_route('/extract',
                      HandlerSeq([extract_primary, extract_other, replay_rec]))
        app.add_route('/replay', replay_rec)
        app.add_route('/replay-coll', replay_coll)
        # /patch replays from the collection first, patching misses from archives.
        app.add_route('/patch', HandlerSeq([replay_coll, patch_rec]))

        self.app = app
Ejemplo n.º 13
0
 def query_single_source(source, params):
     """Query one index source by wrapping it in a single-entry aggregator.

     :param source: an index source callable
     :param dict params: query parameters forwarded to the aggregator
     :return: the aggregator's response for ``params``
     """
     # The unused ``string = str(source)`` local has been removed.
     return SimpleAggregator({'source': source})(params)
Ejemplo n.º 14
0
    'local':
    FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
    'ia':
    MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'ait':
    MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
    'bl':
    MementoIndexSource.from_timegate_url(
        'http://www.webarchive.org.uk/wayback/archive/'),
    'rhiz':
    MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/vvork/',
                                         path='*')
}

# Aggregator fixtures over the shared sources mapping.
aggs = dict(simple=SimpleAggregator(sources),
            gevent=GeventTimeoutAggregator(sources, timeout=5.0))

# Same aggregators with source selection inverted.
aggs_inv = dict(simple=SimpleAggregator(sources, invert_sources=True),
                gevent=GeventTimeoutAggregator(sources,
                                               invert_sources=True,
                                               timeout=5.0))

# Very short timeout, for timeout-behavior tests.
agg_tm = dict(gevent=GeventTimeoutAggregator(sources, timeout=0.05))

# A source pointing at a nonexistent file, for not-found handling.
nf = dict(notfound=FileIndexSource('testdata/not-found-x'))
agg_nf = {
    'simple': SimpleAggregator(nf),
Ejemplo n.º 15
0
 def setup_class(cls):
     """Create shared fixtures: an echo-source aggregator and a fuzzy matcher."""
     echo_source = EchoParamsSource()
     cls.source = SimpleAggregator({'source': echo_source})
     cls.fuzzy = FuzzyMatcher()
Ejemplo n.º 16
0
from mock import patch

from pywb.warcserver.handlers import IndexHandler


# Aggregator Mappings: one local file-backed source plus remote timegates.
sources = {
    # local CDXJ fixture file
    'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
    # remote memento timegate endpoints
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
    'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
}

# Aggregator fixtures over the shared sources mapping.
aggs = dict(simple=SimpleAggregator(sources),
            gevent=GeventTimeoutAggregator(sources, timeout=5.0))

# Same aggregators with source selection inverted.
aggs_inv = dict(simple=SimpleAggregator(sources, invert_sources=True),
                gevent=GeventTimeoutAggregator(sources, invert_sources=True,
                                               timeout=5.0))

# Very short timeout, for timeout-behavior tests.
agg_tm = dict(gevent=GeventTimeoutAggregator(sources, timeout=0.05))

# A source pointing at a nonexistent file, for not-found handling.
nf = dict(notfound=FileIndexSource('testdata/not-found-x'))
agg_nf = dict(simple=SimpleAggregator(nf),
              gevent=GeventTimeoutAggregator(nf, timeout=5.0))

Ejemplo n.º 17
0
 def do_query(self, params):
     """Run params through a one-source aggregator over the XML query endpoint."""
     xml_source = XmlQueryIndexSource('http://localhost:8080/path')
     return SimpleAggregator({'source': xml_source})(params)