def test_another_remote_not_found(self):
        """Query a Memento source for an unresolvable URL and expect a not-found error.

        Builds a MementoIndexSource from the rhizome ``all`` timegate URL,
        queries it for a URL on a non-existent host, and checks that no
        index results come back and that the error recorded under
        ``'source'`` names the timemap URL that was requested.
        """
        source = MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/all/')
        url = 'http://x-not-found-x.notfound/'
        res, errs = self.query_single_source(source, dict(url=url, limit=3))

        # Nothing should be indexed for the unknown URL.
        assert key_ts_res(res) == ''

        # The expected text looks like the repr() of the raised exception.
        # Python < 3.7 renders a single-argument exception with a trailing
        # comma ("X('a',)"), while 3.7+ renders "X('a')"; accept either so
        # the check does not depend on the interpreter version.
        timemap_url = 'https://webenact.rhizome.org/all/timemap/link/http://x-not-found-x.notfound/'
        assert errs['source'] in (
            "NotFoundException('%s',)" % timemap_url,
            "NotFoundException('%s')" % timemap_url,
        )
Exemple #2
0
    def test_another_remote_not_found(self):
        """Query a Memento source for an unresolvable URL and expect a not-found error.

        Builds a MementoIndexSource from the rhizome ``all`` timegate URL
        (plain http), queries it for a URL on a non-existent host, and checks
        that no index results come back and that the error recorded under
        ``'source'`` names the timemap URL that was requested.
        """
        source = MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/all/')
        url = 'http://x-not-found-x.notfound/'
        res, errs = self.query_single_source(source, dict(url=url, limit=3))

        # Nothing should be indexed for the unknown URL.
        assert key_ts_res(res) == ''

        # The expected text looks like the repr() of the raised exception.
        # Python < 3.7 renders a single-argument exception with a trailing
        # comma ("X('a',)"), while 3.7+ renders "X('a')"; accept either so
        # the check does not depend on the interpreter version.
        timemap_url = 'http://webenact.rhizome.org/all/timemap/link/http://x-not-found-x.notfound/'
        assert errs['source'] in (
            "NotFoundException('%s',)" % timemap_url,
            "NotFoundException('%s')" % timemap_url,
        )
    @classmethod
    def setup_class(cls):
        """Class-level fixture: prepare every index source used by the tests.

        Runs the parent class setup, hands the ``iana.cdxj`` test index to
        ``add_cdx_to_redis`` with the key ``test:rediscdx``, and stores one
        index source per backend in ``cls.all_sources``.

        Declared as a classmethod so it can be invoked on the class itself
        and chained through ``super(...).setup_class()`` by subclasses, the
        same way this method calls its own parent.
        """
        super(TestIndexSources, cls).setup_class()
        cls.add_cdx_to_redis(TEST_CDX_PATH + 'iana.cdxj', 'test:rediscdx')

        cls.all_sources = {
            # Local CDXJ file on disk.
            'file': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
            # Redis source addressing the key populated above.
            'redis': RedisIndexSource('redis://localhost:6379/2/test:rediscdx'),
            # Remote CDX API query URL plus the matching replay URL template.
            'remote_cdx': RemoteIndexSource('https://webenact.rhizome.org/all/cdx?url={url}',
                              'https://webenact.rhizome.org/all/{timestamp}id_/{url}'),

            # Memento source: timegate, timemap and replay URL templates.
            'memento': MementoIndexSource('https://webenact.rhizome.org/all/{url}',
                               'https://webenact.rhizome.org/all/timemap/link/{url}',
                               'https://webenact.rhizome.org/all/{timestamp}id_/{url}')
        }
Exemple #4
0
    def add_index(self, replay, apis, pk, collection=''):
        """Create an index source for one archive and store it under ``pk``.

        :param replay: replay URL template; ``{collection}`` is substituted.
        :param apis: mapping describing the archive's APIs. A ``'memento'``
            entry (with ``'timegate'`` and ``'timemap'``) takes precedence
            over a ``'cdx'`` entry (with ``'query'``); with neither, a
            ``WBMementoIndexSource`` with empty timegate/timemap is used.
        :param pk: key under which the source is stored in
            ``self.all_archives``.
        :param collection: value substituted for every ``{collection}``
            placeholder (defaults to the empty string).
        """
        def fill(template):
            # Substitute the collection name into a URL template.
            return template.replace('{collection}', collection)

        replay = fill(replay)

        if 'memento' in apis:
            memento_api = apis['memento']
            # Both Memento endpoints get the target URL appended as a suffix.
            timegate = fill(memento_api['timegate']) + '{url}'
            timemap = fill(memento_api['timemap']) + '{url}'
            source = MementoIndexSource(timegate, timemap, replay)
        elif 'cdx' in apis:
            source = RemoteIndexSource(fill(apis['cdx']['query']), replay)
        else:
            source = WBMementoIndexSource('', '', replay)

        # Only register sources that evaluate as truthy.
        if source:
            self.all_archives[pk] = source
Exemple #5
0
    def test_agg_dir_and_memento(self):
        """Aggregate a remote Memento source with the local directory loader.

        Queries ``example.com/`` through a SimpleAggregator over the ``ia``
        Memento source and ``self.dir_loader`` (all local collections,
        closest to 20100512, limit 6) and checks the six returned entries
        and that no errors were reported.
        """
        aggregator = SimpleAggregator({
            'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
            'local': self.dir_loader,
        })

        query = {
            'url': 'example.com/',
            'param.local.coll': '*',
            'closest': '20100512',
            'limit': 6,
        }
        results, errors = aggregator(query)

        # Entries expected from the remote 'ia' source.
        ia_hits = [
            {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
        ]

        # Entries expected from the local collections' index files.
        local_hits = [
            {'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
            {'source': to_path('local:colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'},
        ]

        assert to_json_list(results) == ia_hits + local_hits
        assert errors == {}
Exemple #6
0
from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq

from pywb.warcserver.index.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from pywb.warcserver.index.indexsource import RemoteIndexSource

from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator
from pywb.warcserver.index.aggregator import DirectoryIndexSource

from pywb.warcserver.basewarcserver import BaseWarcServer
from pywb.utils.memento import MementoUtils


# Index sources available to the tests in this module, keyed by a short name.
sources = {
    'local':
        DirectoryIndexSource(TEST_CDX_PATH),
    'ia':
        MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'rhiz':
        MementoIndexSource.from_timegate_url(
            'http://webenact.rhizome.org/vvork/',
            path='*'),
    'live':
        LiveIndexSource(),
}

# Remote CDX source for web.archive.org: the first argument is the CDX query
# URL template, the second the replay URL template.
ia_cdx = {
    'ia-cdx':
        RemoteIndexSource(
            'http://web.archive.org/cdx?url={url}&closest={closest}&sort=closest',
            'http://web.archive.org/web/{timestamp}id_/{url}'),
}




class TestBaseWarcServer(HttpBinLiveTests, MementoOverrideTests, FakeRedisTests, BaseTestClass):
    @classmethod
    def setup_class(cls):
Exemple #7
0
from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq

from pywb.warcserver.index.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from pywb.warcserver.index.indexsource import RemoteIndexSource

from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator
from pywb.warcserver.index.aggregator import DirectoryIndexSource

from pywb.warcserver.basewarcserver import BaseWarcServer
from pywb.utils.memento import MementoUtils


# Index sources available to the tests in this module, keyed by a short name.
sources = {
    'local':
        DirectoryIndexSource(TEST_CDX_PATH),
    'ia':
        MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'rhiz':
        MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/vvork/'),
    'live':
        LiveIndexSource(),
}

# Remote CDX source for web.archive.org: the first argument is the CDX query
# URL template, the second the replay URL template.
ia_cdx = {
    'ia-cdx':
        RemoteIndexSource(
            'http://web.archive.org/cdx?url={url}&closest={closest}&sort=closest',
            'http://web.archive.org/web/{timestamp}id_/{url}'),
}




class TestBaseWarcServer(HttpBinLiveTests, MementoOverrideTests, FakeRedisTests, BaseTestClass):
    @classmethod
    def setup_class(cls):
from pywb.warcserver.test.testutils import to_json_list, to_path, TEST_CDX_PATH, MementoOverrideTests, BaseTestClass

import json
import pytest
import time
import six

from mock import patch

from pywb.warcserver.handlers import IndexHandler


# Aggregator mappings: the index sources every aggregator below is built on,
# keyed by a short source name.
sources = {
    'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
    'ia': MementoIndexSource.from_timegate_url(
        'http://web.archive.org/web/'),
    'ait': MementoIndexSource.from_timegate_url(
        'http://wayback.archive-it.org/all/'),
    'bl': MementoIndexSource.from_timegate_url(
        'http://www.webarchive.org.uk/wayback/archive/'),
    'rhiz': MementoIndexSource.from_timegate_url(
        'http://webenact.rhizome.org/vvork/', path='*'),
}

# One aggregator of each kind over the same sources.
aggs = {
    'simple': SimpleAggregator(sources),
    'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
}

# Same aggregators, constructed with invert_sources=True.
aggs_inv = {
    'simple': SimpleAggregator(sources, invert_sources=True),
    'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0),
}

# Gevent aggregator with a much smaller timeout (0.05 instead of 5.0).
# NOTE(review): presumably meant to exercise timeout handling; confirm
# against the tests that use it.
agg_tm = {
    'gevent': GeventTimeoutAggregator(sources, timeout=0.05),
}
Exemple #9
0
from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq

from pywb.warcserver.index.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from pywb.warcserver.index.indexsource import RemoteIndexSource

from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator
from pywb.warcserver.index.aggregator import DirectoryIndexSource

from pywb.warcserver.basewarcserver import BaseWarcServer
from pywb.utils.memento import MementoUtils

# Index sources available to the tests in this module, keyed by a short name.
sources = {
    'local': DirectoryIndexSource(TEST_CDX_PATH),
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
    'live': LiveIndexSource(),
}

# Remote CDX source for web.archive.org: the first argument is the CDX query
# URL template, the second the replay URL template.
ia_cdx = {
    'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={closest}&sort=closest',
                                'http://web.archive.org/web/{timestamp}id_/{url}'),
}