def test_agg_collB_found(self):
    """The directory loader finds the iana.org capture in collection B."""
    results, errors = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
    expected = [
        {'source': to_path('colls:B/indexes/iana.cdxj'),
         'timestamp': '20140126200624',
         'filename': 'iana.warc.gz'},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_redis_not_found(self, indexloader):
    """A URL absent from the redis index yields no results and no errors."""
    results, errors = indexloader({'url': 'example.com/'})
    assert errors == {}
    assert to_json_list(results) == []
def test_agg_collB(self):
    """example.com is not captured in collection B, so the result set is empty."""
    results, errors = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'})
    assert to_json_list(results) == []
    assert errors == {}
def test_mem_agg_index_3(self, agg):
    """Memento aggregation for vvork.com returns captures ordered by closeness
    to the requested timestamp, capped at the given limit."""
    results, errors = agg(dict(url='http://vvork.com/',
                               closest='20141001', limit=5))
    expected = [
        {"timestamp": "20141006184357",
         "load_url": "https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/",
         "source": "rhiz"},
        {"timestamp": "20141018133107",
         "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/",
         "source": "ia"},
        {"timestamp": "20141020161243",
         "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/",
         "source": "ia"},
        {"timestamp": "20140806161228",
         "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/",
         "source": "ia"},
        {"timestamp": "20131004231540",
         "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/",
         "source": "ait"},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_agg_collA_found(self):
    """The directory loader finds the example.com capture in collection A."""
    results, errors = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
    expected = [
        {'source': to_path('colls:A/indexes/example2.cdxj'),
         'timestamp': '20160225042329',
         'filename': 'example2.warc.gz'},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_redis_not_found(self, indexloader):
    """An unknown URL produces an empty result set and an empty error dict."""
    results, errors = indexloader({'url': 'example.com/'})
    assert errors == {}
    assert to_json_list(results) == []
def test_timeout_skipping(self):
    """After t_count consecutive timeouts, the 'slower' source is skipped
    for t_duration seconds, then retried once the penalty window expires."""
    assert self.sources['slow'].calls == 3
    assert self.sources['slower'].calls == 3

    agg = GeventTimeoutAggregator(self.sources, timeout=0.40,
                                  t_count=2, t_duration=1.0)
    expected = [{'source': 'slow', 'timestamp': '20160225042329'}]

    def run_query():
        # Query the aggregator, verify the result rows, and hand back
        # the per-source error dict for the caller to inspect.
        results, errors = agg(dict(url='http://example.com/'))
        assert to_json_list(results, fields=['source', 'timestamp']) == expected
        return errors

    # Rounds 1-2: 'slower' is still queried and times out each time.
    errors = run_query()
    assert self.sources['slow'].calls == 4
    assert self.sources['slower'].calls == 4
    assert errors == {'slower': 'timeout'}

    errors = run_query()
    assert self.sources['slow'].calls == 5
    assert self.sources['slower'].calls == 5
    assert errors == {'slower': 'timeout'}

    # Rounds 3-4: two timeouts reached, so 'slower' is skipped entirely.
    errors = run_query()
    assert self.sources['slow'].calls == 6
    assert self.sources['slower'].calls == 5
    assert errors == {}

    errors = run_query()
    assert self.sources['slow'].calls == 7
    assert self.sources['slower'].calls == 5
    assert errors == {}

    # Wait out the penalty window; 'slower' is queried (and times out) again.
    time.sleep(1.5)
    errors = run_query()
    assert self.sources['slow'].calls == 8
    assert self.sources['slower'].calls == 6
    assert errors == {'slower': 'timeout'}
def test_agg_no_dir_2(self):
    """A nonexistent collection directory ('X') yields no results and no errors."""
    loader = DirectoryIndexSource(self.root_dir, '')
    results, errors = loader({'url': 'example.com/', 'param.coll': 'X'})
    assert to_json_list(results) == []
    assert errors == {}
def test_mem_agg_not_found(self, agg):
    """A missing local index surfaces as a NotFoundException in the error dict."""
    results, errors = agg(dict(url='http://vvork.com/',
                               closest='20141001', limit=2))
    assert to_json_list(results) == []
    assert errors == {'notfound': "NotFoundException('testdata/not-found-x',)"}
def test_extra_agg_collB(self):
    """Wrapping the directory loader in a SimpleAggregator prefixes the
    source name ('dir:') onto each result's source path."""
    agg_source = SimpleAggregator({'dir': self.dir_loader})
    results, errors = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
    expected = [
        {'source': to_path('dir:colls:B/indexes/iana.cdxj'),
         'timestamp': '20140126200624',
         'filename': 'iana.warc.gz'},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_timeout_skipping(self):
    """Verify timeout-based skipping: after t_count timeouts the 'slower'
    source is excluded for t_duration seconds, then retried."""
    assert self.sources['slow'].calls == 3
    assert self.sources['slower'].calls == 3

    agg = GeventTimeoutAggregator(self.sources, timeout=0.40,
                                  t_count=2, t_duration=1.0)
    expected = [{'source': 'slow', 'timestamp': '20160225042329'}]

    def run_query():
        # One aggregator round-trip; asserts the rows, returns the errors.
        results, errors = agg(dict(url='http://example.com/'))
        assert to_json_list(results, fields=['source', 'timestamp']) == expected
        return errors

    # Two rounds where 'slower' participates and times out.
    errors = run_query()
    assert self.sources['slow'].calls == 4
    assert self.sources['slower'].calls == 4
    assert errors == {'slower': 'timeout'}

    errors = run_query()
    assert self.sources['slow'].calls == 5
    assert self.sources['slower'].calls == 5
    assert errors == {'slower': 'timeout'}

    # Timeout threshold hit: 'slower' is now skipped (call count frozen).
    errors = run_query()
    assert self.sources['slow'].calls == 6
    assert self.sources['slower'].calls == 5
    assert errors == {}

    errors = run_query()
    assert self.sources['slow'].calls == 7
    assert self.sources['slower'].calls == 5
    assert errors == {}

    # After the penalty expires, 'slower' is queried again and times out.
    time.sleep(1.5)
    errors = run_query()
    assert self.sources['slow'].calls == 8
    assert self.sources['slower'].calls == 6
    assert errors == {'slower': 'timeout'}
def test_mem_agg_index_4(self, agg):
    """Restricting sources to 'rhiz,ait' returns only captures from those two."""
    results, errors = agg(dict(url='http://vvork.com/',
                               closest='20141001', limit=2,
                               sources='rhiz,ait'))
    expected = [
        {"timestamp": "20141006184357",
         "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/",
         "source": "rhiz"},
        {"timestamp": "20131004231540",
         "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/",
         "source": "ait"},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_mem_agg_index_5_inverse_preset(self, agg):
    """With the inverse preset, sources='rhiz,ait' selects everything EXCEPT
    those two; 'ia' answers and 'bl' reports a not-found error."""
    results, errors = agg(dict(url='http://vvork.com/',
                               closest='20141001', limit=2,
                               sources='rhiz,ait'))
    expected = [
        {'timestamp': '20141018133107',
         'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/',
         'source': 'ia'},
    ]
    assert to_json_list(results) == expected
    assert errors == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"}
def test_timeout_slower_skipped_1(self):
    """With a 0.40s timeout only the 'slow' source answers; 'slower' times out."""
    agg = GeventTimeoutAggregator(self.sources, timeout=0.40)
    results, errors = agg(dict(url='http://example.com/'))
    expected = [{'source': 'slow', 'timestamp': '20160225042329'}]
    assert to_json_list(results, fields=['source', 'timestamp']) == expected
    assert errors == {'slower': 'timeout'}
def test_timeout_slower_all_skipped(self):
    """With a 0.10s timeout neither source answers in time; both report timeouts."""
    agg = GeventTimeoutAggregator(self.sources, timeout=0.10)
    results, errors = agg(dict(url='http://example.com/'))
    assert to_json_list(results, fields=['source', 'timestamp']) == []
    assert errors == {'slower': 'timeout', 'slow': 'timeout'}
def test_timeout_slower_skipped_1(self):
    """A 0.40s timeout lets 'slow' finish but drops 'slower' with a timeout error."""
    agg = GeventTimeoutAggregator(self.sources, timeout=0.40)
    results, errors = agg(dict(url='http://example.com/'))
    expected = [{'source': 'slow', 'timestamp': '20160225042329'}]
    assert to_json_list(results, fields=['source', 'timestamp']) == expected
    assert errors == {'slower': 'timeout'}
def test_timeout_slower_all_skipped(self):
    """A 0.10s timeout is too short for either source; both time out."""
    agg = GeventTimeoutAggregator(self.sources, timeout=0.10)
    results, errors = agg(dict(url='http://example.com/'))
    assert to_json_list(results, fields=['source', 'timestamp']) == []
    assert errors == {'slower': 'timeout', 'slow': 'timeout'}
def test_redis_agg_one(self, indexloader):
    """The redis loader resolves the user/coll params into the key template
    and returns both dupes captures."""
    results, errors = indexloader({'url': 'example.com/',
                                   'param.user': '******',
                                   'param.coll': 'dupes'})
    expected = [
        {'source': 'FOO:dupes:cdxj',
         'timestamp': '20140127171200',
         'filename': 'dupes.warc.gz'},
        {'source': 'FOO:dupes:cdxj',
         'timestamp': '20140127171251',
         'filename': 'dupes.warc.gz'},
    ]
    assert errors == {}
    assert to_json_list(results) == expected
def test_agg_all_found_2(self):
    """Querying all collections ('*') for example.com returns matches from
    collections C and A."""
    results, errors = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
    expected = [
        {'source': to_path('colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171200',
         'filename': 'dupes.warc.gz'},
        {'source': to_path('colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171251',
         'filename': 'dupes.warc.gz'},
        {'source': to_path('colls:A/indexes/example2.cdxj'),
         'timestamp': '20160225042329',
         'filename': 'example2.warc.gz'},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_agg_all_found_1(self):
    """Querying all collections ('*') for iana.org returns matches from
    collections B and C (the C capture appears twice)."""
    results, errors = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
    expected = [
        {'source': to_path('colls:B/indexes/iana.cdxj'),
         'timestamp': '20140126200624',
         'filename': 'iana.warc.gz'},
        {'source': to_path('colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171238',
         'filename': 'dupes.warc.gz'},
        {'source': to_path('colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171238',
         'filename': 'dupes.warc.gz'},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_timeout_long_all_pass(self):
    """With a generous 1.0s timeout, both sources respond and no errors occur."""
    agg = GeventTimeoutAggregator(self.sources, timeout=1.0)
    results, errors = agg(dict(url='http://example.com/'))
    expected = [
        {'source': 'slower', 'timestamp': '20140127171200'},
        {'source': 'slower', 'timestamp': '20140127171251'},
        {'source': 'slow', 'timestamp': '20160225042329'},
    ]
    assert to_json_list(results, fields=['source', 'timestamp']) == expected
    assert errors == {}
def test_mem_agg_index_2(self, agg):
    """Aggregation for example.com merges bl/ia/ait captures by closeness;
    'rhiz' has no capture and reports a not-found error."""
    results, errors = agg(dict(url='http://example.com/',
                               closest='20100512', limit=6))
    expected = [
        {"timestamp": "20100513010014",
         "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/",
         "source": "bl"},
        {"timestamp": "20100512204410",
         "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/",
         "source": "bl"},
        {"timestamp": "20100513224108",
         "load_url": "http://web.archive.org/web/20100513224108id_/http://example.com/",
         "source": "ia"},
        {"timestamp": "20100511201151",
         "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/",
         "source": "ait"},
        {"timestamp": "20100514231857",
         "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/",
         "source": "ait"},
        {"timestamp": "20100514231857",
         "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/",
         "source": "ia"},
    ]
    assert to_json_list(results) == expected
    assert errors == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"}
def test_mem_agg_index_1(self, agg):
    """Aggregation for iana.org merges ia/local/ait captures; 'bl' and 'rhiz'
    have no captures and surface not-found errors."""
    results, errors = agg(dict(url='http://iana.org/',
                               closest='20140126000000', limit=5))
    expected = [
        {"timestamp": "20140126093743",
         "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/",
         "source": "ia"},
        {"timestamp": "20140126200624",
         "filename": "iana.warc.gz",
         "source": "local"},
        {"timestamp": "20140123034755",
         "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/",
         "source": "ia"},
        {"timestamp": "20140129175203",
         "load_url": "http://web.archive.org/web/20140129175203id_/http://iana.org/",
         "source": "ia"},
        {"timestamp": "20140107040552",
         "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/",
         "source": "ait"},
    ]
    assert to_json_list(results) == expected
    assert errors == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
                      'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
def test_mem_agg_timeout(self, agg):
    """Delaying every child-source load past the aggregator timeout makes
    every source report 'timeout' and yields no results.

    Temporarily monkey-patches BaseAggregator.load_child_source to inject a
    0.1s delay before delegating to the original implementation.
    """
    url = 'http://vvork.com/'
    orig_source = BaseAggregator.load_child_source

    def load_child_source(self, name, source, params):
        # Inject a delay long enough to trip the aggregator's timeout.
        time.sleep(0.1)
        return orig_source(self, name, source, params)

    BaseAggregator.load_child_source = load_child_source
    try:
        res, errs = agg(dict(url=url, closest='20141001', limit=2))
    finally:
        # Always restore the original method, even if agg() raises —
        # otherwise the patch would leak into every subsequent test.
        BaseAggregator.load_child_source = orig_source

    assert to_json_list(res) == []
    assert errs == {'local': 'timeout',
                    'ait': 'timeout',
                    'bl': 'timeout',
                    'ia': 'timeout',
                    'rhiz': 'timeout'}
def test_agg_dir_and_memento(self):
    """A SimpleAggregator can mix a remote memento source ('ia') with the
    local directory loader; per-source params use the 'param.local.' prefix."""
    sources = {
        'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
        'local': self.dir_loader,
    }
    agg_source = SimpleAggregator(sources)
    results, errors = agg_source({'url': 'example.com/',
                                  'param.local.coll': '*',
                                  'closest': '20100512',
                                  'limit': 6})
    expected = [
        {'source': 'ia',
         'timestamp': '20100514231857',
         'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
        {'source': 'ia',
         'timestamp': '20100519202418',
         'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
        {'source': 'ia',
         'timestamp': '20100501123414',
         'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
        {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171200',
         'filename': 'dupes.warc.gz'},
        {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171251',
         'filename': 'dupes.warc.gz'},
        {'source': to_path('local:colls:A/indexes/example2.cdxj'),
         'timestamp': '20160225042329',
         'filename': 'example2.warc.gz'},
    ]
    assert to_json_list(results) == expected
    assert errors == {}
def test_timeout_long_all_pass(self):
    """A 1.0s timeout is long enough for both sources; all captures returned."""
    agg = GeventTimeoutAggregator(self.sources, timeout=1.0)
    results, errors = agg(dict(url='http://example.com/'))
    expected = [
        {'source': 'slower', 'timestamp': '20140127171200'},
        {'source': 'slower', 'timestamp': '20140127171251'},
        {'source': 'slow', 'timestamp': '20160225042329'},
    ]
    assert to_json_list(results, fields=['source', 'timestamp']) == expected
    assert errors == {}
def test_redis_agg_one(self, indexloader):
    """User/coll params resolve the redis key template; both dupes rows return."""
    results, errors = indexloader({'url': 'example.com/',
                                   'param.user': '******',
                                   'param.coll': 'dupes'})
    expected = [
        {'source': 'FOO:dupes:cdxj',
         'timestamp': '20140127171200',
         'filename': 'dupes.warc.gz'},
        {'source': 'FOO:dupes:cdxj',
         'timestamp': '20140127171251',
         'filename': 'dupes.warc.gz'},
    ]
    assert errors == {}
    assert to_json_list(results) == expected
def test_mem_agg_not_found(self, agg):
    """A broken local index path produces no results and a not-found error."""
    results, errors = agg(dict(url='http://vvork.com/',
                               closest='20141001', limit=2))
    assert to_json_list(results) == []
    assert errors == {'notfound': "NotFoundException('testdata/not-found-x',)"}
def test_agg_no_coll_set(self):
    """Omitting 'param.coll' entirely yields no results and no errors."""
    results, errors = self.dir_loader(dict(url='example.com/'))
    assert to_json_list(results) == []
    assert errors == {}