def test_resolver_dir_and_file(self):
    """Check the resolver type chosen by make_best_resolver for local paths.

    The path of this test file is expected to produce a PathIndexResolver;
    its directory and an invalid ``file://`` location are both expected to
    produce a PrefixResolver.
    """
    this_file = os.path.realpath(__file__)
    this_dir = os.path.dirname(this_file)

    # a file -- assume path index
    file_resolver = DefaultResolverMixin.make_best_resolver(this_file)
    assert isinstance(file_resolver, PathIndexResolver)

    # a dir -- assume prefix
    dir_resolver = DefaultResolverMixin.make_best_resolver(this_dir)
    assert isinstance(dir_resolver, PrefixResolver)

    # not a valid file -- default to prefix
    invalid_resolver = DefaultResolverMixin.make_best_resolver(
        'file://test/x_invalid')
    assert isinstance(invalid_resolver, PrefixResolver)
def test_resolver_dir_and_file(self):
    """Verify make_best_resolver's choice for a file, a dir and a bad path.

    Each case pairs a location with the resolver class the test expects
    make_best_resolver to return for it.
    """
    a_file = os.path.realpath(__file__)
    cases = (
        # a file -- assume path index
        (a_file, PathIndexResolver),
        # a dir -- assume prefix
        (os.path.dirname(a_file), PrefixResolver),
        # not a valid file -- default to prefix
        ('file://test/x_invalid', PrefixResolver),
    )

    for location, expected_type in cases:
        resolver = DefaultResolverMixin.make_best_resolver(location)
        assert isinstance(resolver, expected_type)
def test_resolver_dir_wildcard(self):
    """Resolve a filename against a local directory prefix containing '*'.

    The test expects exactly one result: the path of 'example.warc.gz'
    inside the 'warcs' subdirectory of the test directory.
    """
    wildcard_prefix = os.path.join(get_test_dir(), '*', '')
    resolver = DefaultResolverMixin.make_best_resolver(wildcard_prefix)

    matches = resolver('example.warc.gz', CDXObject())

    assert len(matches) == 1
    expected_path = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
    assert matches[0] == expected_path
def test_resolver_http_prefix_not_wildcard(self):
    """Resolve a filename against an http prefix that contains a '*'.

    The expected result is the prefix with the filename appended and the
    '*' left in place as a literal character.
    """
    prefix = 'http://example.com/*/'
    resolver = DefaultResolverMixin.make_best_resolver(prefix)

    resolved = resolver('example.warc.gz', CDXObject())

    assert resolved == 'http://example.com/*/example.warc.gz'
def test_resolver_dir_wildcard_as_file_url(self):
    """Resolve a filename against a file URL prefix ending in '/*/'.

    The test expects exactly one result: the absolute path of
    'example.warc.gz' inside the 'warcs' subdirectory of the test directory.
    """
    wildcard_url = to_file_url(get_test_dir()) + '/*/'
    resolver = DefaultResolverMixin.make_best_resolver(wildcard_url)

    matches = resolver('example.warc.gz', CDXObject())

    assert len(matches) == 1
    expected_path = os.path.abspath(
        os.path.join(get_test_dir(), 'warcs', 'example.warc.gz'))
    assert matches[0] == expected_path
def test_resolver_dir_wildcard_with_coll(self):
    """Resolve against an s3 prefix with '*' using a cdx that names a coll.

    The cdx carries 'source' and 'source-coll' entries; the expected result
    has 'my-coll' where the prefix had '*'.
    """
    resolver = DefaultResolverMixin.make_best_resolver(
        's3://bucket/colls/*/archives/')

    cdx_entry = CDXObject()
    cdx_entry['source'] = 'my-coll/indexes/index.cdxj'
    cdx_entry['source-coll'] = 'my-coll'

    resolved = resolver('example.warc.gz', cdx_entry)

    assert resolved == 's3://bucket/colls/my-coll/archives/example.warc.gz'
def test_resolver_list(self):
    """Build resolvers from a list of locations and check each one's type.

    The file URL of this test file, an http prefix and a redis URL are
    expected to map, in order, to PathIndexResolver, PrefixResolver and
    RedisResolver.
    """
    this_file_url = to_file_url(os.path.realpath(__file__))
    locations = [
        this_file_url,
        'http://myhost.example.com/warcs/',
        'redis://localhost:1234/0',
    ]

    resolvers = DefaultResolverMixin.make_resolvers(locations)

    expected_types = (PathIndexResolver, PrefixResolver, RedisResolver)
    for index, expected_type in enumerate(expected_types):
        assert isinstance(resolvers[index], expected_type)
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Load the record described by a cdx line and print a short summary.

    The cdx text is encoded as UTF-8 and wrapped in a CDXObject, then passed
    to a ResolvingLoader built from ``test_warc_dir``. On success the
    formatted headers and the first two lines of the stream are written to
    stdout.

    :param cdx: cdx line as text
    :param revisit_func: passed through to the loader as its third argument
    :param reraise: when true, an ArchiveLoadFailed error is re-raised;
        otherwise only the exception class name is printed
    :param failed_files: passed through to the loader as its second argument
    """
    resolvers = DefaultResolverMixin.make_resolvers(test_warc_dir)
    resolve_loader = ResolvingLoader(resolvers)
    cdx_obj = CDXObject(cdx.encode('utf-8'))

    try:
        headers, stream = resolve_loader(cdx_obj, failed_files, revisit_func)
        print(repr_format(headers))
        # echo the first two lines of the payload
        for _ in range(2):
            sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as e:
        if reraise:
            raise
        print('Exception: ' + e.__class__.__name__)
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Resolve and load a record from a cdx line, echoing it to stdout.

    Prints the formatted headers followed by the first two lines read from
    the returned stream. If loading raises ArchiveLoadFailed, the error is
    either re-raised (``reraise`` true) or reported by printing
    ``'Exception: '`` plus the exception class name.
    """
    loader = ResolvingLoader(
        DefaultResolverMixin.make_resolvers(test_warc_dir))
    parsed_cdx = CDXObject(cdx.encode('utf-8'))

    try:
        (headers, stream) = loader(parsed_cdx, failed_files, revisit_func)
        print(repr_format(headers))

        first_line = stream.readline().decode('utf-8')
        sys.stdout.write(first_line)

        second_line = stream.readline().decode('utf-8')
        sys.stdout.write(second_line)
    except ArchiveLoadFailed as e:
        if not reraise:
            print('Exception: ' + e.__class__.__name__)
        else:
            raise
def test_make_best_resolver_redis(self):
    """A redis:// URL is expected to yield a RedisResolver with that URL in its repr."""
    redis_url = 'redis://myhost.example.com:1234/1'

    resolver = DefaultResolverMixin.make_best_resolver(redis_url)

    assert isinstance(resolver, RedisResolver)
    expected_repr = "RedisResolver('redis://myhost.example.com:1234/1')"
    assert repr(resolver) == expected_repr
def test_make_best_resolver_pathindex(self):
    """The pathindex.txt fixture path is expected to yield a PathIndexResolver.

    The resolver's repr is also checked to contain the same path.
    """
    index_path = os.path.join(get_test_dir(), 'text_content', 'pathindex.txt')

    resolver = DefaultResolverMixin.make_best_resolver(index_path)

    assert isinstance(resolver, PathIndexResolver)
    expected_repr = "PathIndexResolver('{0}')".format(index_path)
    assert repr(resolver) == expected_repr
def test_make_best_resolver_redis(self):
    """Check the type and repr of the resolver built for a redis:// URL."""
    resolver = DefaultResolverMixin.make_best_resolver(
        'redis://myhost.example.com:1234/1')
    assert isinstance(resolver, RedisResolver)

    actual_repr = repr(resolver)
    assert actual_repr == "RedisResolver('redis://myhost.example.com:1234/1')"
def test_make_best_resolver_http(self):
    """Check the type and repr of the resolver built for an http:// prefix."""
    resolver = DefaultResolverMixin.make_best_resolver(
        'http://myhost.example.com/warcs/')
    assert isinstance(resolver, PrefixResolver)

    actual_repr = repr(resolver)
    assert actual_repr == "PrefixResolver('http://myhost.example.com/warcs/')"
def test_resolver_http_prefix_not_wildcard(self):
    """For an http prefix, the expected result keeps the '*' literally.

    The resolved value is expected to be the prefix followed by the
    filename, with no substitution of the '*' segment.
    """
    resolver = DefaultResolverMixin.make_best_resolver('http://example.com/*/')
    empty_cdx = CDXObject()

    full_url = resolver('example.warc.gz', empty_cdx)

    assert full_url == 'http://example.com/*/example.warc.gz'
def test_make_best_resolver_http(self):
    """An http:// prefix is expected to yield a PrefixResolver with that prefix in its repr."""
    http_prefix = 'http://myhost.example.com/warcs/'

    resolver = DefaultResolverMixin.make_best_resolver(http_prefix)

    assert isinstance(resolver, PrefixResolver)
    expected_repr = "PrefixResolver('http://myhost.example.com/warcs/')"
    assert repr(resolver) == expected_repr