def test_resolveRevisits(self):
    """
    with ``resolveRevisits=true``, server adds three fields pointing to
    the *original* capture.
    """
    resp = self.query('http://www.iana.org/_css/2013.1/print.css',
                      resolveRevisits='true')
    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    # digest -> (length, offset, filename) of the original capture
    seen_originals = {}

    for line in resp.text.splitlines():
        record = CDXObject(line.encode('utf-8'))
        assert len(record) == 16

        # orig.* fields are either all '-' (original capture) or must
        # equal the corresponding fields of the original capture with
        # the same digest.
        digest = record['digest']
        if record['orig.length'] == '-':
            assert record['orig.offset'] == '-' and record['orig.filename'] == '-'
            seen_originals[digest] = (int(record['length']),
                                      int(record['offset']),
                                      record['filename'])
        else:
            expected = seen_originals.get(digest)
            assert expected == (int(record['orig.length']),
                                int(record['orig.offset']),
                                record['orig.filename'])
def test_unicode_url():
    """Non-ASCII characters in the url field are stored percent-encoded."""
    line = u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'
    obj = CDXObject(line.encode('utf-8'))

    assert obj['urlkey'] == 'com,example,cafe)/'
    assert obj['timestamp'] == '123'
    assert obj['url'] == 'http://example.com/caf%C3%A9/path'

    expected = 'com,example,cafe)/ 123 {"url": "http://example.com/caf%C3%A9/path"}\n'
    assert obj.to_cdxj() == expected
def test_limit(self):
    """``limit=1`` returns only the first capture; with ``reverse=1``
    it returns only the last one."""
    # forward: earliest capture
    resp = self.query('http://www.iana.org/_css/2013.1/screen.css', limit='1')
    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    lines = resp.text.splitlines()
    assert len(lines) == 1

    first = CDXObject(lines[0].encode('utf-8'))
    assert first['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
    assert first['timestamp'] == '20140126200625'
    assert first['mime'] == 'text/css'

    # reverse: latest capture
    resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
                      limit='1', reverse='1')
    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    lines = resp.text.splitlines()
    assert len(lines) == 1

    last = CDXObject(lines[0].encode('utf-8'))
    assert last['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
    assert last['timestamp'] == '20140127171239'
    assert last['mime'] == 'warc/revisit'
def test_record_param_user_coll_revisit(self):
    """Record the same URL a second time through per-user/coll WARC paths
    and verify the duplicate is written as a WARC ``revisit`` record whose
    ``WARC-Refers-To-*`` headers point back at the original capture.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # BUG FIX: the query string had been mangled by HTML-entity decoding
    # ('&para' -> '¶'), turning '&param.recorder...' into '¶m.recorder...'
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    # pick whichever of the two entries is the revisit record
    if b'warc/revisit' in res[0]:
        cdx = CDXObject(res[0])
    else:
        cdx = CDXObject(res[1])

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_unicode_url():
    """Unicode URL round-trips through CDXObject as percent-encoded UTF-8."""
    raw = u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'
    cdx = CDXObject(raw.encode('utf-8'))

    assert cdx['urlkey'] == 'com,example,cafe)/'
    assert cdx['timestamp'] == '123'
    assert cdx['url'] == 'http://example.com/caf%C3%A9/path'

    assert cdx.to_cdxj() == ('com,example,cafe)/ 123 '
                             '{"url": "http://example.com/caf%C3%A9/path"}\n')
def test_lt_le():
    """CDX ordering compares urlkey first, then timestamp."""
    earlier_key = CDXObject(b'ca,example)/ 2016 {"url": "http://example.com/"}')
    older = CDXObject(b'com,example)/ 2015 {"url": "http://example.com/"}')
    newer = CDXObject(b'com,example)/ 2016 {"url": "http://example.com/"}')

    assert earlier_key < older
    assert older < newer

    assert older >= earlier_key
    assert newer >= earlier_key
    assert earlier_key < newer
def find_access_rule(self, url, ts=None, urlkey=None, collection=None):
    """Attempts to find the access control rule for the supplied URL
    otherwise returns the default rule

    :param str url: The URL for the rule to be found
    :param str|None ts: A timestamp (not used)
    :param str|None urlkey: The access control url key
    :param str|None collection: Optional collection name, forwarded to
        the aggregator as ``param.coll``
    :return: The access control rule for the supplied URL
        if one exists otherwise the default rule
    :rtype: CDXObject
    """
    params = {
        'url': url,
        'urlkey': urlkey,
        'nosource': 'true',
        'exact_match_suffix': self.EXACT_SUFFIX_SEARCH_B
    }
    if collection:
        params['param.coll'] = collection

    acl_iter, errs = self.aggregator(params)
    if errs:
        print(errs)

    # NOTE(review): 'key' is not set above — presumably the aggregator
    # populates params['key'] with the canonicalized search key (bytes);
    # confirm against the aggregator implementation.
    key = params['key']
    key_exact = key + self.EXACT_SUFFIX_B

    # first component of the surt key, e.g. b'com' — used as a lower
    # bound for the early-exit below
    tld = key.split(b',')[0]

    for acl in acl_iter:

        # skip empty/invalid lines
        if not acl:
            continue

        # acl lines are '<urlkey> <json>'; compare on the urlkey only
        acl_key = acl.split(b' ')[0]

        # exact-match rule wins
        if key_exact == acl_key:
            return CDXObject(acl)

        # prefix-match rule (rules are iterated in an order where the
        # first prefix hit is the most specific applicable one)
        if key.startswith(acl_key):
            return CDXObject(acl)

        # if acl key already less than first tld,
        # no match can be found
        if acl_key < tld:
            break

    return self.default_rule
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Resolve the archive record for one CDX line; print its headers and
    the first two payload lines, or the exception class name on failure."""
    loader = ResolvingLoader(DefaultResolverMixin.make_resolvers(test_warc_dir))
    record = CDXObject(cdx.encode('utf-8'))
    try:
        headers, stream = loader(record, failed_files, revisit_func)
        print(repr_format(headers))
        for _ in range(2):
            sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as e:
        if reraise:
            raise
        print('Exception: ' + e.__class__.__name__)
def load_index(self, params):
    """Build a single synthetic CDX entry for a live (un-archived) resource.

    :param dict params: query parameters; must contain 'url' and 'key'
        (the canonicalized url key, as bytes)
    :return: iterator yielding exactly one CDXObject
    :raises NotFoundException: for fuzzy-match queries, which are not
        supported against live resources
    """
    # no fuzzy match for live resources
    if params.get('is_fuzzy'):
        raise NotFoundException(params['url'] + '*')

    cdx = CDXObject()
    cdx['urlkey'] = params.get('key').decode('utf-8')
    cdx['timestamp'] = timestamp_now()
    cdx['url'] = params['url']
    cdx['load_url'] = self.get_load_url(params)
    cdx['is_live'] = 'true'

    mime = params.get('content_type', '')

    if params.get('filter') and not mime:
        try:
            # probe with a HEAD request to learn status + content type;
            # 405 means the server rejects HEAD, so leave status unset
            res = self.sesh.head(cdx['load_url'])
            if res.status_code != 405:
                cdx['status'] = str(res.status_code)

            content_type = res.headers.get('Content-Type')
            if content_type:
                mime = content_type.split(';')[0]

        except Exception:
            # IDIOM FIX: dropped unused 'as e' binding; the swallow itself
            # is deliberate — a failed best-effort HEAD must not fail the
            # whole lookup
            pass

    cdx['mime'] = mime

    return iter([cdx])
def test_resolver_dir_wildcard(self):
    """A '*/' directory pattern is globbed to locate the warc."""
    pattern = os.path.join(get_test_dir(), '*', '')
    resolver = DefaultResolverMixin.make_best_resolver(pattern)

    matches = resolver('example.warc.gz', CDXObject())
    assert len(matches) == 1
    assert matches[0] == os.path.join(get_test_dir(), 'warcs',
                                      'example.warc.gz')
def test_fields(self):
    """retrieve subset of fields with ``fields`` parameter."""
    resp = self.query('http://www.iana.org/_css/2013.1/print.css',
                      fields='urlkey,timestamp,status')
    assert resp.status_code == 200

    for line in resp.text.splitlines():
        record = CDXObject(line.encode('utf-8'))
        assert record['urlkey'] == 'org,iana)/_css/2013.1/print.css'
        # timestamp is exactly 14 digits; status is 3 digits or '-'
        assert re.match(r'\d{14}$', record['timestamp'])
        assert re.match(r'\d{3}|-', record['status'])
def rewrite_record(self, headers, content, ts,
                   url='http://example.com/',
                   prefix='http://localhost:8080/prefix/',
                   warc_headers=None, request_url=None, is_live=None):
    """Create a response record + cdx for *url* and run them through the
    content rewriter, returning the rewriter's result."""
    rec = self._create_response_record(url, headers, content, warc_headers)

    rewriter_url = UrlRewriter(WbUrl(ts + '/' + (request_url or url)), prefix)

    capture = CDXObject()
    capture['url'] = url
    capture['timestamp'] = ts
    capture['urlkey'] = canonicalize(url)
    # a request url differing from the capture url marks a fuzzy match
    if request_url != url:
        capture['is_fuzzy'] = '1'
    capture['is_live'] = is_live

    return self.content_rewriter(rec, rewriter_url, None, cdx=capture)
def handle_timegate(self, params, timestamp):
    """Query a Memento timegate for *url* at *timestamp* and build a
    single synthetic CDX entry from the response.

    :param dict params: query params; must contain 'url', and is also
        passed to ``self._get_headers``
    :param str timestamp: requested timestamp, substituted into the
        timegate url template
    :return: iterator yielding one CDXObject
    :raises NotFoundException: if the timegate request fails or returns
        an error status
    """
    url = params['url']
    load_url = self.timegate_url.format(url=url, timestamp=timestamp)

    res = None

    try:
        headers = self._get_headers(params)
        res = self.sesh.head(load_url, headers=headers)
    except Exception as e:
        # close a partially-opened response before converting any
        # transport error into a not-found
        no_except_close(res)
        raise NotFoundException(url)

    if res and res.headers.get('Memento-Datetime'):
        if res.status_code >= 400:
            no_except_close(res)
            raise NotFoundException(url)

        # a redirect points at the memento via Location; a direct hit
        # identifies it via Content-Location
        if res.status_code >= 300:
            info = self._extract_location(url, res.headers.get('Location'))
        else:
            info = self._extract_location(
                url, res.headers.get('Content-Location'))

        url, timestamp, load_url = info

    # if no Memento-Datetime was returned, fall through with the
    # original url/timestamp/load_url unchanged
    cdx = CDXObject()
    cdx['urlkey'] = canonicalize(url)
    cdx['timestamp'] = timestamp
    cdx['url'] = url
    cdx['load_url'] = load_url

    # propagate the caller's Referer so the upstream fetch can use it
    if 'Referer' in headers:
        cdx['set_referrer'] = headers['Referer']

    return iter([cdx])
def detect_pages(self, coll, rec):
    """Find pages in recording.

    :param str coll: collection ID
    :param str rec: recording ID
    :returns: pages
    :rtype: list
    """
    key = self.cdxj_key.format(coll=coll, rec=rec)

    pages = []

    for member in self.redis.zrange(key, 0, -1):
        entry = CDXObject(member.encode('utf-8'))

        # stop collecting once the configured page cap is reached
        under_limit = (not self.max_detect_pages
                       or len(pages) < self.max_detect_pages)
        if under_limit and self.is_page(entry):
            pages.append({'url': entry['url'],
                          'title': entry['url'],
                          'timestamp': entry['timestamp']})

    return pages
def test_record_param_user_coll_write_dupe_no_revisit(self):
    """With ``WriteDupePolicy``, a duplicate capture is written as a full
    record (mime stays 'application/json') rather than a revisit.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # BUG FIX: '&param.' had been mangled into '¶m.' by HTML-entity
    # decoding of '&para'; restore the intended recorder parameters
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/get?foo=bar',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 3)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 3

    # two full json records + the original's revisit from a prior test
    mimes = [CDXObject(x)['mime'] for x in res]
    assert sorted(mimes) == [
        'application/json', 'application/json', 'warc/revisit'
    ]

    # per-record writer must not leave file handles cached
    assert len(writer.fh_cache) == 0
def test_record_param_user_coll(self):
    """Record one URL through per-user/coll WARC paths and verify the
    resulting cdx entry and warc registration in redis.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', None)

    # BUG FIX: the query string had been corrupted by HTML-entity
    # decoding ('&para' -> '¶'); restore '&param.recorder...' params
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 1

    cdx = CDXObject(res[0])
    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'application/json'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    # the warc file must be registered under the user/coll warc hash
    warcs = r.hgetall('USER:COLL:warc')
    full_path = to_path(self.root_dir + '/warcs/' + cdx['filename'])
    assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
def test_resolver_http_prefix_not_wildcard(self):
    """An http:// prefix containing '*' is used literally, not globbed."""
    resolver = DefaultResolverMixin.make_best_resolver('http://example.com/*/')
    resolved = resolver('example.warc.gz', CDXObject())
    assert resolved == 'http://example.com/*/example.warc.gz'
def detect_pages(self, coll, rec, upload_key, total_size):
    """Find pages in recording, crediting 25% of *total_size* to the
    upload progress counter, spread evenly across cdx entries.

    :param str coll: collection ID
    :param str rec: recording ID
    :param str upload_key: redis hash key whose 'size' field tracks
        upload progress
    :param int total_size: total upload size in bytes
    :returns: pages
    :rtype: list
    """
    key = self.cdxj_key.format(coll=coll, rec=rec)

    pages = []

    total_cdx = self.redis.zcard(key)
    # BUG FIX: an empty recording (no cdx entries) previously raised
    # ZeroDivisionError when computing the per-entry increment
    if not total_cdx:
        return pages

    incr = int((total_size * 0.25) / total_cdx)

    count = 0

    for member, score in self.redis.zscan_iter(key, match='*', count=100):
        cdxj = CDXObject(member.encode('utf-8'))
        count += 1
        self.redis.hincrby(upload_key, 'size', incr)

        if self.is_page(cdxj):
            pages.append(dict(url=cdxj['url'],
                              title=cdxj['url'],
                              timestamp=cdxj['timestamp']))

            if self.max_detect_pages and len(pages) > self.max_detect_pages:
                # page cap reached: credit the size for the entries we
                # will not scan, then stop
                self.redis.hincrby(upload_key, 'size',
                                   incr * (total_cdx - count))
                break

    return pages
def rewrite_record(self, headers, content, ts,
                   url='http://example.com/',
                   prefix='http://localhost:8080/prefix/',
                   warc_headers=None, request_url=None,
                   is_live=None, use_js_proxy=True, environ=None):
    """Create a response record + cdx for *url* and rewrite it with
    either the js-proxy rewriter or the plain content rewriter."""
    record = self._create_response_record(url, headers, content, warc_headers)

    url_rewriter = UrlRewriter(WbUrl(ts + '/' + (request_url or url)), prefix)

    capture = CDXObject()
    capture['url'] = url
    capture['timestamp'] = ts
    capture['urlkey'] = canonicalize(url)
    # differing request url marks a fuzzy-matched capture
    if request_url != url:
        capture['is_fuzzy'] = '1'
    capture['is_live'] = is_live

    def insert_func(rule, cdx):
        # no head insert needed for these tests
        return ''

    rewriter = (self.js_proxy_content_rewriter if use_js_proxy
                else self.content_rewriter)

    return rewriter(record, url_rewriter,
                    cookie_rewriter=None,
                    head_insert_func=insert_func,
                    cdx=capture,
                    environ=environ)
def test_redis_resolver_multi_key_with_member_hash(self):
    """With member_key_templ, the wildcard resolver only searches the
    per-member warc hashes listed in the member hash."""
    resolver = RedisResolver('redis://127.0.0.1:6379/0/*:warc',
                             member_key_templ='member_hash')
    cdx = CDXObject()

    # nothing registered yet
    assert resolver('example.warc.gz', cdx) == None

    resolver.redis.hset('A:warc', 'example.warc.gz',
                        'some_path/example.warc.gz')
    resolver.redis.hset('B:warc', 'example-2.warc.gz',
                        'some_path/example-2.warc.gz')
    resolver.redis.hset('member_hash', '1', 'A')

    # only A:warc used
    assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz'
    assert resolver('example-2.warc.gz', cdx) == None

    resolver.redis.hset('member_hash', '2', 'B')

    # A:warc and B:warc used
    assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz'
    assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz'

    assert resolver.member_key_type == 'hash'
def test_anon_download_coll(self):
    """Download the anonymous temp collection and verify the cdx entries
    of the returned WARC.
    """
    res = self._get_anon('/temp/$download')
    assert res.headers['Content-Disposition'].startswith(
        "attachment; filename*=UTF-8''temp-")

    warcin = self._get_dechunked(res.body)

    cdxout = BytesIO()
    write_cdx_index(cdxout, warcin, 'temp.warc.gz',
                    include_all=True, cdxj=True)

    cdx = [
        CDXObject(line) for line in cdxout.getvalue().rstrip().split(b'\n')
    ]
    assert len(cdx) == 10

    # BUG FIX: the checks below used '=' (assignment), silently
    # overwriting the cdx fields instead of asserting their values

    # response
    assert cdx[0]['url'] == 'http://httpbin.org/get?food=bar'
    assert cdx[0]['mime'] == 'application/json'

    # request
    assert cdx[1]['url'] == 'http://httpbin.org/get?food=bar'
    assert cdx[1]['mime'] == '-'

    # response
    assert cdx[2]['url'] == 'http://httpbin.org/get?bood=far'
    assert cdx[2]['mime'] == 'application/json'

    # request
    assert cdx[3]['url'] == 'http://httpbin.org/get?bood=far'
    assert cdx[3]['mime'] == '-'
def test_path_index_resolvers(self):
    """PathIndexResolver yields every path listed for a filename, and
    nothing for unknown filenames."""
    index_file = os.path.join(get_test_dir(), 'text_content', 'pathindex.txt')
    path_index = PathIndexResolver(index_file)
    cdx = CDXObject()

    assert list(path_index('example.warc.gz', cdx)) == [
        'invalid_path', 'sample_archive/warcs/example.warc.gz']

    assert list(path_index('iana.warc.gz', cdx)) == [
        'sample_archive/warcs/iana.warc.gz']

    assert list(path_index('not-found.gz', cdx)) == []
def test_resolver_dir_wildcard_as_file_url(self):
    """A file:// URL with a '*/' wildcard is globbed into subdirectories."""
    wildcard_url = to_file_url(get_test_dir()) + '/*/'
    resolver = DefaultResolverMixin.make_best_resolver(wildcard_url)

    found = resolver('example.warc.gz', CDXObject())
    assert len(found) == 1

    expected = os.path.abspath(
        os.path.join(get_test_dir(), 'warcs', 'example.warc.gz'))
    assert found[0] == expected
def do_load(lines):
    # generator: one CDXObject per non-blank line, with its load url
    # resolved via the enclosing scope's self/params
    for raw in lines:
        if not raw:
            continue
        entry = CDXObject(raw)
        self._set_load_url(entry, params)
        yield entry
def _add_rule(self, url, access, exact_match=False, user=None):
    """Adds an rule to the acl file

    :param str url: The URL for the rule
    :param str access: The access value for the rule
    :param bool exact_match: Is the rule to be added an exact match
    :param str|None user: Optional user the rule applies to
    :rtype: None
    """
    # reject rules with an invalid access value up front
    if not self.validate_access(access):
        return

    acl = CDXObject()
    acl['urlkey'] = self.to_key(url, exact_match)
    acl['timestamp'] = '-'
    acl['access'] = access
    acl['url'] = url
    if user:
        acl['user'] = user

    i = 0
    replace = False

    # scan for either an identical rule (same urlkey/timestamp/user ->
    # replace in place) or the first rule the new one sorts before
    # (insert there); self.rules is assumed to be kept sorted in
    # descending CDX order by this same procedure
    for rule in self.rules:
        if acl['urlkey'] == rule['urlkey'] and acl['timestamp'] == rule[
                'timestamp'] and acl.get('user') == rule.get('user'):
            replace = True
            break
        if acl > rule:
            break
        i += 1

    if replace:
        print('Existing Rule Found, Replacing:')
        self.print_rule(self.rules[i])
        print('with:')
        self.print_rule(acl)
        self.rules[i] = acl
    else:
        print('Added new Rule:')
        self.print_rule(acl)
        self.rules.insert(i, acl)

    # persist the updated rule list
    self.save_acl()
def test_resolver_dir_wildcard_with_coll(self):
    """The '*' in a non-file template is filled from the cdx source coll."""
    resolver = DefaultResolverMixin.make_best_resolver(
        's3://bucket/colls/*/archives/')

    cdx = CDXObject()
    cdx['source'] = 'my-coll/indexes/index.cdxj'
    cdx['source-coll'] = 'my-coll'

    resolved = resolver('example.warc.gz', cdx)
    assert resolved == 's3://bucket/colls/my-coll/archives/example.warc.gz'
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Load the record for *cdx*, printing its headers and the first two
    payload lines; on ArchiveLoadFailed, print the exception name
    (or re-raise when *reraise* is set)."""
    resolvers = DefaultResolverMixin.make_resolvers(test_warc_dir)
    resolve_loader = ResolvingLoader(resolvers)
    entry = CDXObject(cdx.encode('utf-8'))
    try:
        headers, stream = resolve_loader(entry, failed_files, revisit_func)
        print(repr_format(headers))
        sys.stdout.write(stream.readline().decode('utf-8'))
        sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as e:
        if not reraise:
            print('Exception: ' + e.__class__.__name__)
        else:
            raise
def test_redis_resolver(self):
    """RedisResolver looks up warc paths in a single redis hash.

    Fixed PEP 8 E711: comparison to None now uses ``is None``
    instead of ``== None``.
    """
    resolver = RedisResolver('redis://127.0.0.1:6379/0/warc_map')
    cdx = CDXObject()

    # not registered yet -> no path
    assert resolver('example.warc.gz', cdx) is None

    resolver.redis.hset(resolver.redis_key_template,
                        'example.warc.gz', 'some_path/example.warc.gz')
    assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz'
def load_index(self, params):
    """Synthesize a single cdx entry whose load/memento urls point at the
    configured proxy url."""
    cdx = CDXObject()
    cdx['urlkey'] = params.get('key').decode('utf-8')
    # prefer the requested 'closest' timestamp; fall back to now
    cdx['timestamp'] = params.get('closest') or timestamp_now()
    cdx['url'] = params['url']

    proxy_load_url = res_template(self.proxy_url, params)
    cdx['load_url'] = proxy_load_url
    cdx['memento_url'] = proxy_load_url

    return self._do_load(cdx, params)
def test_redis_resolver_multi_key(self):
    """A wildcard redis key template searches all matching warc hashes.

    Fixed PEP 8 E711: comparison to None now uses ``is None``
    instead of ``== None``.
    """
    resolver = RedisResolver('redis://127.0.0.1:6379/0/*:warc')
    cdx = CDXObject()

    # nothing registered yet -> no path
    assert resolver('example.warc.gz', cdx) is None

    resolver.redis.hset('A:warc', 'example.warc.gz',
                        'some_path/example.warc.gz')
    resolver.redis.hset('B:warc', 'example-2.warc.gz',
                        'some_path/example-2.warc.gz')

    # both A:warc and B:warc are consulted
    assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz'
    assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz'
def test_resolveRevisits_orig_fields(self):
    """
    when resolveRevisits=true, extra three fields are named
    ``orig.length``, ``orig.offset`` and ``orig.filename``, respectively.
    it is possible to filter fields by these names.
    """
    resp = self.query('http://www.iana.org/_css/2013.1/print.css',
                      resolveRevisits='1',
                      fields='urlkey,orig.length,orig.offset,orig.filename')
    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    lines = resp.text.splitlines()

    # first capture is the original: all orig.* fields are '-'
    first = CDXObject(lines[0].encode('utf-8'))
    assert first['orig.offset'] == '-'
    assert first['orig.length'] == '-'
    assert first['orig.filename'] == '-'

    # each later capture is a revisit pointing into iana.warc.gz
    for line in lines[1:]:
        revisit = CDXObject(line.encode('utf-8'))
        assert revisit['orig.offset'] != '-'
        assert revisit['orig.length'] != '-'
        assert revisit['orig.filename'] == 'iana.warc.gz'
def test_collapseTime_resolveRevisits_reverse(self):
    """collapseTime + resolveRevisits + reverse yields the collapsed
    captures in descending timestamp order."""
    resp = self.query('http://www.iana.org/_css/2013.1/print.css',
                      collapseTime='11',
                      resolveRevisits='true',
                      reverse='true')
    entries = [CDXObject(line) for line in resp.body.splitlines()]
    assert len(entries) == 3

    # timestamp is in descending order
    for earlier, later in zip(entries, entries[1:]):
        assert earlier['timestamp'] >= later['timestamp']