def test_cdx_all_coll(self):
    """Check the CDXJ rows returned by the ``/all`` collection CDX endpoint.

    The prefix query for ``http://httpbin.org/get*`` must yield four rows;
    their url, urlkey, source index and source collection are verified
    column by column, and rows 1 and 3 must share the same filename.
    """
    response = self.testapp.get('/all/cdx?url=http://httpbin.org/get*&output=json')
    rows = [json.loads(raw) for raw in response.text.rstrip().split('\n')]

    assert len(rows) == 4

    expected_urls = [
        'http://httpbin.org/get?A=B',
        'http://httpbin.org/get?A=B',
        'http://httpbin.org/get?C=D',
        'http://httpbin.org/get?C=D2',
    ]
    for row, url in zip(rows, expected_urls):
        assert row['url'] == url

    expected_urlkeys = [
        'org,httpbin)/get?__pywb_method=head&a=b',
        'org,httpbin)/get?a=b',
        'org,httpbin)/get?c=d',
        'org,httpbin)/get?c=d2',
    ]
    for row, urlkey in zip(rows, expected_urlkeys):
        assert row['urlkey'] == urlkey

    expected_sources = [
        'test/indexes/autoindex.cdxj',
        'test/indexes/autoindex.cdxj',
        'test2/indexes/autoindex.cdxj',
        'test/indexes/autoindex.cdxj',
    ]
    for row, index_file in zip(rows, expected_sources):
        assert row['source'] == to_path(index_file)

    expected_colls = ['test', 'test', 'test2', 'test']
    for row, coll in zip(rows, expected_colls):
        assert row['source-coll'] == coll

    assert rows[1]['filename'] == rows[3]['filename']
def test_agg_dir_sources_2(self):
    """List index sources when ``param.coll`` is the explicit set ``[A,C]``.

    The source list must contain exactly the index files of collections
    A and C, each reported with type ``file``.
    """
    query = {'url': 'example.com/', 'param.coll': '[A,C]'}
    listed = self.dir_loader.get_source_list(query)

    index_files = ['colls:A/indexes/example2.cdxj',
                   'colls:C/indexes/dupes.cdxj']
    expected = {'sources': {to_path(name): 'file' for name in index_files}}

    assert listed == expected
def test_timemap_all_coll(self):
    """Fetch the link-format timemap from ``/all`` and check collection tags.

    The response must have five lines; line 3 must reference collection
    ``test2`` and line 4 collection ``test``.
    """
    response = self.testapp.get('/all/timemap/link/http://httpbin.org/get?C=D')
    entries = response.text.rstrip().split('\n')

    assert len(entries) == 5

    for position, coll_attr in ((3, 'collection="test2"'),
                                (4, 'collection="test"')):
        assert to_path(coll_attr) in entries[position]
def test_record_param_user_coll(self):
    """Record one response into a ``{user}/{coll}`` WARC directory.

    Verifies that a single WARC is written under ``USER/COLL``, that Redis
    holds exactly one matching CDXJ entry, and that the ``USER:COLL:warc``
    hash maps the WARC filename to its full path.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    # Passing None asserts that the collection directory does not exist yet.
    self._test_all_warcs('/warcs/USER/COLL/', None)

    # The extra query params must be '&'-separated; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 1

    cdx = CDXObject(res[0])
    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'application/json'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    warcs = r.hgetall('USER:COLL:warc')
    full_path = to_path(self.root_dir + '/warcs/' + cdx['filename'])
    assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
def test_cache_dir_sources_1(self):
    """Check that the caching directory source notices a newly added index.

    The source list is queried twice with identical results, then a new
    ``empty.cdxj`` file is created in collection C and a third query must
    include it.
    """
    def list_all_sources():
        # A fresh params dict is built for every call, as the loader receives it.
        return self.cache_dir_loader.get_source_list(
            {'url': 'example.com/', 'param.coll': '*'})

    exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
                       to_path('colls:B/indexes/iana.cdxj'): 'file',
                       to_path('colls:C/indexes/dupes.cdxj'): 'file'}
          }

    first = list_all_sources()
    assert first == exp

    second = list_all_sources()
    assert second == exp

    new_file = os.path.join(self.root_dir, to_path('colls/C/indexes/empty.cdxj'))

    # ensure new file is created at least a second later
    time.sleep(1.0)

    # Opening in append mode creates the file; utime then refreshes its times.
    with open(new_file, 'a'):
        os.utime(new_file, None)

    third = list_all_sources()

    # New File Included
    exp['sources'][to_path('colls:C/indexes/empty.cdxj')] = 'file'
    assert third == exp
def test_record_param_user_coll_revisit(self):
    """Record the same URL again and expect a ``revisit`` record.

    Starting from one existing WARC in ``USER/COLL``, a second write must
    add a second WARC, a second CDXJ entry with mime ``warc/revisit``, and a
    WARC record whose headers refer back to the original capture.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # The extra query params must be '&'-separated; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    # Pick whichever of the two entries is the revisit.
    if b'warc/revisit' in res[0]:
        cdx = CDXObject(res[0])
    else:
        cdx = CDXObject(res[1])

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_agg_all_found_2(self):
    """Query ``example.com/`` across all collections of the directory source.

    Expects two captures from collection C followed by one from collection
    A, and no errors.
    """
    res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})

    # (index file, timestamp, WARC filename) for each expected result row.
    rows = [
        ('colls:C/indexes/dupes.cdxj', '20140127171200', 'dupes.warc.gz'),
        ('colls:C/indexes/dupes.cdxj', '20140127171251', 'dupes.warc.gz'),
        ('colls:A/indexes/example2.cdxj', '20160225042329', 'example2.warc.gz'),
    ]
    expected = [{'source': to_path(index), 'timestamp': stamp, 'filename': warc}
                for index, stamp, warc in rows]

    assert to_json_list(res) == expected
    assert errs == {}
def test_agg_all_found_1(self):
    """Query ``iana.org/`` across all collections of the directory source.

    Expects one capture from collection B followed by two identical
    captures from collection C, and no errors.
    """
    res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})

    # (index file, timestamp, WARC filename) for each expected result row.
    rows = [
        ('colls:B/indexes/iana.cdxj', '20140126200624', 'iana.warc.gz'),
        ('colls:C/indexes/dupes.cdxj', '20140127171238', 'dupes.warc.gz'),
        ('colls:C/indexes/dupes.cdxj', '20140127171238', 'dupes.warc.gz'),
    ]
    expected = [{'source': to_path(index), 'timestamp': stamp, 'filename': warc}
                for index, stamp, warc in rows]

    assert to_json_list(res) == expected
    assert errs == {}
def test_record_param_user_coll_revisit(self):
    """Record the same URL again and expect a ``revisit`` record.

    Starting from one existing WARC in ``USER/COLL``, a second write must
    add a second WARC, a second CDXJ entry with mime ``warc/revisit``, and a
    WARC record whose headers refer back to the original capture.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # The extra query params must be '&'-separated; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    extra_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                 extra_params)

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    # Pick whichever of the two entries is the revisit.
    if b'warc/revisit' in res[0]:
        cdx = CDXObject(res[0])
    else:
        cdx = CDXObject(res[1])

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_record_video_metadata(self):
    """Record a ``metadata://`` video-info request as a WARC metadata record.

    Skipped when ``youtube_dl`` is not importable. The single WARC registered
    under ``USER:VIDEO:warc`` must start with a ``metadata`` record carrying
    the youtube-dl content type and matching block/payload digests.
    """
    pytest.importorskip('youtube_dl')

    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The user must be 'USER' so the WARC is registered under the
    # 'USER:VIDEO:warc' key read back below (the value had been masked out).
    params = {'param.recorder.user': 'USER',
              'param.recorder.coll': 'VIDEO',
              'content_type': 'application/vnd.youtube-dl_formats+json'
             }

    resp = self._test_warc_write(
        recorder_app, 'www.youtube.com', '/v/BfBgWtAIbRc',
        '&' + urlencode(params),
        link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

    r = FakeStrictRedis.from_url('redis://localhost/2')

    warcs = r.hgetall('USER:VIDEO:warc')
    assert len(warcs) == 1

    filename = list(warcs.values())[0]

    with open(filename, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
def test_record_param_user_coll_skip(self):
    """Record a duplicate URL under ``SkipDupePolicy``.

    The WARC count in ``USER/COLL`` and the number of CDXJ entries in Redis
    must both stay at two, i.e. nothing new is written.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    # No new entries written
    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # The extra query params must be '&'-separated; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2
def test_agg_collB_found(self):
    """Query ``iana.org/`` restricted to collection B; expect one capture."""
    query = {'url': 'iana.org/', 'param.coll': 'B'}
    res, errs = self.dir_loader(query)

    expected_row = {
        'source': to_path('colls:B/indexes/iana.cdxj'),
        'timestamp': '20140126200624',
        'filename': 'iana.warc.gz',
    }

    assert to_json_list(res) == [expected_row]
    assert errs == {}
def test_record_param_user_coll_write_dupe_no_revisit(self):
    """Record into ``USER/COLL`` under ``WriteDupePolicy``.

    After the write there must be three WARCs and three CDXJ entries whose
    mime types are two ``application/json`` and one ``warc/revisit``; the
    per-record writer must hold no open file handles afterwards.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra query params must be '&'-separated; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/get?foo=bar',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 3)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 3

    mimes = [CDXObject(x)['mime'] for x in res]

    assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']

    assert len(writer.fh_cache) == 0
def test_record_param_user_coll_write_dupe_no_revisit(self):
    """Record into ``USER/COLL`` under ``WriteDupePolicy``.

    After the write there must be three WARCs and three CDXJ entries whose
    mime types are two ``application/json`` and one ``warc/revisit``; the
    per-record writer must hold no open file handles afterwards.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra query params must be '&'-separated; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    extra_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar',
                                 extra_params)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 3)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 3

    mimes = [CDXObject(x)['mime'] for x in res]

    assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']

    assert len(writer.fh_cache) == 0
def test_record_skip_http_only_cookies_header(self):
    """Record with ``ExcludeHttpOnlyCookieHeaders`` as the header filter.

    The live response must carry both HttpOnly and other cookies, while the
    stored response keeps only the non-HttpOnly ones. The stored request
    keeps ``X-Other`` but not the ``Cookie`` header.
    """
    warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
    header_filter = ExcludeHttpOnlyCookieHeaders()
    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, header_filter=header_filter),
        accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
    assert b'HTTP/1.1 302' in resp.body

    buff = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(buff)
    non_http_only, http_only = self._get_http_only_cookies(record)
    # both httponly and other cookies
    assert http_only is not None
    assert non_http_only is not None

    stored_req, stored_resp = self._load_resp_req(warc_path)
    non_http_only, http_only = self._get_http_only_cookies(stored_resp)
    # no httponly cookies
    assert http_only is None
    assert non_http_only is not None

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
def test_record_skip_http_only_cookies_header(self):
    """Record with ``ExcludeHttpOnlyCookieHeaders`` as the header filter.

    The live response must carry both HttpOnly and other cookies, while the
    stored response keeps only the non-HttpOnly ones. The stored request
    keeps ``X-Other`` but not the ``Cookie`` header.
    """
    warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
    header_filter = ExcludeHttpOnlyCookieHeaders()
    writer = PerRecordWARCWriter(warc_path, header_filter=header_filter)
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
    assert b'HTTP/1.1 302' in resp.body

    buff = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(buff)
    non_http_only, http_only = self._get_http_only_cookies(record)
    # both httponly and other cookies
    assert http_only is not None
    assert non_http_only is not None

    stored_req, stored_resp = self._load_resp_req(warc_path)
    non_http_only, http_only = self._get_http_only_cookies(stored_resp)
    # no httponly cookies
    assert http_only is None
    assert non_http_only is not None

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
def test_agg_collA_found(self):
    """Query ``example.com/`` restricted to collection A; expect one capture."""
    query = {'url': 'example.com/', 'param.coll': 'A'}
    res, errs = self.dir_loader(query)

    expected_row = {
        'source': to_path('colls:A/indexes/example2.cdxj'),
        'timestamp': '20160225042329',
        'filename': 'example2.warc.gz',
    }

    assert to_json_list(res) == [expected_row]
    assert errs == {}
def test_record_skip_all_cookies_header(self):
    """Record with ``Set-Cookie`` and ``Cookie`` headers excluded.

    The live response still shows both ``Set-Cookie`` headers, while the
    stored response has neither. The stored request keeps ``X-Other`` but
    not ``Cookie``.
    """
    warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
    header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
    writer = PerRecordWARCWriter(warc_path, header_filter=header_filter)
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in resp.body

    live_record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
    assert ('Set-Cookie', 'name=value; Path=/') in live_record.http_headers.headers
    assert ('Set-Cookie', 'foo=bar; Path=/') in live_record.http_headers.headers

    stored_req, stored_resp = self._load_resp_req(warc_path)

    assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers
    assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip/', 1)
def test_record_custom_record(self):
    """PUT a custom ``resource`` record and verify what is stored.

    Sends raw bytes with a custom ``WARC-Custom`` header, then reads the
    WARC registered under ``META:warc`` and checks the WARC headers, the
    payload, and the two HTTP headers exposed on the parsed record.
    """
    dedup_index = self._get_dedup_index(user=False)

    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Query params must be '&'-separated; the previous literal had '&para'
    # mis-decoded into a pilcrow character.
    req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {'content-type': 'text/plain',
               'WARC-Custom': 'foo'}

    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

    # All record checks run while the file is open so that reading the
    # payload stream never touches a closed handle.
    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(
            decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

    writer.close()
    assert len(writer.fh_cache) == 0
def setup_class(cls):
    """Create the ``warcs`` directory and set the upstream URL for the class.

    Runs the parent ``setup_class`` first; the upstream URL points at
    ``localhost`` on the port of ``cls.server``.
    """
    super(TestRecorder, cls).setup_class()

    warcs_dir = to_path(cls.root_dir + '/warcs')
    cls.warcs_dir = warcs_dir
    os.makedirs(warcs_dir)

    port = cls.server.port
    cls.upstream_url = 'http://localhost:{0}'.format(port)
def test_record_multiple_writes_rollover_idle(self):
    """Check that an idle WARC file is closed and a new one is started.

    Two records go into one WARC under ``GOO``. After sleeping past
    ``max_idle_secs`` and calling ``close_idle_files()``, a third record
    must produce a second WARC file.
    """
    warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra query param must start with '&'; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    coll_param = '&param.recorder.coll=GOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 1)

    # Sleep longer than max_idle_secs (0.9) so the open file counts as idle.
    time.sleep(1.0)
    writer.close_idle_files()

    # Third Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?goo=bar', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"goo": "bar"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 2)

    writer.close()
    assert len(writer.fh_cache) == 0
def test_extra_agg_collB(self):
    """Wrap the directory source in a ``SimpleAggregator`` named ``dir``.

    Querying collection B must return the single iana capture with the
    source prefixed by ``dir:``, and no errors.
    """
    aggregator = SimpleAggregator({'dir': self.dir_loader})
    query = {'url': 'iana.org/', 'param.coll': 'B'}
    res, errs = aggregator(query)

    expected_row = {
        'source': to_path('dir:colls:B/indexes/iana.cdxj'),
        'timestamp': '20140126200624',
        'filename': 'iana.warc.gz',
    }

    assert to_json_list(res) == [expected_row]
    assert errs == {}
def test_record_skip_wrong_coll(self):
    """Proxy a request through a recorder that accepts only ``not-live``.

    The response is still returned, and the ``warcs`` directory is expected
    to hold two WARC files afterwards.
    """
    warcs_root = to_path(self.root_dir + '/warcs/')
    recorder_app = RecorderApp(self.upstream_url,
                               writer=PerRecordWARCWriter(warcs_root),
                               accept_colls='not-live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    self._test_all_warcs('/warcs/', 2)
def test_record_custom_record(self):
    """PUT a custom ``resource`` record and verify what is stored.

    Sends raw bytes with a custom ``WARC-Custom`` header, then reads the
    WARC registered under ``META:warc`` and checks the WARC headers, the
    payload, and the two HTTP headers exposed on the parsed record.
    """
    dedup_index = self._get_dedup_index(user=False)

    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Query params must be '&'-separated; the previous literal had '&para'
    # mis-decoded into a pilcrow character.
    req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {'content-type': 'text/plain',
               'WARC-Custom': 'foo'
              }

    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

    # All record checks run while the file is open so that reading the
    # payload stream never touches a closed handle.
    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

    writer.close()
    assert len(writer.fh_cache) == 0
def test_agg_dir_and_memento(self):
    """Aggregate a remote memento source (``ia``) with the local directory.

    With ``closest=20100512`` and ``limit=6`` the result must be three
    ``ia`` captures followed by three local captures, with no errors.
    """
    ia_source = MementoIndexSource.from_timegate_url('http://web.archive.org/web/')
    agg_source = SimpleAggregator({'ia': ia_source, 'local': self.dir_loader})

    query = {'url': 'example.com/', 'param.local.coll': '*',
             'closest': '20100512', 'limit': 6}
    res, errs = agg_source(query)

    remote_rows = [
        {'source': 'ia', 'timestamp': '20100514231857',
         'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
        {'source': 'ia', 'timestamp': '20100519202418',
         'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
        {'source': 'ia', 'timestamp': '20100501123414',
         'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
    ]

    # (index file, timestamp, WARC filename) for the locally indexed rows.
    local_specs = [
        ('local:colls:C/indexes/dupes.cdxj', '20140127171200', 'dupes.warc.gz'),
        ('local:colls:C/indexes/dupes.cdxj', '20140127171251', 'dupes.warc.gz'),
        ('local:colls:A/indexes/example2.cdxj', '20160225042329', 'example2.warc.gz'),
    ]
    local_rows = [{'source': to_path(index), 'timestamp': stamp, 'filename': warc}
                  for index, stamp, warc in local_specs]

    assert to_json_list(res) == remote_rows + local_rows
    assert errs == {}
def test_record_warc_1(self):
    """Record one request with a per-record writer; expect one WARC file."""
    warcs_root = to_path(self.root_dir + '/warcs/')
    writer = PerRecordWARCWriter(warcs_root)
    recorder_app = RecorderApp(self.upstream_url, writer)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    self._test_all_warcs('/warcs/', 1)
def test_error_url(self):
    """Request through a recorder whose upstream URL has ``01`` appended.

    The request must return HTTP 400 with a non-empty ``error`` field, and
    the ``warcs`` directory is expected to hold two WARC files.
    """
    bad_upstream = self.upstream_url + '01'
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(bad_upstream, writer, accept_colls='live')

    client = webtest.TestApp(recorder_app)
    resp = client.get('/live/resource?url=http://example.com/', status=400)

    assert resp.json['error'] != ''

    self._test_all_warcs('/warcs/', 2)
def _test_all_warcs(self, dirname, num):
    """Assert how many ``.warc.gz`` files exist in a directory under the root.

    :param dirname: path suffix appended to ``self.root_dir``
    :param num: expected number of regular files, or ``None`` to assert
        that the directory does not exist
    :return: ``(files, coll_dir)`` when ``num`` is not ``None``,
        otherwise ``None``
    """
    coll_dir = to_path(self.root_dir + dirname)

    # The directory must exist exactly when a file count is expected.
    assert os.path.isdir(coll_dir) == (num is not None)
    if num is None:
        return None

    files = [x for x in os.listdir(coll_dir)
             if os.path.isfile(os.path.join(coll_dir, x))]
    assert len(files) == num
    assert all(x.endswith('.warc.gz') for x in files)

    self._verify_content_len(coll_dir, files)

    return files, coll_dir
def test_record_file_warc_keep_open(self):
    """Record into a fixed WARC file with a multi-file writer.

    The file must exist after the write with one handle cached by the
    writer; closing the writer must empty the handle cache.
    """
    path = to_path(self.root_dir + '/warcs/A.warc.gz')
    writer = MultiFileWARCWriter(path)
    recorder_app = RecorderApp(self.upstream_url, writer)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    assert os.path.isfile(path)
    assert len(writer.fh_cache) == 1

    writer.close()
    assert len(writer.fh_cache) == 0
def _get_dedup_index(self, dupe_policy=WriteRevisitDupePolicy(), user=True):
    """Build a ``WritableRedisIndexer`` keyed per user+collection or per collection.

    :param dupe_policy: policy object passed through to the indexer; the
        default instance is created once, when the function is defined
    :param user: when true, the key templates include ``{user}``;
        otherwise only ``{coll}``
    :return: the configured ``WritableRedisIndexer``
    """
    if user:
        templates = ('{user}:{coll}:warc',
                     'redis://localhost/2/{user}:{coll}:cdxj')
    else:
        templates = ('{coll}:warc',
                     'redis://localhost/2/{coll}:cdxj')

    file_key_template, redis_url = templates

    return WritableRedisIndexer(
        redis_url=redis_url,
        file_key_template=file_key_template,
        rel_path_template=to_path(self.root_dir + '/warcs/'),
        dupe_policy=dupe_policy)
# Class-level setup for the webrecorder tests.
# Creates the warcs directory, exports RECORD_ROOT / WR_CONFIG (and
# WR_USER_CONFIG when extra_config_file is given), builds a FakeStrictRedis
# from REDIS_BASE_URL and calls cls.custom_init(kwargs). Unless
# kwargs['no_app'] is truthy, it then creates an AppController with a webtest
# client; when init_anon is true it fetches an anonymous user from
# /api/v1/anon_user, otherwise cls.anon_user is set to None.
# NOTE(review): the REDIS_BASE_URL / EMAIL_SMTP_URL statements contain masked
# values ('*****', '[email protected]') and an unmatched ')'. Text appears to
# have been lost between them, so this line does not parse as written.
# Restore the span from version control - TODO confirm the original values.
def setup_class(cls, extra_config_file='test_no_invites_config.yaml', init_anon=True, **kwargs): super(BaseWRTests, cls).setup_class() cls.warcs_dir = to_path(cls.root_dir + '/warcs/') os.makedirs(cls.warcs_dir) os.environ['RECORD_ROOT'] = cls.warcs_dir os.environ['WR_CONFIG'] = 'pkg://webrecorder/config/wr.yaml' if extra_config_file: os.environ['WR_USER_CONFIG'] = os.path.join( cls.get_curr_dir(), extra_config_file) os.environ['REDIS_BASE_URL'] = 'redis://*****:*****@localhost') cls.set_nx_env('EMAIL_SMTP_URL', 'smtp://[email protected]:test@localhost:25') cls.redis = FakeStrictRedis.from_url(os.environ['REDIS_BASE_URL'], decode_responses=True) cls.custom_init(kwargs) if kwargs.get('no_app'): return cls.appcont = AppController() cls.testapp = webtest.TestApp(cls.appcont.app) if init_anon: res = cls.testapp.get('/api/v1/anon_user') cls.anon_user = res.json['anon_user'] else: cls.anon_user = None
def test_record_param_user_coll_same_dir(self):
    """Record two collections of one user into a single shared directory.

    The writer uses ``key_template='{user}:{coll}'`` with a fixed
    ``warcs2`` directory; writing for ``COLL2`` and ``COLL3`` must leave two
    WARC files in that directory.
    """
    warc_path = to_path(self.root_dir + '/warcs2/')

    dedup_index = self._get_dedup_index()

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index,
                                 key_template='{user}:{coll}')
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra query params must be '&'-separated; the previous literals had
    # '&para' mis-decoded into a pilcrow character.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/get?foo=bar',
        '&param.recorder.user=USER2&param.recorder.coll=COLL2')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/get?foo=bar',
        '&param.recorder.user=USER2&param.recorder.coll=COLL3')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs2', 2)
def test_record_cookies_header(self):
    """Record a cookie-setting response with no header filter configured.

    Both ``Set-Cookie`` headers must appear in the live and the stored
    response, and the stored request must keep ``X-Other`` and ``Cookie``.
    """
    base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
    writer = PerRecordWARCWriter(base_path)
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in resp.body

    live_record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
    assert ('Set-Cookie', 'name=value; Path=/') in live_record.http_headers.headers
    assert ('Set-Cookie', 'foo=bar; Path=/') in live_record.http_headers.headers

    stored_req, stored_resp = self._load_resp_req(base_path)

    assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.http_headers.headers
    assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.http_headers.headers

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookiecheck/', 1)
def test_record_param_user_coll_skip(self):
    """Record a duplicate URL under ``SkipDupePolicy``.

    The WARC count in ``USER/COLL`` and the number of CDXJ entries in Redis
    must both stay at two, i.e. nothing new is written.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # No new entries written
    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # The extra query params must be '&'-separated; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    extra_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                 extra_params)

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2
def setup_class(cls):
    """Lay out three collections (A, B, C) with one CDXJ index each.

    Also writes a stray ``somefile`` into the root directory and creates
    both a plain and a caching directory index source over ``colls``.
    """
    super(TestDirAgg, cls).setup_class()

    # Index file copied into each collection's ``indexes`` directory.
    index_dirs = [to_path(cls.root_dir + '/colls/' + coll + '/indexes')
                  for coll in ('A', 'B', 'C')]
    index_files = ['example2.cdxj', 'iana.cdxj', 'dupes.cdxj']

    for index_dir in index_dirs:
        os.makedirs(index_dir)

    dir_prefix = os.path.join(cls.root_dir, 'colls')
    dir_path = '{coll}/indexes'
    dir_name = 'colls'

    for cdx_name, index_dir in zip(index_files, index_dirs):
        shutil.copy(to_path(TEST_CDX_PATH + cdx_name), index_dir)

    with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
        fh.write('foo')

    cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path, dir_name)
    cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path, dir_name)
def test_timemap_all_coll(self):
    """Fetch the link-format timemap from ``/all`` and check its size.

    The response must have four lines, the last of which references
    collection ``test2``.
    """
    response = self.testapp.get('/all/timemap/link/http://httpbin.org/get?C=D')
    entries = response.text.rstrip().split('\n')

    assert len(entries) == 4

    expected_attr = to_path('collection="test2"')
    assert expected_attr in entries[3]
def test_record_multiple_writes_keep_open(self):
    """Write several records through one multi-file writer under ``FOO``.

    Two records must share a single WARC whose freshly generated CDXJ index
    equals the Redis entries. After the key is closed, a further write must
    start a second WARC, and both must be registered in ``FOO:warc``.
    """
    warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

    rel_path = to_path(self.root_dir + '/warcs/')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra query param must start with '&'; the previous literal had
    # '&para' mis-decoded into a pilcrow character.
    coll_param = '&param.recorder.coll=FOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    # Check two records in WARC
    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
    # os.path.join works whether or not coll_dir keeps its trailing separator.
    fullname = os.path.join(coll_dir, files[0])

    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]

    cdxres = cdxout.getvalue().strip()
    cdxres = cdxres.split(b'\n')
    cdxres = [CDXObject(x) for x in cdxres]

    assert cdxres == res

    assert len(writer.fh_cache) == 1

    writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

    assert len(writer.fh_cache) == 0

    writer.close()

    # Writing again after the close must start a new WARC file.
    self._test_warc_write(recorder_app, 'httpbin.org',
                          '/get?boo=far', coll_param)

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2

    writer.close()
    assert len(writer.fh_cache) == 0
# Class-level setup for the webrecorder tests.
# Creates the warcs directory, exports RECORD_ROOT / STORAGE_ROOT / WR_CONFIG
# (and WR_USER_CONFIG when extra_config_file is given), and replaces
# webrecorder.maincontroller.load_wr_config with a local loader that overrides
# the 'dyn_stats_key_templ' and 'dyn_ref_templ' config entries. It then builds
# FakeStrictRedis clients from REDIS_BASE_URL and REDIS_SESSION_URL and calls
# cls.custom_init(kwargs). Unless kwargs['no_app'] is truthy, it creates a
# MainController with a webtest client; when init_anon is true it POSTs to
# /api/v1/auth/anon_user and checks the temp-user session, otherwise
# cls.anon_user is set to None.
# NOTE(review): the REDIS_BASE_URL / EMAIL_SMTP_URL statements contain masked
# values ('*****', '[email protected]') and an unmatched ')'. Text appears to
# have been lost between them, so this line does not parse as written.
# REDIS_SESSION_URL is read below but not set anywhere visible here.
# Restore the span from version control - TODO confirm the original values.
def setup_class(cls, extra_config_file='test_no_invites_config.yaml', init_anon=True, **kwargs): super(BaseWRTests, cls).setup_class() cls.warcs_dir = to_path(cls.root_dir + '/warcs/') cls.storage_dir = os.path.join(to_path(cls.root_dir + '/storage/')) os.makedirs(cls.warcs_dir) os.environ['RECORD_ROOT'] = cls.warcs_dir os.environ['STORAGE_ROOT'] = cls.storage_dir cls.storage_today = os.path.join(cls.storage_dir, today_str()) os.environ['WR_CONFIG'] = 'pkg://webrecorder/config/wr.yaml' if extra_config_file: os.environ['WR_USER_CONFIG'] = os.path.join(cls.get_curr_dir(), extra_config_file) os.environ['REDIS_BASE_URL'] = 'redis://*****:*****@localhost') cls.set_nx_env('EMAIL_SMTP_URL', 'smtp://[email protected]:test@localhost:25') cls.set_nx_env('NO_REMOTE_BROWSERS', '1') def load_wr_config(): config = load_overlay_config('WR_CONFIG', 'pkg://webrecorder/config/wr.yaml', 'WR_USER_CONFIG', '') config['dyn_stats_key_templ'] = { 'rec': 'r:{rec}:<sesh_id>:stats:', 'coll': 'c:{coll}:<sesh_id>:stats:' } config['dyn_ref_templ'] = { 'rec': 'r:{rec}:<sesh_id>:ref:', 'coll': 'c:{coll}:<sesh_id>:ref:', } return config import webrecorder.maincontroller webrecorder.maincontroller.load_wr_config = load_wr_config cls.redis = FakeStrictRedis.from_url(os.environ['REDIS_BASE_URL'], decode_responses=True) cls.sesh_redis = FakeStrictRedis.from_url(os.environ['REDIS_SESSION_URL'], decode_responses=True) cls.custom_init(kwargs) if kwargs.get('no_app'): return cls.maincont = MainController() cls.testapp = webtest.TestApp(cls.maincont.app) if init_anon: res = cls.testapp.post('/api/v1/auth/anon_user') cls.anon_user = res.json['user']['username'] cls.assert_temp_user_sesh(cls.anon_user) else: cls.anon_user = None