def test_resolver_dir_wildcard(self):
    """Resolve ``example.warc.gz`` through a ``*`` directory pattern and expect one hit."""
    # The empty last component makes the joined pattern end with a separator.
    wildcard_dir = os.path.join(get_test_dir(), '*', '')
    resolve = DefaultResolverMixin.make_best_resolver(wildcard_dir)

    matches = resolve('example.warc.gz', CDXObject())

    expected_path = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
    assert len(matches) == 1
    assert matches[0] == expected_path
def test_resolver_dir_wildcard_as_file_url(self):
    """Same wildcard lookup, but with the pattern given as a file URL ending in ``/*/``."""
    wildcard_url = to_file_url(get_test_dir()) + '/*/'
    resolve = DefaultResolverMixin.make_best_resolver(wildcard_url)

    matches = resolve('example.warc.gz', CDXObject())

    # The expected result is compared as an absolute filesystem path.
    expected_path = os.path.abspath(
        os.path.join(get_test_dir(), 'warcs', 'example.warc.gz'))
    assert len(matches) == 1
    assert matches[0] == expected_path
def setup_class(cls):
    """Run ``init testzip`` and copy the sample ZipNum index files and a WARC into it.

    Sets ``cls.archive_dir`` and ``cls.index_dir`` to the ``archive`` and
    ``indexes`` directories under ``<root_dir>/_test_colls/testzip``.
    """
    super(TestZipnumAutoDir, cls).setup_class('config_test.yaml')

    manager(['init', 'testzip'])

    coll_dir = os.path.join(cls.root_dir, '_test_colls', 'testzip')
    cls.archive_dir = os.path.join(coll_dir, 'archive')
    cls.index_dir = os.path.join(coll_dir, 'indexes')

    # Index files first (idx, cdx.gz, loc), then the WARC they refer to.
    zip_cdx = os.path.join(get_test_dir(), 'zipcdx')
    for index_name in ('zipnum-sample.idx', 'zipnum-sample.cdx.gz', 'zipnum-sample.loc'):
        shutil.copy(os.path.join(zip_cdx, index_name), cls.index_dir)

    sample_warc = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
    shutil.copy(sample_warc, cls.archive_dir)
def test_hls_custom_max_bandwidth(self):
    """Rewrite the sample HLS playlist with ``adaptive_max_bandwidth`` set in the WARC metadata.

    Expects the Content-Type header to be passed through and the output to
    contain only the 1002000-bandwidth stream entry.
    """
    playlist_path = os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8')
    with open(playlist_path, 'rt') as fh:
        content = fh.read()

    # The bandwidth cap is passed as JSON in the WARC-JSON-Metadata header.
    warc_headers = {
        'WARC-JSON-Metadata': json.dumps({'adaptive_max_bandwidth': 2000000}),
    }

    headers, gen, is_rw = self.rewrite_record(
        {'Content-Type': 'application/x-mpegURL'},
        content,
        ts='201701oe_',
        url='http://example.com/path/master.m3u8',
        warc_headers=warc_headers)

    assert headers.headers == [('Content-Type', 'application/x-mpegURL')]

    expected = """\
#EXTM3U
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1002000,RESOLUTION=640x360,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_4.m3u8
"""

    rewritten = b''.join(gen).decode('utf-8')
    assert rewritten == expected
def test_dash_default_max(self): headers = {'Content-Type': 'application/dash+xml'} with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: content = fh.read() headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', url='http://example.com/path/manifest.mpd') assert headers.headers == [('Content-Type', 'application/dash+xml')] filtered = """\ <?xml version='1.0' encoding='UTF-8'?> <MPD xmlns="urn:mpeg:dash:schema:mpd:2011" mediaPresentationDuration="PT0H3M1.63S" minBufferTime="PT1.5S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" type="static"> <Period duration="PT0H3M1.63S" start="PT0S"> <AdaptationSet> <ContentComponent contentType="video" id="1" /> <Representation bandwidth="4190760" codecs="avc1.640028" height="1080" id="1" mimeType="video/mp4" width="1920"> <BaseURL>http://example.com/video-10.mp4</BaseURL> <SegmentBase indexRange="674-1149"> <Initialization range="0-673" /> </SegmentBase> </Representation> </AdaptationSet> <AdaptationSet> <ContentComponent contentType="audio" id="2" /> <Representation bandwidth="255236" codecs="mp4a.40.2" id="7" mimeType="audio/mp4" numChannels="2" sampleRate="44100"> <BaseURL>http://example.com/audio-2.mp4</BaseURL> <SegmentBase indexRange="592-851"> <Initialization range="0-591" /> </SegmentBase> </Representation> </AdaptationSet> </Period> </MPD>""" assert b''.join(gen).decode('utf-8') == filtered
def test_cdxj_resolve_revisit_2():
    """Resolve revisits against the no-digest cdxj sample; no original record is filled in."""
    # Resolve Revisit -- cdxj minimal -- output also json
    cdxj_source = get_test_dir() + "cdxj/example-no-digest.cdxj"
    results = cdx_ops_test_data(
        url="http://example.com/?example=1",
        sources=[cdxj_source],
        resolveRevisits=True,
    )

    expected_first = {
        "urlkey": "com,example)/?example=1",
        "timestamp": "20140103030321",
        "url": "http://example.com?example=1",
        "length": "1043",
        "filename": "example.warc.gz",
        "offset": "333",
        "orig.length": "-",
        "orig.offset": "-",
        "orig.filename": "-",
    }

    # The revisit entry keeps "-" placeholders in its orig.* fields.
    expected_second = {
        "urlkey": "com,example)/?example=1",
        "timestamp": "20140103030341",
        "url": "http://example.com?example=1",
        "length": "553",
        "filename": "example.warc.gz",
        "mime": "warc/revisit",
        "offset": "1864",
        "orig.length": "-",
        "orig.offset": "-",
        "orig.filename": "-",
    }

    assert len(results) == 2
    assert dict(results[0]) == expected_first
    assert dict(results[1]) == expected_second
def test_cdxj_resolve_revisit():
    """Resolve revisits against ``example.cdxj``; the second entry points back at the first."""
    # Resolve Revisit -- cdxj minimal -- output also json
    cdxj_source = get_test_dir() + 'cdxj/example.cdxj'
    results = cdx_ops_test_data(url='http://example.com/?example=1',
                                sources=[cdxj_source],
                                resolveRevisits=True)

    expected_first = {
        "urlkey": "com,example)/?example=1",
        "timestamp": "20140103030321",
        "url": "http://example.com?example=1",
        "length": "1043",
        "filename": "example.warc.gz",
        "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
        "offset": "333",
        "orig.length": "-",
        "orig.offset": "-",
        "orig.filename": "-"
    }

    # orig.* of the second entry carry the first entry's length/offset/filename.
    expected_second = {
        "urlkey": "com,example)/?example=1",
        "timestamp": "20140103030341",
        "url": "http://example.com?example=1",
        "filename": "example.warc.gz",
        "length": "553",
        "mime": "",
        "offset": "1864",
        "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
        "orig.length": "1043",
        "orig.offset": "333",
        "orig.filename": "example.warc.gz"
    }

    assert len(results) == 2
    assert dict(results[0]) == expected_first
    assert dict(results[1]) == expected_second
def test_local_no_head_only_title():
    """Rewrite ``sample_no_head_2.html`` and check that the wombat script tag is present."""
    sample_path = get_test_dir() + 'text_content/sample_no_head_2.html'
    status_headers, buff = get_rewritten(
        sample_path, urlrewriter, head_insert_func, 'com,example,test)/')

    # wombat insert added
    wombat_tag = '<script src="/static/__pywb/wombat.js"> </script>'
    assert wombat_tag in buff
def test_zip_prefix_load():
    """Load the ZipNum index from a temp dir, with shards located via ``shard_index_loc``."""
    tmpdir = tempfile.mkdtemp()
    try:
        # The index keeps its name; the shard file is copied under the name 'zipnum'.
        shutil.copy(test_zipnum, tmpdir)
        shard_src = get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz'
        shutil.copy(shard_src, os.path.join(tmpdir, 'zipnum'))

        config = {'shard_index_loc': dict(match='(.*)', replace=r'\1')}
        index_path = os.path.join(tmpdir, 'zipnum-sample.idx')
        server = CDXServer(index_path, config=config)

        # Test Page Count
        page_info = list(server.load_cdx(url='iana.org/',
                                         matchType='domain',
                                         showNumPages=True))
        assert len(page_info) == 1, page_info
        assert json.loads(page_info[0]) == {"blocks": 38, "pages": 4, "pageSize": 10}

        # Test simple query
        cdx_lines = list(server.load_cdx(url='iana.org/'))
        assert len(cdx_lines) == 3, cdx_lines
        assert '20140126200624' in cdx_lines[0]
        assert '20140127171238' in cdx_lines[1]
        assert 'warc/revisit' in cdx_lines[2]
    finally:
        shutil.rmtree(tmpdir)
def test_cdxj_resolve_revisit_2():
    """Resolve revisits against the no-digest cdxj sample, passed as a named source."""
    # Resolve Revisit -- cdxj minimal -- output also json
    named_sources = {'nd-file': get_test_dir() + 'cdxj/example-no-digest.cdxj'}
    results = cdx_ops_test_data(url='http://example.com/?example=1',
                                sources=named_sources,
                                resolveRevisits=True)

    expected_first = {
        "urlkey": "com,example)/?example=1",
        "timestamp": "20140103030321",
        "url": "http://example.com?example=1",
        "length": "1043",
        "filename": "example.warc.gz",
        "offset": "333",
        "orig.length": "-",
        "orig.offset": "-",
        "orig.filename": "-"
    }

    # The revisit entry keeps "-" placeholders in its orig.* fields.
    expected_second = {
        "urlkey": "com,example)/?example=1",
        "timestamp": "20140103030341",
        "url": "http://example.com?example=1",
        "length": "553",
        "filename": "example.warc.gz",
        "mime": "warc/revisit",
        "offset": "1864",
        "orig.length": "-",
        "orig.offset": "-",
        "orig.filename": "-"
    }

    assert len(results) == 2
    assert dict(results[0]) == expected_first
    assert dict(results[1]) == expected_second
def test_path_index_resolvers(self):
    """Look up three filenames in ``pathindex.txt`` and compare the resolved path lists."""
    index_file = os.path.join(get_test_dir(), 'text_content', 'pathindex.txt')
    path_index = PathIndexResolver(index_file)

    cdx = CDXObject()

    # (filename, expected paths) -- checked in this order.
    cases = (
        ('example.warc.gz', ['invalid_path', 'sample_archive/warcs/example.warc.gz']),
        ('iana.warc.gz', ['sample_archive/warcs/iana.warc.gz']),
        ('not-found.gz', []),
    )
    for filename, expected_paths in cases:
        assert list(path_index(filename, cdx)) == expected_paths
def test_local_no_head_only_title():
    """Check that rewriting ``sample_no_head_2.html`` yields the wombat script tag."""
    html_path = get_test_dir() + 'text_content/sample_no_head_2.html'
    rewritten = get_rewritten(html_path, urlrewriter, head_insert_func, 'com,example,test)/')
    status_headers, buff = rewritten

    # wombat insert added
    expected_insert = '<script src="/static/__pywb/wombat.js"> </script>'
    assert expected_insert in buff
def setup_class(cls):
    """Load the expected Link-header data and remember the original timegate-links function.

    Stores the parsed ``text_content/link_headers.yaml`` on
    ``MementoOverrideTests.link_header_data`` and keeps a reference to
    ``MementoIndexSource.get_timegate_links`` on
    ``MementoOverrideTests.orig_get_timegate_links``.
    """
    super(MementoOverrideTests, cls).setup_class()

    # Load expected link headers
    MementoOverrideTests.link_header_data = None
    with open(to_path(get_test_dir() + '/text_content/link_headers.yaml')) as fh:
        # An explicit Loader is required: without one PyYAML 5.1+ emits a
        # warning and PyYAML 6+ raises TypeError. yaml.Loader is the loader
        # that releases before 5.1 used by default, so the parsed data is
        # unchanged. The file is a local test fixture, not external input.
        MementoOverrideTests.link_header_data = yaml.load(fh, Loader=yaml.Loader)

    MementoOverrideTests.orig_get_timegate_links = MementoIndexSource.get_timegate_links
def test_local_unclosed_script():
    """Rewrite ``sample_unclosed_script.html`` and check the head insert and JS rewriting.

    Asserts that the wombat script tag directly follows ``<head>`` and that the
    ``window.location`` assignment is rewritten to ``WB_wombat_location`` with a
    rewritten URL, followed by a closing ``</script>`` tag.
    """
    status_headers, buff = get_rewritten(
        get_test_dir() + 'text_content/sample_unclosed_script.html',
        urlrewriter,
        head_insert_func,
        'com,example,test)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff, buff

    # JS location and JS link rewritten.
    # The expected text contains literal backslash-slash pairs; they are written
    # as "\\/" because "\/" is an invalid escape sequence in a normal string
    # literal (a raw string is not usable here: the "\n" must stay newlines).
    assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\\/\\/example.com/dynamic_page.html";\n}\n</script>' in buff, buff
def test_local_no_head():
    """Rewrite ``sample_no_head.html`` and check the insert, location and link rewriting."""
    sample_path = get_test_dir() + 'text_content/sample_no_head.html'
    status_headers, buff = get_rewritten(
        sample_path, urlrewriter, head_insert_func, 'com,example,test)/')

    # wombat insert added
    wombat_tag = '<script src="/static/__pywb/wombat.js"> </script>'
    assert wombat_tag in buff, buff

    # location rewritten
    rewritten_location = 'window.WB_wombat_location = "/other.html"'
    assert rewritten_location in buff, buff

    # link rewritten
    rewritten_link = '"/pywb/20131226101010/http://example.com/some/path/another.html"'
    assert rewritten_link in buff, buff
def test_local_2_link_only_rewrite():
    """Rewrite ``sample.html`` under the ``nolocation_rewrite`` key and check the output.

    Asserts that the ``/static/default/wombat.js`` head insert is absent, that
    ``window.location`` keeps its name while the URL assigned to it is
    rewritten, and that the HTML link is rewritten.
    """
    status_headers, buff = get_rewritten(
        get_test_dir() + 'text_content/sample.html',
        urlrewriter,
        head_insert_func,
        'example,example,test)/nolocation_rewrite')

    # no wombat insert
    assert '<head><script src="/static/default/wombat.js"> </script>' not in buff

    # JS location NOT rewritten, JS link rewritten.
    # Raw string: the expected text contains literal "\/" pairs, which are an
    # invalid escape sequence in a normal string literal.
    assert r'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_2_no_rewrite():
    """Rewrite ``sample.html`` under the ``norewrite`` key and check the output.

    Asserts that the wombat script tag is present, that the JS
    ``window.location`` assignment is left untouched, and that the HTML link
    is still rewritten.
    """
    status_headers, buff = get_rewritten(
        get_test_dir() + 'text_content/sample.html',
        urlrewriter,
        head_insert_func,
        'example,example,test,norewrite)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten.
    # Raw string: the expected text contains literal "\/" pairs, which are an
    # invalid escape sequence in a normal string literal.
    assert r'window.location = "http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_1():
    """Rewrite ``sample.html`` under the ``all`` key and check insert, JS and link rewriting.

    Asserts that the wombat script tag directly follows ``<head>``, that the
    JS location assignment becomes ``WB_wombat_location`` with a rewritten URL,
    and that the HTML link is rewritten.
    """
    status_headers, buff = get_rewritten(
        get_test_dir() + 'text_content/sample.html',
        urlrewriter,
        head_insert_func,
        'example,example,test,all)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff, buff

    # JS location and JS link rewritten.
    # Raw string: the expected text contains literal "\/" pairs, which are an
    # invalid escape sequence in a normal string literal.
    assert r'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_no_head_banner_only():
    """Rewrite ``sample_no_head.html`` with ``bn_urlrewriter``; only the insert is expected."""
    sample_path = get_test_dir() + 'text_content/sample_no_head.html'
    status_headers, buff = get_rewritten(
        sample_path, bn_urlrewriter, head_insert_func, 'com,example,test)/')

    # wombat insert added
    wombat_tag = '<script src="/static/__pywb/wombat.js"> </script>'
    assert wombat_tag in buff

    # location NOT rewritten
    untouched_location = 'window.location = "/other.html"'
    assert untouched_location in buff

    # link NOT rewritten
    untouched_link = '"/some/path/another.html"'
    assert untouched_link in buff
def test_local_banner_only_no_rewrite():
    """Rewrite ``sample.html`` with ``bn_urlrewriter`` and check that only the insert is added.

    Asserts that the wombat script tag directly follows ``<head>`` while the JS
    ``window.location`` assignment and the HTML link are left as they were.
    """
    status_headers, buff = get_rewritten(
        get_test_dir() + 'text_content/sample.html',
        bn_urlrewriter,
        head_insert_func,
        'com,example,test)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten.
    # Raw string: the expected text contains literal "\/" pairs, which are an
    # invalid escape sequence in a normal string literal.
    assert r'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff

    # link NOT rewritten
    assert '"/some/path/another.html"' in buff
def test_local_no_head_banner_only():
    """Rewrite ``sample_no_head.html`` with ``bn_urlrewriter``; location and link stay as-is."""
    html_path = get_test_dir() + 'text_content/sample_no_head.html'
    rewritten = get_rewritten(html_path, bn_urlrewriter, head_insert_func, 'com,example,test)/')
    status_headers, buff = rewritten

    # wombat insert added
    expected_insert = '<script src="/static/__pywb/wombat.js"> </script>'
    assert expected_insert in buff

    # location NOT rewritten
    original_location = 'window.location = "/other.html"'
    assert original_location in buff

    # link NOT rewritten
    original_link = '"another.html"'
    assert original_link in buff
def test_local_2_link_only_rewrite():
    """Check link-only rewriting of ``sample.html`` under the ``nolocation_rewrite`` key.

    The ``/static/default/wombat.js`` head insert must be absent, the JS
    ``window.location`` name must be kept while its URL value is rewritten, and
    the HTML link must be rewritten.
    """
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test)/nolocation_rewrite')

    # no wombat insert
    assert '<head><script src="/static/default/wombat.js"> </script>' not in buff

    # JS location NOT rewritten, JS link rewritten.
    # The literal is raw because it contains "\/" pairs, which are not a valid
    # escape sequence in a normal string literal.
    assert r'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_2_no_rewrite():
    """Check rewriting of ``sample.html`` under the ``norewrite`` key.

    The wombat script tag must be present, the JS ``window.location``
    assignment must be left untouched, and the HTML link must still be
    rewritten.
    """
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,norewrite)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten.
    # The literal is raw because it contains "\/" pairs, which are not a valid
    # escape sequence in a normal string literal.
    assert r'window.location = "http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_banner_only_no_rewrite():
    """Check that ``bn_urlrewriter`` adds the insert to ``sample.html`` but rewrites nothing.

    The wombat script tag must directly follow ``<head>``; the JS
    ``window.location`` assignment and the relative HTML link must be unchanged.
    """
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         bn_urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten.
    # The literal is raw because it contains "\/" pairs, which are not a valid
    # escape sequence in a normal string literal.
    assert r'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff

    # link NOT rewritten
    assert '"another.html"' in buff
def test_local_1():
    """Check full rewriting of ``sample.html`` under the ``all`` key.

    The wombat script tag must directly follow ``<head>``, the JS location
    assignment must become ``WB_wombat_location`` with a rewritten URL, and the
    HTML link must be rewritten.
    """
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,all)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff, buff

    # JS location and JS link rewritten.
    # The literal is raw because it contains "\/" pairs, which are not a valid
    # escape sequence in a normal string literal.
    assert r'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_no_head():
    """Check insert, location and link rewriting for ``sample_no_head.html``."""
    html_path = get_test_dir() + 'text_content/sample_no_head.html'
    rewritten = get_rewritten(html_path, urlrewriter, head_insert_func, 'com,example,test)/')
    status_headers, buff = rewritten

    # wombat insert added
    expected_insert = '<script src="/static/__pywb/wombat.js"> </script>'
    assert expected_insert in buff, buff

    # location rewritten
    expected_location = 'window.WB_wombat_location = "/other.html"'
    assert expected_location in buff, buff

    # link rewritten
    expected_link = '"/pywb/20131226101010/http://example.com/some/path/another.html"'
    assert expected_link in buff, buff
def test_dash_custom_max_resolution(self): headers = {'Content-Type': 'application/dash+xml'} with open( os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: content = fh.read() metadata = { 'adaptive_max_resolution': 921600, 'adaptive_max_bandwidth': 2000000 } headers, gen, is_rw = self.rewrite_record( headers, content, ts='201701oe_', url='http://example.com/path/manifest.mpd', warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)}) assert headers.headers == [('Content-Type', 'application/dash+xml')] filtered = """\ <?xml version='1.0' encoding='UTF-8'?> <MPD xmlns="urn:mpeg:dash:schema:mpd:2011" mediaPresentationDuration="PT0H3M1.63S" minBufferTime="PT1.5S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" type="static"> <Period duration="PT0H3M1.63S" start="PT0S"> <AdaptationSet> <ContentComponent contentType="video" id="1" /> <Representation bandwidth="2073921" codecs="avc1.4d401f" height="720" id="2" mimeType="video/mp4" width="1280"> <BaseURL>http://example.com/video-9.mp4</BaseURL> <SegmentBase indexRange="708-1183"> <Initialization range="0-707" /> </SegmentBase> </Representation> </AdaptationSet> <AdaptationSet> <ContentComponent contentType="audio" id="2" /> <Representation bandwidth="255236" codecs="mp4a.40.2" id="7" mimeType="audio/mp4" numChannels="2" sampleRate="44100"> <BaseURL>http://example.com/audio-2.mp4</BaseURL> <SegmentBase indexRange="592-851"> <Initialization range="0-591" /> </SegmentBase> </Representation> </AdaptationSet> </Period> </MPD>""" assert b''.join(gen).decode('utf-8') == filtered
def test_hls_default_max(self):
    """Rewrite the sample HLS playlist without metadata and check the stream that is kept.

    Expects the Content-Type header to be passed through and the output to
    contain only the 4495000-bandwidth stream entry.
    """
    playlist_path = os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8')
    with open(playlist_path, 'rt') as fh:
        content = fh.read()

    content_type = {'Content-Type': 'application/vnd.apple.mpegurl'}
    headers, gen, is_rw = self.rewrite_record(content_type, content,
                                              ts='201701oe_',
                                              url='http://example.com/path/master.m3u8')

    assert headers.headers == [('Content-Type', 'application/vnd.apple.mpegurl')]

    expected = """\
#EXTM3U
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=4495000,RESOLUTION=1920x1080,CODECS="avc1.640028, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_6.m3u8
"""

    rewritten = b''.join(gen).decode('utf-8')
    assert rewritten == expected
def test_dash_fb_in_js(self):
    """Rewrite a DASH manifest embedded in JavaScript and check the prefetched ids.

    The sample manifest is unicode-escaped, wrapped in a ``dash_manifest:"..."``
    fragment and followed by a ``dash_prefetched_representation_ids`` list of
    ``"4","5"``; after rewriting the list must read ``"1", "7"``.
    """
    manifest_path = os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd')
    with open(manifest_path, 'rt') as fh:
        escaped_manifest = fh.read().encode('unicode-escape').decode('utf-8')

    # Raw string: the leading "\n" is a literal backslash-n, not a newline.
    rep_ids = r'\n",dash_prefetched_representation_ids:["4","5"]'
    content = 'dash_manifest:"' + escaped_manifest + rep_ids

    headers, gen, is_rw = self.rewrite_record(
        {'Content-Type': 'text/javascript'},
        content,
        ts='201701js_',
        url='http://facebook.com/example/dash/manifest.mpd')

    assert headers.headers == [('Content-Type', 'text/javascript')]

    result = b''.join(gen).decode('utf-8')

    # 4, 5 representations removed, replaced with default 1, 7
    assert 'dash_prefetched_representation_ids:["1", "7"]' in result
    assert rep_ids not in result
def test_hls_custom_max_bandwidth(self):
    """Check HLS rewriting when the WARC metadata sets ``adaptive_max_bandwidth`` to 2000000.

    The Content-Type header must be unchanged and only the 1002000-bandwidth
    stream entry must remain in the playlist.
    """
    sample_path = os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8')
    with open(sample_path, 'rt') as fh:
        content = fh.read()

    # The cap travels as JSON inside the WARC-JSON-Metadata header.
    metadata_json = json.dumps({'adaptive_max_bandwidth': 2000000})

    headers, gen, is_rw = self.rewrite_record({'Content-Type': 'application/x-mpegURL'},
                                              content,
                                              ts='201701oe_',
                                              url='http://example.com/path/master.m3u8',
                                              warc_headers={'WARC-JSON-Metadata': metadata_json})

    assert headers.headers == [('Content-Type', 'application/x-mpegURL')]

    expected = """\
#EXTM3U
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1002000,RESOLUTION=640x360,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_4.m3u8
"""

    output = b''.join(gen).decode('utf-8')
    assert output == expected
def test_dash_custom_max_resolution(self): headers = {'Content-Type': 'application/dash+xml'} with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: content = fh.read() metadata = {'adaptive_max_resolution': 921600, 'adaptive_max_bandwidth': 2000000} headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', url='http://example.com/path/manifest.mpd', warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)}) assert headers.headers == [('Content-Type', 'application/dash+xml')] filtered = """\ <?xml version='1.0' encoding='UTF-8'?> <MPD xmlns="urn:mpeg:dash:schema:mpd:2011" mediaPresentationDuration="PT0H3M1.63S" minBufferTime="PT1.5S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" type="static"> <Period duration="PT0H3M1.63S" start="PT0S"> <AdaptationSet> <ContentComponent contentType="video" id="1" /> <Representation bandwidth="2073921" codecs="avc1.4d401f" height="720" id="2" mimeType="video/mp4" width="1280"> <BaseURL>http://example.com/video-9.mp4</BaseURL> <SegmentBase indexRange="708-1183"> <Initialization range="0-707" /> </SegmentBase> </Representation> </AdaptationSet> <AdaptationSet> <ContentComponent contentType="audio" id="2" /> <Representation bandwidth="255236" codecs="mp4a.40.2" id="7" mimeType="audio/mp4" numChannels="2" sampleRate="44100"> <BaseURL>http://example.com/audio-2.mp4</BaseURL> <SegmentBase indexRange="592-851"> <Initialization range="0-591" /> </SegmentBase> </Representation> </AdaptationSet> </Period> </MPD>""" assert b''.join(gen).decode('utf-8') == filtered
def test_zip_prefix_load():
    """Query a ZipNum index built via ``init_index_agg`` from files copied to a temp dir."""
    tmpdir = tempfile.mkdtemp()
    try:
        # The index keeps its name; the shard file is copied under the name 'zipnum'.
        shutil.copy(test_zipnum, tmpdir)
        shard_src = get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz'
        shutil.copy(shard_src, os.path.join(tmpdir, 'zipnum'))

        config = {
            'shard_index_loc': dict(match='(.*)', replace=r'\1'),
            'path': os.path.join(tmpdir, 'zipnum-sample.idx'),
            'type': 'zipnum',
        }
        server = init_index_agg({'zip': config})

        # Test Page Count
        cdx_iter, err = server(dict(url='iana.org/', matchType='domain', showNumPages=True))
        page_info = list(cdx_iter)
        assert len(page_info) == 1, page_info
        assert page_info[0] == {"blocks": 38, "pages": 4, "pageSize": 10}

        # Test simple query
        cdx_iter, err = server(dict(url='iana.org/'))
        entries = list(cdx_iter)
        assert len(entries) == 3, entries
        assert '20140126200624' == entries[0]['timestamp']
        assert '20140127171238' == entries[1]['timestamp']
        assert 'warc/revisit' == entries[2]['mime']
    finally:
        shutil.rmtree(tmpdir)
def test_zip_prefix_load():
    """Copy the ZipNum sample into a temp dir and query it through ``init_index_agg``."""
    tmpdir = tempfile.mkdtemp()
    try:
        shutil.copy(test_zipnum, tmpdir)

        # The shard file is stored under the plain name 'zipnum' in the temp dir.
        shard_dest = os.path.join(tmpdir, 'zipnum')
        shutil.copy(get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz', shard_dest)

        config = dict(
            shard_index_loc=dict(match='(.*)', replace=r'\1'),
            path=os.path.join(tmpdir, 'zipnum-sample.idx'),
            type='zipnum')
        server = init_index_agg({'zip': config})

        # Test Page Count
        page_query = dict(url='iana.org/', matchType='domain', showNumPages=True)
        cdx_iter, err = server(page_query)
        pages = list(cdx_iter)
        assert len(pages) == 1, pages
        assert pages[0] == {"blocks": 38, "pages": 4, "pageSize": 10}

        # Test simple query
        cdx_iter, err = server(dict(url='iana.org/'))
        found = list(cdx_iter)
        assert len(found) == 3, found
        assert '20140126200624' == found[0]['timestamp']
        assert '20140127171238' == found[1]['timestamp']
        assert 'warc/revisit' == found[2]['mime']
    finally:
        shutil.rmtree(tmpdir)
def test_zip_prefix_load():
    """Copy the ZipNum sample into a temp dir and query it through ``CDXServer``."""
    tmpdir = tempfile.mkdtemp()
    try:
        shutil.copy(test_zipnum, tmpdir)

        # The shard file is stored under the plain name 'zipnum' in the temp dir.
        shard_dest = os.path.join(tmpdir, 'zipnum')
        shutil.copy(get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz', shard_dest)

        config = dict(shard_index_loc=dict(match='(.*)', replace=r'\1'))
        server = CDXServer(os.path.join(tmpdir, 'zipnum-sample.idx'), config=config)

        # Test Page Count
        pages = list(server.load_cdx(url='iana.org/',
                                     matchType='domain',
                                     showNumPages=True))
        assert len(pages) == 1, pages
        expected_pages = {"blocks": 38, "pages": 4, "pageSize": 10}
        assert json.loads(pages[0]) == expected_pages

        # Test simple query
        found = list(server.load_cdx(url='iana.org/'))
        assert len(found) == 3, found
        assert '20140126200624' in found[0]
        assert '20140127171238' in found[1]
        assert 'warc/revisit' in found[2]
    finally:
        shutil.rmtree(tmpdir)
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True) org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - """ # ================================================================= from pywb.cdx.cdxserver import CDXServer import os import sys import six from pywb import get_test_dir test_cdx_dir = get_test_dir() + "cdx/" def cdx_ops_test_data(url, sources=[test_cdx_dir + "iana.cdx"], **kwparams): kwparams["url"] = url if not "output" in kwparams: kwparams["output"] = "cdxobject" server = CDXServer(sources) results = server.load_cdx(**kwparams) return list(results) def cdx_ops_test(*args, **kwargs): results = cdx_ops_test_data(*args, **kwargs)
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} """ #================================================================= from pywb.cdx.cdxserver import CDXServer import os import sys from pywb import get_test_dir test_cdx_dir = get_test_dir() + 'cdx/' def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): kwparams['url'] = url if not 'output' in kwparams: kwparams['output'] = 'cdxobject' fields = kwparams.get('fields') if fields: fields = fields.split(',') server = CDXServer(sources) results = server.load_cdx(**kwparams) for x in results: if not isinstance(x, str):
>>> print_binsearch_results_range('org,iana)/protocols', 'z-', iter_range) org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz """ #================================================================= import os from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range from pywb import get_test_dir #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' test_cdx_dir = get_test_dir() + 'cdx/' def print_binsearch_results(key, iter_func): with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx: for line in iter_func(cdx, key): print line def print_binsearch_results_range(key, end_key, iter_func, prev_size=0): with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx: for line in iter_func(cdx, key, end_key, prev_size=prev_size): print line if __name__ == "__main__":
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename from pywb.cdx.cdxobject import CDXObject from io import BytesIO import sys import os import shutil import tempfile from pytest import raises TEST_CDX_DIR = get_test_dir() + 'cdx/' TEST_WARC_DIR = get_test_dir() + 'warcs/' def read_fully(cdx): with open(TEST_CDX_DIR + cdx, 'rb') as fh: curr = BytesIO() while True: b = fh.read() if not b: break curr.write(b) return curr.getvalue() def cdx_index(warc, **options): buff = BytesIO()
test_3: http://cdxserver.example.com/cdx test_4: !!python/object:pywb.cdx.cdxsource.RemoteCDXSource {{ remote_url: 'http://cdxserver.example.com/cdx', cookie: custom_token=value, remote_processing: true, }} test_5: {0}cdx/example.cdx test_6: index_paths: invalid://abc """.format(get_test_dir()) def test_cdxserver_config(): config = yaml.load(yaml_config) cdxserver = create_cdx_server(config.get('test_1')) assert(isinstance(cdxserver, CDXServer)) sources = cdxserver.sources assert len(sources) == 5 assert type(sources[0]) == CDXFile assert sources[0].filename.endswith('example.cdx') # remote source with no remote processing assert type(sources[1]) == RemoteCDXSource assert sources[1].remote_url == 'http://cdxserver.example.com/cdx' assert sources[1].remote_processing == False
from pywb import get_test_dir from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename from pywb.cdx.cdxobject import CDXObject from io import BytesIO import sys import os import shutil import tempfile from pytest import raises TEST_CDX_DIR = get_test_dir() + 'cdx/' TEST_WARC_DIR = get_test_dir() + 'warcs/' def read_fully(cdx): with open(TEST_CDX_DIR + cdx, 'rb') as fh: curr = BytesIO() while True: b = fh.read() if not b: break curr.write(b) return curr.getvalue() def cdx_index(warc, **options):
""" from pywb import get_test_dir from pywb.warcserver.index.test.test_cdxops import cdx_ops_test, cdx_ops_test_data from pywb.warcserver.warcserver import init_index_agg import shutil import tempfile import os import json import pytest test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' def zip_ops_test_data(url, **kwargs): sources = {'zip': test_zipnum} res = cdx_ops_test_data(url, sources, **kwargs) if res: return res[0] def zip_ops_test(url, **kwargs): sources = {'zip': test_zipnum} cdx_ops_test(url, sources, **kwargs) def zip_test_err(url, **kwargs): sources = {'zip': get_test_dir() + 'zipcdx/zipnum-bad.idx'} cdx_ops_test(url, sources, **kwargs)
def zip_test_err(url, **kwargs):
    """Run ``cdx_ops_test`` for ``url`` against the ``zipnum-bad.idx`` sample index.

    Extra keyword arguments are forwarded to ``cdx_ops_test`` unchanged.
    """
    bad_index_path = get_test_dir() + 'zipcdx/zipnum-bad.idx'
    cdx_ops_test(url, {'zip': bad_index_path}, **kwargs)
def _get_sample_warc(self, name):
    """Return the path of ``name`` inside the ``warcs`` directory of the test dir."""
    warc_dir = os.path.join(get_test_dir(), 'warcs')
    return os.path.join(warc_dir, name)
def test_wombat_top():
    """Rewrite ``toptest.js`` and check that ``WB_wombat_top!==window`` appears in the output."""
    # Uses the local copy in text_content; an earlier revision fetched
    # https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js
    script_path = get_test_dir() + 'text_content/toptest.js'
    status_headers, buff = get_rewritten(script_path, urlrewriter)

    expected_fragment = 'WB_wombat_top!==window'
    assert expected_fragment in buff
""" import os import sys import pprint from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed from pywb.warc.pathresolvers import make_best_resolvers from pywb.warc.resolvingloader import ResolvingLoader from pywb.cdx.cdxobject import CDXObject from pywb import get_test_dir #============================================================================== test_warc_dir = get_test_dir() + 'warcs/' URL_AGNOSTIC_ORIG_CDX = 'org,iana,example)/ 20130702195402 http://example.iana.org/ \ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ 1001 353 example-url-agnostic-orig.warc.gz' URL_AGNOSTIC_REVISIT_CDX = 'com,example)/ 20130729195151 http://[email protected]/ \ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ 591 355 example-url-agnostic-revisit.warc.gz' URL_AGNOSTIC_REVISIT_NO_DIGEST_CDX = 'com,example)/ 20130729195151 http://[email protected]/ \ warc/revisit - - - - \ 591 355 example-url-agnostic-revisit.warc.gz' BAD_ORIG_CDX = 'org,iana,example)/ 20130702195401 http://example.iana.org/ \
def zip_test_err(url, **kwargs):
    """Run ``cdx_ops_test`` for ``url`` with the ``zipnum-bad.idx`` path as its sources.

    Extra keyword arguments are forwarded to ``cdx_ops_test`` unchanged.
    """
    bad_index_path = get_test_dir() + 'zipcdx/zipnum-bad.idx'
    cdx_ops_test(url, bad_index_path, **kwargs)
def test_wombat_top():
    """Check that rewriting ``toptest.js`` produces ``WB_wombat_top!==window``."""
    # The local text_content copy is used here; an earlier revision fetched
    # https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js
    js_path = get_test_dir() + 'text_content/toptest.js'
    rewritten = get_rewritten(js_path, urlrewriter)
    status_headers, buff = rewritten

    assert 'WB_wombat_top!==window' in buff
org,iana)/domains/int 20140126201239 zipnum 8884 353 36 org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 >>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz """ from test_cdxops import cdx_ops_test from pywb import get_test_dir test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' def zip_ops_test(url, **kwargs): sources = test_zipnum cdx_ops_test(url, sources, **kwargs) if __name__ == "__main__": import doctest doctest.testmod()
# ============================================================================
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
    """Serialise each cdx entry to JSON restricted to ``fields`` and parse it back.

    Returns a new list holding one parsed JSON value per entry of ``cdxlist``.
    """
    # NOTE(review): the default ``fields`` list is shared between calls; here it
    # is only handed to ``cdx.to_json``.
    parsed = []
    for cdx in cdxlist:
        parsed.append(json.loads(cdx.to_json(fields)))
    return parsed


def key_ts_res(cdxlist, extra='filename'):
    """Return one ``urlkey timestamp <extra>`` line per cdx entry, joined by newlines."""
    lines = []
    for cdx in cdxlist:
        lines.append(cdx['urlkey'] + ' ' + cdx['timestamp'] + ' ' + cdx[extra])
    return '\n'.join(lines)


def to_path(path):
    """Replace ``/`` in ``path`` with ``os.path.sep`` when the two differ."""
    if os.path.sep == '/':
        return path
    return path.replace('/', os.path.sep)


# ============================================================================
# Sample data directories, with separators converted by to_path().
TEST_CDX_PATH = to_path(get_test_dir() + '/cdxj/')
TEST_WARC_PATH = to_path(get_test_dir() + '/warcs/')


# ============================================================================
class BaseTestClass(object):
    """Base class whose class-level setup and teardown hooks do nothing."""

    @classmethod
    def setup_class(cls):
        """Class-level setup hook; does nothing here."""
        pass

    @classmethod
    def teardown_class(cls):
        """Class-level teardown hook; does nothing here."""
        pass


# ============================================================================