def test_zip_prefix_load():
    """Load a zipnum index from a scratch directory and query it.

    The index file (``test_zipnum``) and the sample ``.cdx.gz`` shard are
    copied into a fresh temporary directory, a ``CDXServer`` is pointed at
    the copied ``zipnum-sample.idx``, and two queries are checked: a page
    count query and a plain url query. The temporary directory is removed
    whether or not the assertions pass.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        shard_source = get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz'
        shutil.copy(test_zipnum, tmpdir)
        shutil.copy(shard_source, os.path.join(tmpdir, 'zipnum'))

        # identity match/replace rule for the shard index location
        config = {'shard_index_loc': {'match': '(.*)', 'replace': r'\1'}}
        index_path = os.path.join(tmpdir, 'zipnum-sample.idx')
        server = CDXServer(index_path, config=config)

        # Test Page Count
        page_info = list(server.load_cdx(url='iana.org/',
                                         matchType='domain',
                                         showNumPages=True))
        assert len(page_info) == 1, page_info
        assert json.loads(page_info[0]) == {"blocks": 38,
                                            "pages": 4,
                                            "pageSize": 10}

        # Test simple query
        matches = list(server.load_cdx(url='iana.org/'))
        assert len(matches) == 3, matches
        expected_fragments = ('20140126200624',
                              '20140127171238',
                              'warc/revisit')
        for fragment, line in zip(expected_fragments, matches):
            assert fragment in line
    finally:
        shutil.rmtree(tmpdir)
def cdx_ops_test_data(url, sources=[test_cdx_dir + "iana.cdx"], **kwparams):
    """Run a CDX query and return the results as a list.

    :param url: value stored under the ``url`` key of the query params.
    :param sources: passed unchanged to ``CDXServer``.
    :param kwparams: extra query params forwarded to ``load_cdx``;
        ``output`` is set to ``"cdxobject"`` only when the caller did not
        supply it.
    :return: ``list`` built from whatever ``load_cdx`` yields.
    """
    kwparams["url"] = url
    # keep a caller-provided output format, otherwise use cdxobject
    kwparams.setdefault("output", "cdxobject")

    cdx_server = CDXServer(sources)
    return list(cdx_server.load_cdx(**kwparams))
def test2_fuzzy_no_match_3():
    """Fuzzy query for a url that hits a rule prefix but not its regex.

    ``pywb.cdx.cdxsource.urlopen`` is patched with ``mock_urlopen`` for the
    duration of the test, and the ``load_cdx`` call is wrapped in
    ``raises(NotFoundException)``.
    """
    # special fuzzy rule, matches prefix test.example.example.,
    # but doesn't match rule regex
    query = {'url': 'http://test.example.example/', 'allowFuzzy': True}

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def cdx_ops_test_data(url, sources=[test_cdx_dir + 'iana.cdx'], **kwparams):
    """Query a ``CDXServer`` built from *sources* and collect the results.

    :param url: stored under the ``url`` key of the query params.
    :param sources: handed to ``CDXServer`` as-is.
    :param kwparams: additional query params for ``load_cdx``; if no
        ``output`` key is present it is filled in with ``'cdxobject'``.
    :return: a ``list`` of the items produced by ``load_cdx``.
    """
    kwparams['url'] = url
    # only fill in the output format when the caller left it unspecified
    kwparams.setdefault('output', 'cdxobject')

    return list(CDXServer(sources).load_cdx(**kwparams))
def test2_fuzzy_no_match_3():
    """Fuzzy query for a url that hits a rule prefix but not its regex.

    ``pywb.cdx.cdxsource.urllib2.urlopen`` is patched with ``mock_urlopen``
    while the server is built and queried; the ``load_cdx`` call is wrapped
    in ``raises(NotFoundException)``.
    """
    # special fuzzy rule, matches prefix test.example.example.,
    # but doesn't match rule regex
    query = {'url': 'http://test.example.example/', 'allowFuzzy': True}

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def test_fuzzy_no_match_1():
    """Query a url with neither an exact nor a fuzzy match.

    ``pywb.cdx.cdxsource.urllib2.urlopen`` is patched with ``mock_urlopen``
    and the ``load_cdx`` call is wrapped in ``raises(NotFoundException)``.
    """
    # no match, no fuzzy
    query = {
        'url': 'http://notfound.example.com/',
        'output': 'cdxobject',
        'reverse': True,
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def test_match():
    """Run ``assert_cdx_match`` against three differently built servers."""
    # Local CDX Server
    local_server = CDXServer([TEST_CDX_DIR])
    assert_cdx_match(local_server)

    # Remote CDX Source, Local Filtering
    remote_source_server = CDXServer(CDX_SERVER_URL)
    assert_cdx_match(remote_source_server)

    # Remote CDX Query (Remote Filtering)
    remote_query_server = RemoteCDXServer(CDX_SERVER_URL)
    assert_cdx_match(remote_query_server)
def test_fuzzy_no_match_1():
    """Query a url with neither an exact nor a fuzzy match.

    ``pywb.cdx.cdxsource.urlopen`` is patched with ``mock_urlopen`` and the
    ``load_cdx`` call is wrapped in ``raises(NotFoundException)``.
    """
    # no match, no fuzzy
    query = {
        'url': 'http://notfound.example.com/',
        'output': 'cdxobject',
        'reverse': True,
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def test_fuzzy_no_match_2():
    """Query a url covered by a fuzzy rule that still finds nothing.

    ``pywb.cdx.cdxsource.urllib2.urlopen`` is patched with ``mock_urlopen``
    and the ``load_cdx`` call is wrapped in ``raises(NotFoundException)``.
    """
    # fuzzy rule, but no actual match
    query = {
        'url': 'http://notfound.example.com/?_=1234',
        'closest': '2014',
        'reverse': True,
        'output': 'cdxobject',
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def test_fuzzy_no_match_2():
    """Query a url covered by a fuzzy rule that still finds nothing.

    ``pywb.cdx.cdxsource.urlopen`` is patched with ``mock_urlopen`` and the
    ``load_cdx`` call is wrapped in ``raises(NotFoundException)``.
    """
    # fuzzy rule, but no actual match
    query = {
        'url': 'http://notfound.example.com/?_=1234',
        'closest': '2014',
        'reverse': True,
        'output': 'cdxobject',
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def cdx_ops_test(url, sources=[test_cdx_dir + 'iana.cdx'], **kwparams):
    """Run a CDX query and write each result to stdout as text.

    :param url: stored under the ``url`` key of the query params.
    :param sources: passed unchanged to ``CDXServer``.
    :param kwparams: extra query params forwarded to ``load_cdx``.
        ``output`` is always overwritten with ``'cdxobject'``. A truthy
        ``fields`` value is split on commas and handed to ``to_text`` for
        each result (the unsplit value is still forwarded to ``load_cdx``).
    """
    kwparams.update(url=url, output='cdxobject')

    field_spec = kwparams.get('fields')
    # a falsy spec (missing or empty) is passed to to_text() untouched
    field_names = field_spec.split(',') if field_spec else field_spec

    cdx_server = CDXServer(sources)
    for cdx in cdx_server.load_cdx(**kwparams):
        # tabs in the text form are replaced by single spaces
        sys.stdout.write(cdx.to_text(field_names).replace('\t', ' '))
def lookup(index, domain):
    """Look up *domain* in the given Common Crawl index collection.

    A ``CDXServer`` is created for
    ``collections/<index>/indexes/cluster.idx`` and queried with the url
    pattern ``<domain>/*``.

    :param index: collection name substituted into the cluster index path.
    :param domain: domain substituted into the ``<domain>/*`` query url.
    :return: the result of ``CDXServer.load_cdx``; an empty list when
        ``NotFoundException`` is raised, in which case a message is also
        passed to ``log``.
    """
    config = {
        'archive_paths': 'https://s3.amazonaws.com/commoncrawl',
        'enable_cdx_api': '-index',
        'enable_memento': True,
        'framed_replay': False,
        'max_blocks': 5,
        # rewrite a local collections/<name>/ prefix to its S3 location
        'shard_index_loc': {
            'match': '.*(collections/[^/]+/)',
            'replace': 'https://s3.amazonaws.com/commoncrawl/cc-index/\\1',
        },
    }

    try:
        server = CDXServer(
            'collections/{}/indexes/cluster.idx'.format(index),
            config=config)
        return server.load_cdx(url='{}/*'.format(domain))
    except NotFoundException:
        log('domain not found: {}'.format(domain))
        return []
def test_fuzzy_match():
    """Run ``assert_cdx_fuzzy_match`` against three differently built servers.

    Every server is created with ``ds_rules_file=DEFAULT_RULES_FILE``.
    """
    # Local CDX Server
    local_server = CDXServer([TEST_CDX_DIR],
                             ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(local_server)

    # Remote CDX Source, Local Filtering
    # two calls to remote, first exact with 404,
    # then fuzzy with 200
    remote_source_server = CDXServer(CDX_SERVER_URL,
                                     ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(remote_source_server, mock_urlopen_fuzzy)

    # Remote CDX Query (Remote Filtering)
    # fuzzy match handled on remote, single response
    remote_query_server = RemoteCDXServer(CDX_SERVER_URL,
                                          ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(remote_query_server)
def cdx_ops_test(url, sources=[test_cdx_dir + 'iana.cdx'], **kwparams):
    """Run a CDX query and write each result to stdout.

    :param url: stored under the ``url`` key of the query params.
    :param sources: passed unchanged to ``CDXServer``.
    :param kwparams: extra query params forwarded to ``load_cdx``.
        ``output`` is set to ``'cdxobject'`` only when absent. A truthy
        ``fields`` value is split on commas and handed to ``to_text`` for
        non-string results (the unsplit value is still forwarded to
        ``load_cdx``).
    """
    kwparams['url'] = url
    kwparams.setdefault('output', 'cdxobject')

    field_spec = kwparams.get('fields')
    # a falsy spec (missing or empty) is passed to to_text() untouched
    field_names = field_spec.split(',') if field_spec else field_spec

    cdx_server = CDXServer(sources)
    for item in cdx_server.load_cdx(**kwparams):
        if isinstance(item, str):
            # string results are written verbatim
            text = item
        else:
            # tabs in the text form are replaced by single spaces
            text = item.to_text(field_names).replace('\t', ' ')
        sys.stdout.write(text)
def init_redis_server_key_file():
    """Build a ``CDXServer`` backed by a single ``RedisCDXSource``.

    Each entry of ``test_cdx_dir`` whose name ends in ``.cdx`` is passed to
    ``load_cdx_into_redis`` together with the source and its ``cdx_key``.

    :return: ``CDXServer`` constructed from a one-element list holding the
        redis source.
    """
    redis_source = RedisCDXSource('redis://127.0.0.1:6379/0/key')

    for filename in os.listdir(test_cdx_dir):
        if not filename.endswith('.cdx'):
            continue
        load_cdx_into_redis(redis_source, filename, redis_source.cdx_key)

    return CDXServer([redis_source])
def test_zip_prefix_load():
    """Query a zipnum index that has been copied to a temporary directory.

    ``test_zipnum`` and the sample ``.cdx.gz`` shard are copied into a new
    temporary directory and a ``CDXServer`` is created for the copied
    ``zipnum-sample.idx``. A page count query and a plain url query are then
    checked. The directory is deleted in the ``finally`` clause.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        shutil.copy(test_zipnum, tmpdir)
        shard_target = os.path.join(tmpdir, 'zipnum')
        shutil.copy(get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz',
                    shard_target)

        # identity match/replace rule for the shard index location
        config = {'shard_index_loc': {'match': '(.*)', 'replace': r'\1'}}
        server = CDXServer(os.path.join(tmpdir, 'zipnum-sample.idx'),
                           config=config)

        # Test Page Count
        expected_pages = {"blocks": 38, "pages": 4, "pageSize": 10}
        page_results = list(server.load_cdx(url='iana.org/',
                                            matchType='domain',
                                            showNumPages=True))
        assert len(page_results) == 1, page_results
        assert json.loads(page_results[0]) == expected_pages

        # Test simple query
        query_results = list(server.load_cdx(url='iana.org/'))
        assert len(query_results) == 3, query_results
        first, second, third = query_results
        assert '20140126200624' in first
        assert '20140127171238' in second
        assert 'warc/revisit' in third
    finally:
        shutil.rmtree(tmpdir)
def test_err_404():
    """Check ``assert_404`` against a local server, then via ``assert_error``.

    The local call is wrapped in ``raises(NotFoundException)``; afterwards
    ``assert_error`` is invoked with ``assert_404`` and
    ``NotFoundException``.
    """
    # Test local for consistency
    local_sources = [TEST_CDX_DIR]
    with raises(NotFoundException):
        assert_404(CDXServer(local_sources))

    assert_error(assert_404, NotFoundException)
def assert_error(func, exception):
    """Call *func* on two remote-url servers inside ``raises(exception)``.

    :param func: callable invoked with a server instance as its only
        argument.
    :param exception: exception type handed to ``raises`` for each call.
    """
    # first a CDXServer, then a RemoteCDXServer, both built from the
    # same CDX_SERVER_URL
    for server_cls in (CDXServer, RemoteCDXServer):
        with raises(exception):
            func(server_cls(CDX_SERVER_URL))