Exemple #1
0
def test_zip_prefix_load():
    """Query a zipnum index copied into a scratch directory.

    Copies ``test_zipnum`` and the gzipped sample shard into a temporary
    directory, builds a ``CDXServer`` with an identity ``shard_index_loc``
    rule, then checks the page-count response and a plain URL query.
    The scratch directory is removed even when an assertion fails.
    """
    scratch = tempfile.mkdtemp()
    try:
        shutil.copy(test_zipnum, scratch)
        shard_src = get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz'
        shard_dst = os.path.join(scratch, 'zipnum')
        shutil.copy(shard_src, shard_dst)

        # Identity rewrite: whatever location is matched is used unchanged.
        config = {'shard_index_loc': {'match': '(.*)', 'replace': r'\1'}}
        index_path = os.path.join(scratch, 'zipnum-sample.idx')
        server = CDXServer(index_path, config=config)

        # Test Page Count
        page_info = list(server.load_cdx(url='iana.org/',
                                         matchType='domain',
                                         showNumPages=True))
        assert len(page_info) == 1, page_info
        expected_pages = {"blocks": 38, "pages": 4, "pageSize": 10}
        assert json.loads(page_info[0]) == expected_pages

        # Test simple query
        rows = list(server.load_cdx(url='iana.org/'))
        assert len(rows) == 3, rows
        markers = ('20140126200624', '20140127171238', 'warc/revisit')
        for marker, row in zip(markers, rows):
            assert marker in row

    finally:
        shutil.rmtree(scratch)
Exemple #2
0
def cdx_ops_test_data(url, sources=None, **kwparams):
    """Run a CDX query and return all of its results as a list.

    Args:
        url: URL to query; stored under the ``url`` key of the query params.
        sources: Sources handed to ``CDXServer``. When ``None``, a fresh
            one-element list pointing at ``iana.cdx`` under ``test_cdx_dir``
            is used.
        **kwparams: Extra query parameters forwarded to ``load_cdx``.
            ``output`` is set to ``"cdxobject"`` only if the caller did not
            supply it.

    Returns:
        list: Every item produced by ``server.load_cdx``.
    """
    # Build the default per call so no list object is shared between calls.
    if sources is None:
        sources = [test_cdx_dir + "iana.cdx"]

    kwparams["url"] = url
    kwparams.setdefault("output", "cdxobject")

    server = CDXServer(sources)
    return list(server.load_cdx(**kwparams))
Exemple #3
0
def test2_fuzzy_no_match_3():
    """Fuzzy lookup that hits the special rule's prefix but not its regex.

    The query is expected to raise ``NotFoundException``.
    """
    # special fuzzy rule: prefix test.example.example. matches,
    # but the rule regex does not
    query = {'url': 'http://test.example.example/', 'allowFuzzy': True}

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
Exemple #4
0
def cdx_ops_test_data(url, sources=None, **kwparams):
    """Run a CDX query and return all of its results as a list.

    Args:
        url: URL to query; stored under the ``url`` key of the query params.
        sources: Sources handed to ``CDXServer``. When ``None``, a fresh
            one-element list pointing at ``iana.cdx`` under ``test_cdx_dir``
            is used.
        **kwparams: Extra query parameters forwarded to ``load_cdx``.
            ``output`` is set to ``'cdxobject'`` only if the caller did not
            supply it.

    Returns:
        list: Every item produced by ``server.load_cdx``.
    """
    # Build the default per call so no list object is shared between calls.
    if sources is None:
        sources = [test_cdx_dir + 'iana.cdx']

    kwparams['url'] = url
    kwparams.setdefault('output', 'cdxobject')

    server = CDXServer(sources)
    return list(server.load_cdx(**kwparams))
Exemple #5
0
def test2_fuzzy_no_match_3():
    """Fuzzy lookup that hits the special rule's prefix but not its regex.

    The query is expected to raise ``NotFoundException``.
    """
    # special fuzzy rule: prefix test.example.example. matches,
    # but the rule regex does not
    query = {'url': 'http://test.example.example/', 'allowFuzzy': True}

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
Exemple #6
0
def test_fuzzy_no_match_1():
    """A URL with no exact match and no fuzzy match raises ``NotFoundException``."""
    # no match, no fuzzy
    query = {
        'url': 'http://notfound.example.com/',
        'output': 'cdxobject',
        'reverse': True,
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
Exemple #7
0
def test_match():
    """Run ``assert_cdx_match`` against each of the three server setups in turn."""
    # Local CDX Server
    local_server = CDXServer([TEST_CDX_DIR])
    assert_cdx_match(local_server)

    # Remote CDX Source, Local Filtering
    remote_source_server = CDXServer(CDX_SERVER_URL)
    assert_cdx_match(remote_source_server)

    # Remote CDX Query (Remote Filtering)
    remote_query_server = RemoteCDXServer(CDX_SERVER_URL)
    assert_cdx_match(remote_query_server)
Exemple #8
0
def test_fuzzy_no_match_1():
    """A URL with no exact match and no fuzzy match raises ``NotFoundException``."""
    # no match, no fuzzy
    query = {
        'url': 'http://notfound.example.com/',
        'output': 'cdxobject',
        'reverse': True,
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
Exemple #9
0
def test_fuzzy_no_match_2():
    """A URL covered by a fuzzy rule but with no actual match raises ``NotFoundException``."""
    # fuzzy rule, but no actual match
    query = {
        'url': 'http://notfound.example.com/?_=1234',
        'closest': '2014',
        'reverse': True,
        'output': 'cdxobject',
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
Exemple #10
0
def test_fuzzy_no_match_2():
    """A URL covered by a fuzzy rule but with no actual match raises ``NotFoundException``."""
    # fuzzy rule, but no actual match
    query = {
        'url': 'http://notfound.example.com/?_=1234',
        'closest': '2014',
        'reverse': True,
        'output': 'cdxobject',
        'allowFuzzy': True,
    }

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
Exemple #11
0
def cdx_ops_test(url, sources=None, **kwparams):
    """Run a CDX query and write each result to stdout as text.

    Args:
        url: URL to query; stored under the ``url`` key of the query params.
        sources: Sources handed to ``CDXServer``. When ``None``, a fresh
            one-element list pointing at ``iana.cdx`` under ``test_cdx_dir``
            is used.
        **kwparams: Extra query parameters forwarded to ``load_cdx``.
            ``output`` is always overwritten with ``'cdxobject'``. A
            ``fields`` value, if present and non-empty, is split on commas
            and passed to each result's ``to_text``.

    Returns:
        None. Each result is written to ``sys.stdout`` with tabs replaced
        by four spaces.
    """
    # Build the default per call so no list object is shared between calls.
    if sources is None:
        sources = [test_cdx_dir + 'iana.cdx']

    kwparams['url'] = url
    kwparams['output'] = 'cdxobject'

    fields = kwparams.get('fields')
    if fields:
        fields = fields.split(',')

    server = CDXServer(sources)
    for cdx in server.load_cdx(**kwparams):
        line = cdx.to_text(fields).replace('\t', '    ')
        sys.stdout.write(line)
Exemple #12
0
def lookup(index, domain):
    """Look up ``domain`` in the cluster index of the collection ``index``.

    Args:
        index: Collection name, substituted into
            ``collections/{}/indexes/cluster.idx``.
        domain: Domain to query; the query URL is ``<domain>/*``.

    Returns:
        The value returned by ``CDXServer.load_cdx`` for the query, or an
        empty list when ``NotFoundException`` is raised (the miss is logged).
    """
    config = {
        # Scheme separator must be '://' for this to be a well-formed URL.
        'archive_paths': 'https://s3.amazonaws.com/commoncrawl',
        'enable_cdx_api': '-index',
        'enable_memento': True,
        'framed_replay': False,
        'max_blocks': 5,
        'shard_index_loc': {
            'match': '.*(collections/[^/]+/)',
            'replace': 'https://s3.amazonaws.com/commoncrawl/cc-index/\\1',
        },
    }
    try:
        index_path = 'collections/{}/indexes/cluster.idx'.format(index)
        server = CDXServer(index_path, config=config)
        return server.load_cdx(url='{}/*'.format(domain))
    except NotFoundException:
        log('domain not found: {}'.format(domain))
        return []
Exemple #13
0
def test_fuzzy_match():
    """Run ``assert_cdx_fuzzy_match`` against each of the three server setups in turn."""
    # Local CDX Server
    local_server = CDXServer([TEST_CDX_DIR],
                             ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(local_server)

    # Remote CDX Source, Local Filtering
    # two calls to remote, first exact with 404,
    # then fuzzy with 200
    remote_source_server = CDXServer(CDX_SERVER_URL,
                                     ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(remote_source_server, mock_urlopen_fuzzy)

    # Remote CDX Query (Remote Filtering)
    # fuzzy match handled on remote, single response
    remote_query_server = RemoteCDXServer(CDX_SERVER_URL,
                                          ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(remote_query_server)
Exemple #14
0
def cdx_ops_test(url, sources=None, **kwparams):
    """Run a CDX query and write each result to stdout.

    Args:
        url: URL to query; stored under the ``url`` key of the query params.
        sources: Sources handed to ``CDXServer``. When ``None``, a fresh
            one-element list pointing at ``iana.cdx`` under ``test_cdx_dir``
            is used.
        **kwparams: Extra query parameters forwarded to ``load_cdx``.
            ``output`` is set to ``'cdxobject'`` only if the caller did not
            supply it. A ``fields`` value, if present and non-empty, is split
            on commas and passed to ``to_text`` for non-string results.

    Returns:
        None. ``str`` results are written to ``sys.stdout`` unchanged; any
        other result is rendered with ``to_text(fields)`` and has its tabs
        replaced by four spaces first.
    """
    # Build the default per call so no list object is shared between calls.
    if sources is None:
        sources = [test_cdx_dir + 'iana.cdx']

    kwparams['url'] = url
    kwparams.setdefault('output', 'cdxobject')

    fields = kwparams.get('fields')
    if fields:
        fields = fields.split(',')

    server = CDXServer(sources)
    for item in server.load_cdx(**kwparams):
        if isinstance(item, str):
            line = item
        else:
            line = item.to_text(fields).replace('\t', '    ')
        sys.stdout.write(line)
Exemple #15
0
def init_redis_server_key_file():
    """Return a ``CDXServer`` wrapping a Redis source fed from ``test_cdx_dir``.

    Each directory entry whose name ends in ``.cdx`` is passed (as the bare
    entry name) to ``load_cdx_into_redis`` along with the source and its
    ``cdx_key``.
    """
    redis_source = RedisCDXSource('redis://127.0.0.1:6379/0/key')

    cdx_names = [name for name in os.listdir(test_cdx_dir)
                 if name.endswith('.cdx')]
    for name in cdx_names:
        load_cdx_into_redis(redis_source, name, redis_source.cdx_key)

    return CDXServer([redis_source])
Exemple #16
0
def test_zip_prefix_load():
    """Query a zipnum index copied into a scratch directory.

    Copies ``test_zipnum`` and the gzipped sample shard into a temporary
    directory, builds a ``CDXServer`` with an identity ``shard_index_loc``
    rule, then checks the page-count response and a plain URL query.
    The scratch directory is removed even when an assertion fails.
    """
    workdir = tempfile.mkdtemp()
    try:
        shutil.copy(test_zipnum, workdir)
        shard_source = get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz'
        shard_target = os.path.join(workdir, 'zipnum')
        shutil.copy(shard_source, shard_target)

        # Identity rewrite: whatever location is matched is used unchanged.
        identity_rule = {'match': '(.*)', 'replace': r'\1'}
        server = CDXServer(os.path.join(workdir, 'zipnum-sample.idx'),
                           config={'shard_index_loc': identity_rule})

        # Test Page Count
        page_query = {
            'url': 'iana.org/',
            'matchType': 'domain',
            'showNumPages': True,
        }
        page_results = list(server.load_cdx(**page_query))
        assert len(page_results) == 1, page_results
        assert json.loads(page_results[0]) == dict(blocks=38,
                                                   pages=4,
                                                   pageSize=10)

        # Test simple query
        matches = list(server.load_cdx(url='iana.org/'))
        assert len(matches) == 3, matches
        expected_fragments = ['20140126200624',
                              '20140127171238',
                              'warc/revisit']
        for position, fragment in enumerate(expected_fragments):
            assert fragment in matches[position]

    finally:
        shutil.rmtree(workdir)
Exemple #17
0
def test_err_404():
    """``assert_404`` raises ``NotFoundException`` locally and via ``assert_error``."""
    # Test local for consistency
    with raises(NotFoundException):
        local_server = CDXServer([TEST_CDX_DIR])
        assert_404(local_server)

    expected_exception = NotFoundException
    assert_error(assert_404, expected_exception)
Exemple #18
0
def assert_error(func, exception):
    """Check that ``func`` raises ``exception`` for both server types.

    ``func`` is called once with a ``CDXServer`` and once with a
    ``RemoteCDXServer``, each built from ``CDX_SERVER_URL`` inside its own
    ``raises`` context.
    """
    for server_cls in (CDXServer, RemoteCDXServer):
        with raises(exception):
            func(server_cls(CDX_SERVER_URL))