コード例 #1
0
    def test_agg_local_revisit(self):
        """Request a resource from the 'local' source and check the response.

        Verifies the source-collection header, the WARC headers parsed from
        the start of the response body, the Memento response headers, the
        HTTP payload contained in the body, and the absence of 'ResErrors'.
        """
        query = '/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local'
        resp = self.testapp.get(query)

        assert resp.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

        # The body is parsed as a WARC/1.0 record header block.
        parser = StatusAndHeadersParser(['WARC/1.0'])
        warc_headers = parser.parse(BytesIO(resp.body))

        # Checked in this order; the first mismatch fails the test.
        expected_warc_headers = (
            ('WARC-Target-URI', 'http://example.com'),
            ('WARC-Date', '2014-01-27T17:12:51Z'),
            ('WARC-Refers-To-Target-URI', 'http://example.com'),
            ('WARC-Refers-To-Date', '2014-01-27T17:12:00Z'),
        )
        for header_name, expected_value in expected_warc_headers:
            assert warc_headers.get_header(header_name) == expected_value

        # Memento headers on the HTTP response itself.
        assert resp.headers['Link'] == MementoUtils.make_link(
            'http://example.com', 'original')
        assert resp.headers[
            'Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

        # The recorded HTTP response must be embedded in the body.
        for marker in (b'HTTP/1.1 200 OK', b'<!doctype html>'):
            assert marker in resp.body

        assert 'ResErrors' not in resp.headers
コード例 #2
0
ファイル: test_handlers.py プロジェクト: ikreymer/pywb
 def _check_uri_date(self, resp, uri, dt):
     """Assert the WARC record in ``resp.body`` has the expected URI and date.

     :param resp: response object; its ``body`` is read through
         ``ChunkedDataReader`` and parsed as a WARC/1.0 header block
     :param uri: expected value of the ``WARC-Target-URI`` header
     :param dt: expected ``WARC-Date`` value, or the literal ``True`` to
         require only that a non-empty ``WARC-Date`` is present
     """
     reader = ChunkedDataReader(BytesIO(resp.body))
     status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(reader)
     assert status_headers.get_header('WARC-Target-URI') == uri
     if dt is True:
         # Truthiness rather than ``!= ''``: a missing header (get_header is
         # expected to return None in that case -- confirm against the
         # parser) must fail here instead of passing vacuously.
         assert status_headers.get_header('WARC-Date')
     else:
         assert status_headers.get_header('WARC-Date') == dt
コード例 #3
0
    def test_record_param_user_coll_revisit(self):
        """Record with user/coll params and verify the written revisit record.

        Writes through a ``PerRecordWARCWriter`` configured with a dedup
        index, then checks the Redis CDXJ entries, the Redis WARC filename
        mapping, and the headers of the revisit record stored on disk.
        """
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(
            self.upstream_url,
            PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 2

        # Use whichever of the two entries is the revisit one.
        cdx = CDXObject(res[0] if b'warc/revisit' in res[0] else res[1])

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = r.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode(
            'utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert status_headers.get_header('WARC-Type') == 'revisit'
            assert status_headers.get_header(
                'WARC-Target-URI') == 'http://httpbin.org/user-agent'
            # Truthiness instead of ``!= ''``: an absent header (expected to
            # come back as None -- confirm against the parser) must fail
            # rather than slip through the comparison.
            assert status_headers.get_header('WARC-Date')
            assert status_headers.get_header(
                'WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Refers-To-Date')
コード例 #4
0
ファイル: test_handlers.py プロジェクト: mirrorweb/pywb
 def _check_uri_date(self, resp, uri, dt):
     """Check the target URI and date of the WARC record held in ``resp.body``.

     :param resp: response object whose ``body`` is wrapped in a
         ``ChunkedDataReader`` and parsed as WARC/1.0 headers
     :param uri: value the ``WARC-Target-URI`` header must equal
     :param dt: value the ``WARC-Date`` header must equal; pass ``True``
         to accept any non-empty ``WARC-Date``
     """
     buff = BytesIO(resp.body)
     buff = ChunkedDataReader(buff)
     status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
     assert status_headers.get_header('WARC-Target-URI') == uri

     warc_date = status_headers.get_header('WARC-Date')
     if dt is True:
         # Require a non-empty value. Comparing with ``!= ''`` would also
         # accept a missing header if get_header yields None for it
         # (presumed behaviour -- confirm against the parser).
         assert warc_date
     else:
         assert warc_date == dt
コード例 #5
0
ファイル: test_recorder.py プロジェクト: ikreymer/pywb
    def test_record_param_user_coll_revisit(self):
        """Check the revisit record produced when recording with user/coll params.

        The recorder writes via ``PerRecordWARCWriter`` with a dedup index.
        The test then inspects the Redis CDXJ sorted set, the Redis
        filename-to-path hash, and the WARC headers of the revisit record
        read back from disk.
        """
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                    '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        # Two entries are expected; pick the one marked as a revisit.
        revisit_line = res[0] if b'warc/revisit' in res[0] else res[1]
        cdx = CDXObject(revisit_line)

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = r.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert status_headers.get_header('WARC-Type') == 'revisit'
            assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
            # Dates only need to be present and non-empty. A truthiness
            # check is used because ``!= ''`` would also pass for a missing
            # header if get_header returns None for it (presumed -- confirm).
            assert status_headers.get_header('WARC-Date')
            assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Refers-To-Date')
コード例 #6
0
ファイル: test_handlers.py プロジェクト: ikreymer/pywb
    def test_agg_local_revisit(self):
        """Fetch a capture from the 'local' source and validate the result.

        Asserts on the source-collection response header, on the WARC
        headers parsed from the response body, on the Memento 'Link' and
        'Memento-Datetime' headers, on the embedded HTTP payload, and on
        the absence of a 'ResErrors' header.
        """
        resp = self.testapp.get(
            '/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local'
        )

        assert resp.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

        # Parse the WARC/1.0 header block at the start of the body.
        record_headers = StatusAndHeadersParser(['WARC/1.0']).parse(
            BytesIO(resp.body))

        target_uri = record_headers.get_header('WARC-Target-URI')
        assert target_uri == 'http://example.com'

        warc_date = record_headers.get_header('WARC-Date')
        assert warc_date == '2014-01-27T17:12:51Z'

        refers_to_uri = record_headers.get_header('WARC-Refers-To-Target-URI')
        assert refers_to_uri == 'http://example.com'

        refers_to_date = record_headers.get_header('WARC-Refers-To-Date')
        assert refers_to_date == '2014-01-27T17:12:00Z'

        # Memento headers on the HTTP response.
        assert resp.headers['Link'] == MementoUtils.make_link(
            'http://example.com', 'original')
        assert resp.headers['Memento-Datetime'] == (
            'Mon, 27 Jan 2014 17:12:51 GMT')

        # The body must carry the recorded HTTP response.
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'<!doctype html>' in resp.body

        assert 'ResErrors' not in resp.headers