Exemple #1
0
    def test_record_param_user_coll(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', None)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 1

        cdx = CDXObject(res[0])
        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'application/json'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        warcs = r.hgetall('USER:COLL:warc')
        full_path = to_path(self.root_dir + '/warcs/' + cdx['filename'])
        assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
Exemple #2
0
    def test_record_param_user_coll_skip(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

        recorder_app = RecorderApp(
            self.upstream_url,
            PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        # No new entries written
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 2
Exemple #3
0
    def test_redis_warc_1(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example2.warc.gz', TEST_WARC_PATH + 'example2.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')

        assert resp.headers['Warcserver-Source-Coll'] == 'example'
Exemple #4
0
    def test_error_redis_file_not_found(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example2.warc.gz',
               './x-no-such-dir/example2.warc.gz')

        resp = self.testapp.get(
            '/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json[
            'message'] == "example2.warc.gz: [Errno 2] No such file or directory: './x-no-such-dir/example2.warc.gz'"

        f.hdel('test:warc', 'example2.warc.gz')
        resp = self.testapp.get(
            '/allredis/resource?url=http://www.example.com/', status=503)

        assert resp.json == {
            'message': 'example2.warc.gz: Archive File Not Found',
            'errors': {
                'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'
            }
        }

        f.delete('test:warc')
        resp = self.testapp.get(
            '/allredis/resource?url=http://www.example.com/', status=503)

        assert resp.json == {
            'message': 'example2.warc.gz: Archive File Not Found',
            'errors': {
                'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'
            }
        }
Exemple #5
0
    def test_record_param_user_coll_write_dupe_no_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/get?foo=bar',
            '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 3)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 3

        mimes = [CDXObject(x)['mime'] for x in res]

        assert sorted(mimes) == [
            'application/json', 'application/json', 'warc/revisit'
        ]

        assert len(writer.fh_cache) == 0
    def __init__(self, old_redis_url, new_redis_url, dry_run=True,
                 per_recording_list=False, s3_import=False, s3_root=None):
        self.old_redis = StrictRedis.from_url(old_redis_url, decode_responses=True)
        self.dry_run = dry_run
        self.per_recording_list = per_recording_list
        self.s3_import = s3_import

        if s3_import:
            assert(s3_root)
            import boto3
            self.s3_root = s3_root
            self.s3 = boto3.client('s3')
        else:
            self.s3_root = None
            self.s3 = None

        if self.dry_run:
            import redis
            redis.StrictRedis = fakeredis.FakeStrictRedis
            self.redis = FakeStrictRedis.from_url(new_redis_url, decode_responses=True)
        else:
            self.redis = StrictRedis.from_url(new_redis_url, decode_responses=True)

        print('Redis Inited')

        self.cli = CLIUserManager(new_redis_url)
Exemple #7
0
    def test_redis_warc_1(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example2.warc.gz', TEST_WARC_PATH + 'example2.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')

        assert resp.headers['Warcserver-Source-Coll'] == 'example'
Exemple #8
0
    def test_record_param_user_coll(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', None)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 1

        cdx = CDXObject(res[0])
        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'application/json'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        warcs = r.hgetall('USER:COLL:warc')
        full_path = to_path(self.root_dir + '/warcs/' + cdx['filename'])
        assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
Exemple #9
0
    def __init__(self,
                 old_redis_url,
                 new_redis_url,
                 dry_run=True,
                 per_recording_list=False,
                 s3_import=False,
                 s3_root=None):
        self.old_redis = StrictRedis.from_url(old_redis_url,
                                              decode_responses=True)
        self.dry_run = dry_run
        self.per_recording_list = per_recording_list
        self.s3_import = s3_import

        if s3_import:
            assert (s3_root)
            import boto3
            self.s3_root = s3_root
            self.s3 = boto3.client('s3')
        else:
            self.s3_root = None
            self.s3 = None

        if self.dry_run:
            import redis
            redis.StrictRedis = fakeredis.FakeStrictRedis
            self.redis = FakeStrictRedis.from_url(new_redis_url,
                                                  decode_responses=True)
        else:
            self.redis = StrictRedis.from_url(new_redis_url,
                                              decode_responses=True)

        print('Redis Inited')

        self.cli = CLIUserManager(new_redis_url)
Exemple #10
0
    def test_record_video_metadata(self):
        pytest.importorskip('youtube_dl')
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        params = {'param.recorder.user': '******',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self._test_warc_write(recorder_app,
            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
Exemple #11
0
    def test_record_video_metadata(self):
        pytest.importorskip('youtube_dl')
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        params = {'param.recorder.user': '******',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self._test_warc_write(recorder_app,
            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
Exemple #12
0
    def test_record_param_user_coll_write_dupe_no_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 3)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 3

        mimes = [CDXObject(x)['mime'] for x in res]

        assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']

        assert len(writer.fh_cache) == 0
Exemple #13
0
    def test_record_custom_record(self):
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain', 'WARC-Custom': 'foo'}

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        with open(warcs[warc_key], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(
                decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header(
            'WARC-Block-Digest') == status_headers.get_header(
                'WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #14
0
    def test_record_custom_record(self):
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain',
                   'WARC-Custom': 'foo'
                  }

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        with open(warcs[warc_key], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

        writer.close()
        assert len(writer.fh_cache) == 0
    def test_anon_auto_delete(self):
        sesh_redis = FakeStrictRedis.from_url('redis://localhost:6379/0')
        sesh_redis.flushdb()

        time.sleep(4.0)

        assert set(self.redis.keys()) == set([b'h:roles', b'h:defaults', b'h:temp-usage'])

        assert glob.glob(os.path.join(self.warcs_dir, 'temp$*')) == []
Exemple #16
0
    def test_record_param_user_coll_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(
            self.upstream_url,
            PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 2

        if b'warc/revisit' in res[0]:
            cdx = CDXObject(res[0])
        else:
            cdx = CDXObject(res[1])

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = r.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode(
            'utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert status_headers.get_header('WARC-Type') == 'revisit'
            assert status_headers.get_header(
                'WARC-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Date') != ''
            assert status_headers.get_header(
                'WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Refers-To-Date') != ''
Exemple #17
0
    def setup_class(cls, redis_url='redis://localhost:6379/2'):
        super(FakeRedisTests, cls).setup_class()

        del PUBSUBS[:]
        DATABASES.clear()

        cls.redismock = patch('redis.StrictRedis', FakeStrictRedisSharedPubSub)
        cls.redismock.start()

        cls.redis = FakeStrictRedis.from_url(redis_url)
    def test_anon_auto_delete(self):
        sesh_redis = FakeStrictRedis.from_url('redis://localhost:6379/0')
        sesh_redis.flushdb()

        def assert_empty_keys():
            assert set(self.redis.keys()) == set(
                ['h:roles', 'h:defaults', 'h:temp-usage'])
            assert glob.glob(os.path.join(self.warcs_dir, 'temp$*')) == []

        self.sleep_try(0.1, 10.0, assert_empty_keys)
Exemple #19
0
    def setup_class(cls, redis_url='redis://localhost:6379/2'):
        super(FakeRedisTests, cls).setup_class()

        del PUBSUBS[:]
        DATABASES.clear()

        cls.redismock = patch('redis.StrictRedis', FakeStrictRedisSharedPubSub)
        cls.redismock.start()

        cls.redis = FakeStrictRedis.from_url(redis_url)
Exemple #20
0
    def test_url_agnost(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-revisit.warc.gz')
        f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-orig.warc.gz')

        resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')

        assert resp.status_int == 200
        assert resp.headers['Link'] == MementoUtils.make_link('http://[email protected]/', 'original')
        assert resp.headers['Warcserver-Source-Coll'] == 'url-agnost'
        assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
Exemple #21
0
    def test_url_agnost(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-revisit.warc.gz')
        f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-orig.warc.gz')

        resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')

        assert resp.status_int == 200
        assert resp.headers['Link'] == MementoUtils.make_link('http://[email protected]/', 'original')
        assert resp.headers['Warcserver-Source-Coll'] == 'url-agnost'
        assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
Exemple #22
0
    def test_record_param_user_coll_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                    '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        if b'warc/revisit' in res[0]:
            cdx = CDXObject(res[0])
        else:
            cdx = CDXObject(res[1])

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = r.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert status_headers.get_header('WARC-Type') == 'revisit'
            assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Date') != ''
            assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Refers-To-Date') != ''
    def test_anon_auto_delete(self):
        sesh_redis = FakeStrictRedis.from_url('redis://localhost:6379/0')
        sesh_redis.flushdb()

        def assert_empty_keys():
            assert set(self.redis.keys()) == set(self.POST_DEL_KEYS)
            assert glob.glob(os.path.join(self.warcs_dir, 'temp$*')) == []

        self.sleep_try(0.1, 10.0, assert_empty_keys)

        def assert_dir_delete():
            assert not os.path.isdir(os.path.join(self.warcs_dir, self.anon_user))

        self.sleep_try(0.1, 5.0, assert_dir_delete)
Exemple #24
0
    def test_anon_auto_delete(self):
        sesh_redis = FakeStrictRedis.from_url('redis://localhost:6379/0')
        sesh_redis.flushdb()

        def assert_empty_keys():
            assert set(self.redis.keys()) == set(self.POST_DEL_KEYS)
            assert glob.glob(os.path.join(self.warcs_dir, 'temp$*')) == []

        self.sleep_try(0.1, 10.0, assert_empty_keys)

        def assert_dir_delete():
            assert not os.path.isdir(
                os.path.join(self.warcs_dir, self.anon_user))

        self.sleep_try(0.1, 5.0, assert_dir_delete)
Exemple #25
0
    def setup_class(cls,
                    extra_config_file='test_no_invites_config.yaml',
                    init_anon=True,
                    **kwargs):
        super(BaseWRTests, cls).setup_class()

        cls.warcs_dir = to_path(cls.root_dir + '/warcs/')

        os.makedirs(cls.warcs_dir)
        os.environ['RECORD_ROOT'] = cls.warcs_dir

        os.environ['WR_CONFIG'] = 'pkg://webrecorder/config/wr.yaml'
        if extra_config_file:
            os.environ['WR_USER_CONFIG'] = os.path.join(
                cls.get_curr_dir(), extra_config_file)

        os.environ['REDIS_BASE_URL'] = 'redis://*****:*****@localhost')
        cls.set_nx_env('EMAIL_SMTP_URL',
                       'smtp://[email protected]:test@localhost:25')

        cls.redis = FakeStrictRedis.from_url(os.environ['REDIS_BASE_URL'],
                                             decode_responses=True)

        cls.custom_init(kwargs)

        if kwargs.get('no_app'):
            return

        cls.appcont = AppController()
        cls.testapp = webtest.TestApp(cls.appcont.app)

        if init_anon:
            res = cls.testapp.get('/api/v1/anon_user')
            cls.anon_user = res.json['anon_user']
        else:
            cls.anon_user = None
Exemple #26
0
    def test_error_redis_file_not_found(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example2.warc.gz', './x-no-such-dir/example2.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json['message'] == "example2.warc.gz: [Errno 2] No such file or directory: './x-no-such-dir/example2.warc.gz'"

        f.hdel('test:warc', 'example2.warc.gz')
        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)

        assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}

        f.delete('test:warc')
        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)

        assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}
    def test_record_param_user_coll_skip(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

        recorder_app = RecorderApp(self.upstream_url,
                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        # No new entries written
        self._test_all_warcs('/warcs/', 2)

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2
Exemple #28
0
    def test_record_multiple_writes_keep_open(self):
        warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body


        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?boo=far', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]

        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout, fh, filename,
                            cdxj=True, append_post=True, sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                '/get?boo=far', '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #29
0
 def add_cdx_to_redis(filename, key, redis_url='redis://localhost:6379/2'):
     r = FakeStrictRedis.from_url(redis_url)
     with open(filename, 'rb') as fh:
         for line in fh:
             r.zadd(key, 0, line.rstrip())
 def setup_class(cls):
     super(TestRecordDedup, cls).setup_class('config_test_record_dedup.yaml', custom_config={'recorder': 'live'})
     cls.redis = FakeStrictRedis.from_url("redis://localhost/0")
def setup_module():
    r = FakeStrictRedis.from_url('redis://localhost:6379/2')
    r.delete('test:rediscdx')
    with open('testdata/iana.cdxj', 'rb') as fh:
        for line in fh:
            r.zadd('test:rediscdx', 0, line.rstrip())
Exemple #32
0
def get_config():
    config = TestCacheConfig()
    config.cache_redis_client = FakeStrictRedis.from_url(
        config.CACHE_ALCHEMY_REDIS_URL, decode_responses=True)
    return config
Exemple #33
0
    def setup_class(cls, extra_config_file='test_no_invites_config.yaml',
                    init_anon=True,
                    **kwargs):
        super(BaseWRTests, cls).setup_class()

        cls.warcs_dir = to_path(cls.root_dir + '/warcs/')
        cls.storage_dir = os.path.join(to_path(cls.root_dir + '/storage/'))

        os.makedirs(cls.warcs_dir)
        os.environ['RECORD_ROOT'] = cls.warcs_dir
        os.environ['STORAGE_ROOT'] = cls.storage_dir

        cls.storage_today = os.path.join(cls.storage_dir, today_str())

        os.environ['WR_CONFIG'] = 'pkg://webrecorder/config/wr.yaml'
        if extra_config_file:
            os.environ['WR_USER_CONFIG'] = os.path.join(cls.get_curr_dir(), extra_config_file)

        os.environ['REDIS_BASE_URL'] = 'redis://*****:*****@localhost')
        cls.set_nx_env('EMAIL_SMTP_URL', 'smtp://[email protected]:test@localhost:25')

        cls.set_nx_env('NO_REMOTE_BROWSERS', '1')

        def load_wr_config():
            config = load_overlay_config('WR_CONFIG', 'pkg://webrecorder/config/wr.yaml', 'WR_USER_CONFIG', '')
            config['dyn_stats_key_templ'] = {
                 'rec': 'r:{rec}:<sesh_id>:stats:',
                 'coll': 'c:{coll}:<sesh_id>:stats:'
            }

            config['dyn_ref_templ'] = {
                 'rec': 'r:{rec}:<sesh_id>:ref:',
                 'coll': 'c:{coll}:<sesh_id>:ref:',
            }
            return config

        import webrecorder.maincontroller
        webrecorder.maincontroller.load_wr_config = load_wr_config

        cls.redis = FakeStrictRedis.from_url(os.environ['REDIS_BASE_URL'], decode_responses=True)
        cls.sesh_redis = FakeStrictRedis.from_url(os.environ['REDIS_SESSION_URL'], decode_responses=True)

        cls.custom_init(kwargs)

        if kwargs.get('no_app'):
            return

        cls.maincont = MainController()
        cls.testapp = webtest.TestApp(cls.maincont.app)

        if init_anon:
            res = cls.testapp.post('/api/v1/auth/anon_user')
            cls.anon_user = res.json['user']['username']
            cls.assert_temp_user_sesh(cls.anon_user)
        else:
            cls.anon_user = None
Exemple #34
0
    def test_record_multiple_writes_keep_open(self):
        warc_path = to_path(self.root_dir +
                            '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar',
                                     '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far',
                                     '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]

        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout,
                            fh,
                            filename,
                            cdxj=True,
                            append_post=True,
                            sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far',
                                     '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0