def test_cdx_all_coll(self):
        """Query the 'all' CDX endpoint for httpbin.org/get* and verify each JSON line."""
        resp = self.testapp.get('/all/cdx?url=http://httpbin.org/get*&output=json')

        records = [json.loads(raw) for raw in resp.text.rstrip().split('\n')]

        assert len(records) == 4

        def column(field):
            # Collect one field across all returned lines, in response order
            return [rec[field] for rec in records]

        assert column('url') == [
            'http://httpbin.org/get?A=B',
            'http://httpbin.org/get?A=B',
            'http://httpbin.org/get?C=D',
            'http://httpbin.org/get?C=D2',
        ]

        assert column('urlkey') == [
            'org,httpbin)/get?__pywb_method=head&a=b',
            'org,httpbin)/get?a=b',
            'org,httpbin)/get?c=d',
            'org,httpbin)/get?c=d2',
        ]

        assert column('source') == [
            to_path('test/indexes/autoindex.cdxj'),
            to_path('test/indexes/autoindex.cdxj'),
            to_path('test2/indexes/autoindex.cdxj'),
            to_path('test/indexes/autoindex.cdxj'),
        ]

        assert column('source-coll') == ['test', 'test', 'test2', 'test']

        # The second and fourth lines must report the same filename
        assert records[1]['filename'] == records[3]['filename']
Exemple #2
0
    def test_agg_dir_sources_2(self):
        """Check the source list returned by dir_loader for param.coll '[A,C]'."""
        sources = self.dir_loader.get_source_list(
            {'url': 'example.com/', 'param.coll': '[A,C]'})

        expected_files = {
            to_path('colls:A/indexes/example2.cdxj'): 'file',
            to_path('colls:C/indexes/dupes.cdxj'): 'file',
        }

        assert sources == {'sources': expected_files}
Exemple #3
0
    def test_timemap_all_coll(self):
        """Fetch the link timemap from the 'all' route and check the collection attributes."""
        resp = self.testapp.get('/all/timemap/link/http://httpbin.org/get?C=D')
        lines = resp.text.rstrip().split('\n')
        assert len(lines) == 5

        # Lines 3 and 4 carry the collection attribute: test2 first, then test
        expected_attrs = ('collection="test2"', 'collection="test"')
        for line, attr in zip(lines[3:], expected_attrs):
            assert to_path(attr) in line
Exemple #4
0
    def test_record_param_user_coll(self):
        """Record /user-agent for USER/COLL and verify the WARC dir and Redis entries."""
        writer = PerRecordWARCWriter(
            to_path(self.root_dir + '/warcs/{user}/{coll}/'),
            dedup_index=self._get_dedup_index())
        recorder_app = RecorderApp(self.upstream_url, writer)

        coll_dir = '/warcs/USER/COLL/'

        # Before recording: num=None
        self._test_all_warcs(coll_dir, None)

        recorder_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                     recorder_params)

        assert '"user-agent": "{0}"'.format(UA) in resp.text

        # After recording: exactly one WARC
        self._test_all_warcs(coll_dir, 1)

        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_db.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(entries) == 1

        cdx = CDXObject(entries[0])
        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'application/json'
        assert cdx['offset'] == '0'

        warc_name = cdx['filename']
        assert warc_name.startswith(to_path('USER/COLL/'))
        assert warc_name.endswith('.warc.gz')

        # The warc hash must map the indexed filename to its full path
        stored = redis_db.hgetall('USER:COLL:warc')
        full_path = to_path(self.root_dir + '/warcs/' + warc_name)
        assert stored == {warc_name.encode('utf-8'): full_path.encode('utf-8')}
Exemple #5
0
    def test_cache_dir_sources_1(self):
        """Query cache_dir_loader twice unchanged, then again after adding a new index file."""
        expected = {
            'sources': {
                to_path('colls:A/indexes/example2.cdxj'): 'file',
                to_path('colls:B/indexes/iana.cdxj'): 'file',
                to_path('colls:C/indexes/dupes.cdxj'): 'file',
            }
        }

        # Two consecutive lookups must give the same listing
        for _ in range(2):
            listing = self.cache_dir_loader.get_source_list(
                {'url': 'example.com/', 'param.coll': '*'})
            assert listing == expected

        new_index = os.path.join(self.root_dir, to_path('colls/C/indexes/empty.cdxj'))

        # ensure new file is created at least a second later
        time.sleep(1.0)

        # Create the file if needed and refresh its timestamps
        with open(new_index, 'a'):
            os.utime(new_index, None)

        listing = self.cache_dir_loader.get_source_list(
            {'url': 'example.com/', 'param.coll': '*'})

        # New File Included
        expected['sources'][to_path('colls:C/indexes/empty.cdxj')] = 'file'
        assert listing == expected
Exemple #6
0
    def test_record_param_user_coll(self):
        """Record /user-agent for USER/COLL and verify the WARC dir and Redis entries."""
        writer = PerRecordWARCWriter(
            to_path(self.root_dir + '/warcs/{user}/{coll}/'),
            dedup_index=self._get_dedup_index())
        recorder_app = RecorderApp(self.upstream_url, writer)

        coll_dir = '/warcs/USER/COLL/'

        # Before recording: num=None
        self._test_all_warcs(coll_dir, None)

        recorder_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                     recorder_params)

        assert '"user-agent": "{0}"'.format(UA) in resp.text

        # After recording: exactly one WARC
        self._test_all_warcs(coll_dir, 1)

        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_db.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(entries) == 1

        cdx = CDXObject(entries[0])
        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'application/json'
        assert cdx['offset'] == '0'

        warc_name = cdx['filename']
        assert warc_name.startswith(to_path('USER/COLL/'))
        assert warc_name.endswith('.warc.gz')

        # The warc hash must map the indexed filename to its full path
        stored = redis_db.hgetall('USER:COLL:warc')
        full_path = to_path(self.root_dir + '/warcs/' + warc_name)
        assert stored == {warc_name.encode('utf-8'): full_path.encode('utf-8')}
Exemple #7
0
    def test_record_param_user_coll_revisit(self):
        """Record /user-agent again for USER/COLL and verify the revisit entry and its WARC headers."""
        writer = PerRecordWARCWriter(
            to_path(self.root_dir + '/warcs/{user}/{coll}/'),
            dedup_index=self._get_dedup_index())
        recorder_app = RecorderApp(self.upstream_url, writer)

        coll_dir = '/warcs/USER/COLL/'
        self._test_all_warcs(coll_dir, 1)

        recorder_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                     recorder_params)

        assert '"user-agent": "{0}"'.format(UA) in resp.text

        self._test_all_warcs(coll_dir, 2)

        # Test Redis CDX
        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_db.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                                       '(org,httpbin,')
        assert len(entries) == 2

        # Take the first line if it mentions warc/revisit, otherwise the second
        revisit_line = entries[0] if b'warc/revisit' in entries[0] else entries[1]
        cdx = CDXObject(revisit_line)

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'

        warc_name = cdx['filename']
        assert warc_name.startswith(to_path('USER/COLL/'))
        assert warc_name.endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', warc_name)

        warcs = redis_db.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[warc_name.encode('utf-8')] == fullwarc.encode('utf-8')

        with open(fullwarc, 'rb') as stream:
            reader = DecompressingBufferedReader(stream)
            # Test refers-to headers
            warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(reader)
            header = warc_headers.get_header
            assert header('WARC-Type') == 'revisit'
            assert header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
            assert header('WARC-Date') != ''
            assert header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert header('WARC-Refers-To-Date') != ''
Exemple #8
0
    def test_agg_all_found_2(self):
        """Query dir_loader for example.com/ with param.coll '*' and compare the entries."""
        cdx_iter, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})

        dupes_index = to_path('colls:C/indexes/dupes.cdxj')
        example_index = to_path('colls:A/indexes/example2.cdxj')

        expected = [
            dict(source=dupes_index, timestamp='20140127171200', filename='dupes.warc.gz'),
            dict(source=dupes_index, timestamp='20140127171251', filename='dupes.warc.gz'),
            dict(source=example_index, timestamp='20160225042329', filename='example2.warc.gz'),
        ]

        assert to_json_list(cdx_iter) == expected
        assert errs == {}
Exemple #9
0
    def test_agg_all_found_1(self):
        """Query dir_loader for iana.org/ with param.coll '*' and compare the entries."""
        cdx_iter, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})

        iana_index = to_path('colls:B/indexes/iana.cdxj')
        dupes_index = to_path('colls:C/indexes/dupes.cdxj')

        # The dupes index is expected to contribute the same timestamp twice
        expected = [
            dict(source=iana_index, timestamp='20140126200624', filename='iana.warc.gz'),
            dict(source=dupes_index, timestamp='20140127171238', filename='dupes.warc.gz'),
            dict(source=dupes_index, timestamp='20140127171238', filename='dupes.warc.gz'),
        ]

        assert to_json_list(cdx_iter) == expected
        assert errs == {}
Exemple #10
0
    def test_record_param_user_coll_revisit(self):
        """Record /user-agent again for USER/COLL and verify the revisit entry and its WARC headers."""
        writer = PerRecordWARCWriter(
            to_path(self.root_dir + '/warcs/{user}/{coll}/'),
            dedup_index=self._get_dedup_index())
        recorder_app = RecorderApp(self.upstream_url, writer)

        coll_dir = '/warcs/USER/COLL/'
        self._test_all_warcs(coll_dir, 1)

        recorder_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                     recorder_params)

        assert '"user-agent": "{0}"'.format(UA) in resp.text

        self._test_all_warcs(coll_dir, 2)

        # Test Redis CDX
        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_db.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                                       '(org,httpbin,')
        assert len(entries) == 2

        # Take the first line if it mentions warc/revisit, otherwise the second
        revisit_line = entries[0] if b'warc/revisit' in entries[0] else entries[1]
        cdx = CDXObject(revisit_line)

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'

        warc_name = cdx['filename']
        assert warc_name.startswith(to_path('USER/COLL/'))
        assert warc_name.endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', warc_name)

        warcs = redis_db.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[warc_name.encode('utf-8')] == fullwarc.encode('utf-8')

        with open(fullwarc, 'rb') as stream:
            reader = DecompressingBufferedReader(stream)
            # Test refers-to headers
            warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(reader)
            header = warc_headers.get_header
            assert header('WARC-Type') == 'revisit'
            assert header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
            assert header('WARC-Date') != ''
            assert header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert header('WARC-Refers-To-Date') != ''
Exemple #11
0
    def test_record_video_metadata(self):
        """Record youtube-dl metadata for a video URL and check the stored metadata record.

        Skipped when the youtube_dl module cannot be imported.
        """
        pytest.importorskip('youtube_dl')
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # The user must be 'USER': the Redis hash read below is 'USER:VIDEO:warc'
        params = {'param.recorder.user': 'USER',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        self._test_warc_write(recorder_app,
            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        # The hash values are used as the paths of the written WARC files
        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        # Block and payload digests are expected to be identical for this record
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
Exemple #12
0
    def test_record_param_user_coll_skip(self):
        """Record with SkipDupePolicy and check the WARC count and index size stay at 2."""
        writer = PerRecordWARCWriter(
            to_path(self.root_dir + '/warcs/{user}/{coll}/'),
            dedup_index=self._get_dedup_index(dupe_policy=SkipDupePolicy()))
        recorder_app = RecorderApp(self.upstream_url, writer)

        coll_dir = '/warcs/USER/COLL/'

        # No new entries written
        self._test_all_warcs(coll_dir, 2)

        recorder_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                     recorder_params)

        assert '"user-agent": "{0}"'.format(UA) in resp.text

        # Still two WARCs after the write
        self._test_all_warcs(coll_dir, 2)

        # Test Redis CDX
        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_db.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                                       '(org,httpbin,')
        assert len(entries) == 2
Exemple #13
0
    def test_agg_collB_found(self):
        """Query dir_loader for iana.org/ with param.coll 'B' and expect the single iana entry."""
        cdx_iter, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})

        iana_entry = dict(source=to_path('colls:B/indexes/iana.cdxj'),
                          timestamp='20140126200624',
                          filename='iana.warc.gz')

        assert to_json_list(cdx_iter) == [iana_entry]
        assert errs == {}
Exemple #14
0
    def test_record_param_user_coll_write_dupe_no_revisit(self):
        """Record with WriteDupePolicy and check a third WARC and index entry are added."""
        writer = PerRecordWARCWriter(
            to_path(self.root_dir + '/warcs/{user}/{coll}/'),
            dedup_index=self._get_dedup_index(dupe_policy=WriteDupePolicy()))
        recorder_app = RecorderApp(self.upstream_url, writer)

        recorder_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar',
                                     recorder_params)
        for fragment in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
            assert fragment in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 3)

        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_db.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                                       '(org,httpbin,')
        assert len(entries) == 3

        # Two JSON entries and one revisit entry are expected in the index
        mimes = sorted(CDXObject(entry)['mime'] for entry in entries)
        assert mimes == ['application/json', 'application/json', 'warc/revisit']

        assert len(writer.fh_cache) == 0
Exemple #15
0
    def test_record_param_user_coll_write_dupe_no_revisit(self):
        """Record with WriteDupePolicy and check a third WARC and index entry are added."""
        writer = PerRecordWARCWriter(
            to_path(self.root_dir + '/warcs/{user}/{coll}/'),
            dedup_index=self._get_dedup_index(dupe_policy=WriteDupePolicy()))
        recorder_app = RecorderApp(self.upstream_url, writer)

        recorder_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar',
                                     recorder_params)
        for fragment in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
            assert fragment in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 3)

        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_db.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                                       '(org,httpbin,')
        assert len(entries) == 3

        # Two JSON entries and one revisit entry are expected in the index
        mimes = sorted(CDXObject(entry)['mime'] for entry in entries)
        assert mimes == ['application/json', 'application/json', 'warc/revisit']

        assert len(writer.fh_cache) == 0
Exemple #16
0
    def test_record_skip_http_only_cookies_header(self):
        """Record with ExcludeHttpOnlyCookieHeaders and check HttpOnly cookies are not stored.

        The live response must carry both HttpOnly and other cookies, while the
        record loaded back from the WARC directory must keep only the others.
        """
        warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
        header_filter = ExcludeHttpOnlyCookieHeaders()
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(
                                       warc_path, header_filter=header_filter),
                                   accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
        assert b'HTTP/1.1 302' in resp.body

        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff)

        non_http_only, http_only = self._get_http_only_cookies(record)
        # both httponly and other cookies
        assert http_only is not None
        assert non_http_only is not None

        stored_req, stored_resp = self._load_resp_req(warc_path)

        non_http_only, http_only = self._get_http_only_cookies(stored_resp)
        # no httponly cookies
        assert http_only is None
        assert non_http_only is not None

        # The stored request keeps X-Other but not the Cookie header
        assert ('X-Other', 'foo') in stored_req.http_headers.headers
        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

        self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
Exemple #17
0
    def test_record_skip_http_only_cookies_header(self):
        """Record with ExcludeHttpOnlyCookieHeaders and check HttpOnly cookies are not stored.

        The live response must carry both HttpOnly and other cookies, while the
        record loaded back from the WARC directory must keep only the others.
        """
        warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
        header_filter = ExcludeHttpOnlyCookieHeaders()
        recorder_app = RecorderApp(self.upstream_url,
                         PerRecordWARCWriter(warc_path, header_filter=header_filter),
                            accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
        assert b'HTTP/1.1 302' in resp.body

        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff)

        non_http_only, http_only = self._get_http_only_cookies(record)
        # both httponly and other cookies
        assert http_only is not None
        assert non_http_only is not None

        stored_req, stored_resp = self._load_resp_req(warc_path)

        non_http_only, http_only = self._get_http_only_cookies(stored_resp)
        # no httponly cookies
        assert http_only is None
        assert non_http_only is not None

        # The stored request keeps X-Other but not the Cookie header
        assert ('X-Other', 'foo') in stored_req.http_headers.headers
        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

        self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
Exemple #18
0
    def test_agg_collA_found(self):
        """Query dir_loader for example.com/ with param.coll 'A' and expect the single example2 entry."""
        cdx_iter, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})

        example_entry = dict(source=to_path('colls:A/indexes/example2.cdxj'),
                             timestamp='20160225042329',
                             filename='example2.warc.gz')

        assert to_json_list(cdx_iter) == [example_entry]
        assert errs == {}
Exemple #19
0
    def test_record_skip_all_cookies_header(self):
        """Record with Set-Cookie/Cookie excluded and check neither header is stored."""
        warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
        writer = PerRecordWARCWriter(
            warc_path,
            header_filter=ExcludeSpecificHeaders(['Set-Cookie', 'Cookie']))
        recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
        assert b'HTTP/1.1 302' in resp.body

        record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))

        cookies = ('name=value; Path=/', 'foo=bar; Path=/')

        # The live response still contains both Set-Cookie headers
        for cookie in cookies:
            assert ('Set-Cookie', cookie) in record.http_headers.headers

        stored_req, stored_resp = self._load_resp_req(warc_path)

        # The stored response contains neither of them
        for cookie in cookies:
            assert ('Set-Cookie', cookie) not in stored_resp.http_headers.headers

        request_headers = stored_req.http_headers.headers
        assert ('X-Other', 'foo') in request_headers
        assert ('Cookie', 'boo=far') not in request_headers

        self._test_all_warcs('/warcs/cookieskip/', 1)
Exemple #20
0
    def test_record_video_metadata(self):
        """Record youtube-dl metadata for a video URL and check the stored metadata record.

        Skipped when the youtube_dl module cannot be imported.
        """
        pytest.importorskip('youtube_dl')
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # The user must be 'USER': the Redis hash read below is 'USER:VIDEO:warc'
        params = {'param.recorder.user': 'USER',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        self._test_warc_write(recorder_app,
            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        # The hash values are used as the paths of the written WARC files
        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        # Block and payload digests are expected to be identical for this record
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
Exemple #21
0
    def test_record_custom_record(self):
        """PUT a custom 'resource' record and verify its WARC and HTTP headers and payload."""
        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(
            to_path(self.root_dir + '/warcs/meta/meta.warc.gz'),
            dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        payload = b'Some Data'
        payload_len = str(len(payload))

        resp = webtest.TestApp(recorder_app).put(
            '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource',
            headers={'content-type': 'text/plain', 'WARC-Custom': 'foo'},
            params=payload)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = redis_db.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        with open(warcs[warc_key], 'rb') as stream:
            record = ArcWarcRecordLoader().parse_record_stream(
                DecompressingBufferedReader(stream), ensure_http_headers=True)

        # WARC-level headers
        warc_header = record.rec_headers.get_header
        assert len(record.rec_headers.headers) == 9
        assert warc_header('WARC-Type') == 'resource'
        assert warc_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert warc_header('WARC-Record-ID') != ''
        assert warc_header('WARC-Date') != ''
        assert warc_header('WARC-Block-Digest') != ''
        assert warc_header('WARC-Block-Digest') == warc_header('WARC-Payload-Digest')
        assert warc_header('Content-Type') == 'text/plain'
        assert warc_header('Content-Length') == payload_len
        assert warc_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == payload

        # HTTP-level headers
        http_header = record.http_headers.get_header
        assert len(record.http_headers.headers) == 2

        assert http_header('Content-Type') == 'text/plain'
        assert http_header('Content-Length') == payload_len

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #22
0
    def setup_class(cls):
        """Run the parent setup, create the warcs directory and store the upstream URL."""
        super(TestRecorder, cls).setup_class()

        # Directory under root_dir used by the recorder tests
        cls.warcs_dir = to_path(cls.root_dir + '/warcs')
        os.makedirs(cls.warcs_dir)

        port = cls.server.port
        cls.upstream_url = 'http://localhost:{0}'.format(port)
Exemple #23
0
    def test_record_multiple_writes_rollover_idle(self):
        """Check that a record written after close_idle_files() goes to a second WARC.

        Two records are written and one WARC is expected. After sleeping past
        max_idle_secs and calling close_idle_files(), a third record is written
        and two WARCs are expected.
        """
        warc_path = to_path(self.root_dir +
                            '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path,
                                     dedup_index=dedup_index,
                                     max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        def write_and_check(url_path, expected_json):
            # Record one httpbin.org URL into GOO and check the response body
            resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                         url_path,
                                         '&param.recorder.coll=GOO')

            assert b'HTTP/1.1 200 OK' in resp.body
            assert expected_json in resp.body

        # First Record
        write_and_check('/get?foo=bar', b'"foo": "bar"')

        # Second Record
        write_and_check('/get?boo=far', b'"boo": "far"')

        self._test_all_warcs('/warcs/GOO/', 1)

        # Sleep longer than max_idle_secs (0.9) before closing idle files
        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        write_and_check('/get?goo=bar', b'"goo": "bar"')

        self._test_all_warcs('/warcs/GOO/', 2)

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #24
0
    def setup_class(cls):
        """Run the parent setup, create the warcs directory and store the upstream URL."""
        super(TestRecorder, cls).setup_class()

        # Directory under root_dir used by the recorder tests
        cls.warcs_dir = to_path(cls.root_dir + '/warcs')
        os.makedirs(cls.warcs_dir)

        port = cls.server.port
        cls.upstream_url = 'http://localhost:{0}'.format(port)
Exemple #25
0
    def test_extra_agg_collB(self):
        """Wrap dir_loader in a SimpleAggregator under 'dir' and query coll B for iana.org/."""
        aggregator = SimpleAggregator({'dir': self.dir_loader})
        cdx_iter, errs = aggregator({'url': 'iana.org/', 'param.coll': 'B'})

        # The source name is expected to carry the 'dir:' prefix
        iana_entry = dict(source=to_path('dir:colls:B/indexes/iana.cdxj'),
                          timestamp='20140126200624',
                          filename='iana.warc.gz')

        assert to_json_list(cdx_iter) == [iana_entry]
        assert errs == {}
Exemple #26
0
    def test_record_skip_wrong_coll(self):
        """With accept_colls='not-live' the response is still returned and 2 WARCs are expected."""
        warc_writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
        recorder_app = RecorderApp(self.upstream_url,
                                   writer=warc_writer,
                                   accept_colls='not-live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        for fragment in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
            assert fragment in resp.body

        self._test_all_warcs('/warcs/', 2)
Exemple #27
0
    def test_record_custom_record(self):
        """PUT a custom 'resource' record and verify its WARC and HTTP headers and payload."""
        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(
            to_path(self.root_dir + '/warcs/meta/meta.warc.gz'),
            dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        payload = b'Some Data'
        payload_len = str(len(payload))

        resp = webtest.TestApp(recorder_app).put(
            '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource',
            headers={'content-type': 'text/plain', 'WARC-Custom': 'foo'},
            params=payload)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        redis_db = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = redis_db.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        with open(warcs[warc_key], 'rb') as stream:
            record = ArcWarcRecordLoader().parse_record_stream(
                DecompressingBufferedReader(stream), ensure_http_headers=True)

        # WARC-level headers
        warc_header = record.rec_headers.get_header
        assert len(record.rec_headers.headers) == 9
        assert warc_header('WARC-Type') == 'resource'
        assert warc_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert warc_header('WARC-Record-ID') != ''
        assert warc_header('WARC-Date') != ''
        assert warc_header('WARC-Block-Digest') != ''
        assert warc_header('WARC-Block-Digest') == warc_header('WARC-Payload-Digest')
        assert warc_header('Content-Type') == 'text/plain'
        assert warc_header('Content-Length') == payload_len
        assert warc_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == payload

        # HTTP-level headers
        http_header = record.http_headers.get_header
        assert len(record.http_headers.headers) == 2

        assert http_header('Content-Type') == 'text/plain'
        assert http_header('Content-Length') == payload_len

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #28
0
    def test_agg_dir_and_memento(self):
        """Aggregate a memento source ('ia') with dir_loader ('local') using closest=20100512, limit=6."""
        agg_source = SimpleAggregator({
            'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
            'local': self.dir_loader,
        })

        query = {'url': 'example.com/', 'param.local.coll': '*',
                 'closest': '20100512', 'limit': 6}
        cdx_iter, errs = agg_source(query)

        def ia_entry(timestamp):
            # Expected entry from the 'ia' source for the given capture timestamp
            load_url = 'http://web.archive.org/web/' + timestamp + 'id_/http://example.com/'
            return {'source': 'ia', 'timestamp': timestamp, 'load_url': load_url}

        def local_entry(index, timestamp, warc):
            # Expected entry from a local collection index file
            return {'source': to_path('local:colls:' + index),
                    'timestamp': timestamp,
                    'filename': warc}

        expected = [
            ia_entry('20100514231857'),
            ia_entry('20100519202418'),
            ia_entry('20100501123414'),
            local_entry('C/indexes/dupes.cdxj', '20140127171200', 'dupes.warc.gz'),
            local_entry('C/indexes/dupes.cdxj', '20140127171251', 'dupes.warc.gz'),
            local_entry('A/indexes/example2.cdxj', '20160225042329', 'example2.warc.gz'),
        ]

        assert to_json_list(cdx_iter) == expected
        assert errs == {}
Exemple #29
0
    def test_record_warc_1(self):
        """Record one httpbin.org response and expect a single WARC under /warcs/."""
        warc_writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
        recorder_app = RecorderApp(self.upstream_url, warc_writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        for fragment in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
            assert fragment in resp.body

        self._test_all_warcs('/warcs/', 1)
Exemple #30
0
    def test_record_skip_wrong_coll(self):
        """With accept_colls='not-live' the response is still returned and 2 WARCs are expected."""
        warc_writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
        recorder_app = RecorderApp(self.upstream_url,
                                   writer=warc_writer,
                                   accept_colls='not-live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        for fragment in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
            assert fragment in resp.body

        self._test_all_warcs('/warcs/', 2)
Exemple #31
0
    def test_record_warc_1(self):
        """Write one record for an httpbin GET and check the WARC directory.

        The response must contain the 200 status line and the echoed ``foo``
        argument; afterwards '/warcs/' must contain a single WARC file.
        """
        target_dir = to_path(self.root_dir + '/warcs/')
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(target_dir))

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

        payload = resp.body
        assert b'HTTP/1.1 200 OK' in payload
        assert b'"foo": "bar"' in payload

        self._test_all_warcs('/warcs/', 1)
Exemple #32
0
    def test_error_url(self):
        """Request through a recorder whose upstream URL has '01' appended.

        The request is expected to fail with HTTP 400 and a non-empty JSON
        ``error`` field, and '/warcs/' is expected to still hold two files.
        """
        warc_writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
        recorder_app = RecorderApp(self.upstream_url + '01',
                                   warc_writer,
                                   accept_colls='live')

        testapp = webtest.TestApp(recorder_app)
        # status=400 makes webtest itself assert on the response status
        resp = testapp.get('/live/resource?url=http://example.com/', status=400)

        error_msg = resp.json['error']
        assert error_msg != ''

        self._test_all_warcs('/warcs/', 2)
Exemple #33
0
    def test_error_url(self):
        """Check error reporting when the recorder's upstream URL is altered.

        '01' is appended to the upstream URL; a GET for example.com must then
        return status 400 with a non-empty ``error`` entry in the JSON body.
        The file count in '/warcs/' is checked to be two afterwards.
        """
        bad_upstream = self.upstream_url + '01'
        target_dir = to_path(self.root_dir + '/warcs/')
        recorder_app = RecorderApp(bad_upstream,
                                   PerRecordWARCWriter(target_dir),
                                   accept_colls='live')

        client = webtest.TestApp(recorder_app)
        resp = client.get('/live/resource?url=http://example.com/', status=400)

        assert resp.json['error'] != ''

        self._test_all_warcs('/warcs/', 2)
Exemple #34
0
    def _test_all_warcs(self, dirname, num):
        """Assert on the WARC files found directly inside a directory.

        :param dirname: path appended to ``self.root_dir`` to locate the
            directory to inspect.
        :param num: expected number of regular files in that directory, or
            ``None`` to assert that the directory does not exist.
        :return: ``None`` when ``num`` is ``None``; otherwise a tuple
            ``(files, coll_dir)`` of the file names and the resolved directory.
        """
        coll_dir = to_path(self.root_dir + dirname)

        # The directory must exist exactly when a file count is expected
        assert os.path.isdir(coll_dir) == (num is not None)
        if num is None:
            return None

        # Only regular files count; sub-directories are ignored
        files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))]
        assert len(files) == num
        assert all(x.endswith('.warc.gz') for x in files)

        self._verify_content_len(coll_dir, files)
        return files, coll_dir
Exemple #35
0
    def _test_all_warcs(self, dirname, num):
        """Check the number and naming of WARC files in a directory.

        :param dirname: path suffix, relative to ``self.root_dir``, of the
            directory to check.
        :param num: number of regular files expected there; pass ``None`` to
            assert instead that the directory is absent.
        :return: ``(files, coll_dir)`` with the file names found and the
            resolved directory path, or ``None`` when ``num`` is ``None``.
        """
        coll_dir = to_path(self.root_dir + dirname)

        # Presence of the directory and presence of an expected count go together
        assert os.path.isdir(coll_dir) == (num is not None)
        if num is None:
            return None

        files = [
            name for name in os.listdir(coll_dir)
            if os.path.isfile(os.path.join(coll_dir, name))
        ]
        assert len(files) == num
        assert all(name.endswith('.warc.gz') for name in files)

        self._verify_content_len(coll_dir, files)
        return files, coll_dir
Exemple #36
0
    def test_record_multiple_writes_rollover_idle(self):
        """Check that closing idle files makes the writer start a new WARC.

        Two records are written to collection GOO and must share one file.
        After sleeping longer than ``max_idle_secs`` and calling
        ``close_idle_files()``, a third record is written and the directory
        must then hold two files. Finally the writer is closed and its file
        handle cache must be empty.
        """
        warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?boo=far', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        # Both records so far went to the same file
        self._test_all_warcs('/warcs/GOO/', 1)

        # Wait past max_idle_secs (0.9) so the open file counts as idle
        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?goo=bar', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"goo": "bar"' in resp.body

        # The third record must have gone to a second file
        self._test_all_warcs('/warcs/GOO/', 2)

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #37
0
    def test_record_file_warc_keep_open(self):
        """Record into one fixed WARC path and check the handle cache.

        After one write the target file must exist and ``writer.fh_cache``
        must hold one entry; after ``close()`` the cache must be empty.
        """
        warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
        writer = MultiFileWARCWriter(warc_file)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'"foo": "bar"' in body

        assert os.path.isfile(warc_file)
        assert len(writer.fh_cache) == 1

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #38
0
    def _get_dedup_index(self, dupe_policy=WriteRevisitDupePolicy(), user=True):
        """Build a ``WritableRedisIndexer`` for the tests' '/warcs/' directory.

        :param dupe_policy: policy object passed through to the indexer.
        :param user: when true, the Redis key templates include a ``{user}``
            component in front of ``{coll}``; otherwise only ``{coll}``.
        :return: the configured ``WritableRedisIndexer``.
        """
        if user:
            templates = ('{user}:{coll}:warc',
                         'redis://localhost/2/{user}:{coll}:cdxj')
        else:
            templates = ('{coll}:warc',
                         'redis://localhost/2/{coll}:cdxj')

        file_key_template, redis_url = templates

        return WritableRedisIndexer(
            redis_url=redis_url,
            file_key_template=file_key_template,
            rel_path_template=to_path(self.root_dir + '/warcs/'),
            dupe_policy=dupe_policy)
Exemple #39
0
    def _get_dedup_index(self, dupe_policy=WriteRevisitDupePolicy(), user=True):
        """Create the Redis-backed dedup index used by the recorder tests.

        :param dupe_policy: duplicate-handling policy handed to the indexer.
        :param user: selects key templates with (``True``) or without
            (``False``) a leading ``{user}`` component.
        :return: a ``WritableRedisIndexer`` whose relative path template is
            the '/warcs/' directory under ``self.root_dir``.
        """
        indexer_args = {
            'rel_path_template': to_path(self.root_dir + '/warcs/'),
            'dupe_policy': dupe_policy,
        }

        if user:
            indexer_args['file_key_template'] = '{user}:{coll}:warc'
            indexer_args['redis_url'] = 'redis://localhost/2/{user}:{coll}:cdxj'
        else:
            indexer_args['file_key_template'] = '{coll}:warc'
            indexer_args['redis_url'] = 'redis://localhost/2/{coll}:cdxj'

        dedup_index = WritableRedisIndexer(**indexer_args)
        return dedup_index
Exemple #40
0
    def test_record_file_warc_keep_open(self):
        """Verify a multi-file writer keeps its single target file open.

        One record is written to 'A.warc.gz'; the file must exist and the
        writer's ``fh_cache`` must have one entry until ``close()`` empties it.
        """
        path = to_path(self.root_dir + '/warcs/A.warc.gz')

        writer = MultiFileWARCWriter(path)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        payload = resp.body
        assert b'HTTP/1.1 200 OK' in payload
        assert b'"foo": "bar"' in payload

        # The file is on disk and its handle is still cached
        assert os.path.isfile(path)
        open_handles = len(writer.fh_cache)
        assert open_handles == 1

        writer.close()
        open_handles = len(writer.fh_cache)
        assert open_handles == 0
Exemple #41
0
    def setup_class(cls,
                    extra_config_file='test_no_invites_config.yaml',
                    init_anon=True,
                    **kwargs):
        """Prepare directories, environment variables and the test app.

        :param extra_config_file: name of a config file next to the tests; if
            truthy, its path is exported as ``WR_USER_CONFIG``.
        :param init_anon: when true, request an anonymous user from the app
            and store its name in ``cls.anon_user``; otherwise set it to None.
        :param kwargs: passed to ``cls.custom_init``; a truthy ``no_app``
            entry stops setup before the application is created.
        """
        super(BaseWRTests, cls).setup_class()

        cls.warcs_dir = to_path(cls.root_dir + '/warcs/')

        os.makedirs(cls.warcs_dir)
        os.environ['RECORD_ROOT'] = cls.warcs_dir

        os.environ['WR_CONFIG'] = 'pkg://webrecorder/config/wr.yaml'
        if extra_config_file:
            os.environ['WR_USER_CONFIG'] = os.path.join(
                cls.get_curr_dir(), extra_config_file)

        # NOTE(review): this URL is a scrubbed placeholder, not a usable Redis
        # address; restore the real value (and any settings that may have been
        # lost along with it) from the upstream source.
        os.environ['REDIS_BASE_URL'] = 'redis://*****:*****@localhost'
        # NOTE(review): the SMTP URL below also looks scrubbed -- confirm.
        cls.set_nx_env('EMAIL_SMTP_URL',
                       'smtp://[email protected]:test@localhost:25')

        cls.redis = FakeStrictRedis.from_url(os.environ['REDIS_BASE_URL'],
                                             decode_responses=True)

        cls.custom_init(kwargs)

        if kwargs.get('no_app'):
            return

        cls.appcont = AppController()
        cls.testapp = webtest.TestApp(cls.appcont.app)

        if init_anon:
            res = cls.testapp.get('/api/v1/anon_user')
            cls.anon_user = res.json['anon_user']
        else:
            cls.anon_user = None
Exemple #42
0
    def test_record_param_user_coll_same_dir(self):
        """Record for two collections of one user into a single directory.

        The same URL is requested once for COLL2 and once for COLL3 (both
        USER2); '/warcs2' must afterwards contain two WARC files.
        """
        warc_path = to_path(self.root_dir + '/warcs2/')
        dedup_index = self._get_dedup_index()

        warc_writer = PerRecordWARCWriter(warc_path,
                                          dedup_index=dedup_index,
                                          key_template='{user}:{coll}')
        recorder_app = RecorderApp(self.upstream_url, warc_writer)

        # First collection
        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/get?foo=bar',
            '&param.recorder.user=USER2&param.recorder.coll=COLL2')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second collection, same user and URL
        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/get?foo=bar',
            '&param.recorder.user=USER2&param.recorder.coll=COLL3')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs2', 2)
Exemple #43
0
    def test_record_param_user_coll_same_dir(self):
        """Write records for COLL2 and COLL3 of USER2 into '/warcs2/'.

        Each request must return a 200 with the echoed ``foo`` argument, and
        the shared directory must end up with two WARC files.
        """
        target_dir = to_path(self.root_dir + '/warcs2/')
        index = self._get_dedup_index()

        recorder_app = RecorderApp(
            self.upstream_url,
            PerRecordWARCWriter(target_dir, dedup_index=index, key_template='{user}:{coll}'))

        first = self._test_warc_write(recorder_app, 'httpbin.org',
                                      '/get?foo=bar',
                                      '&param.recorder.user=USER2&param.recorder.coll=COLL2')
        assert b'HTTP/1.1 200 OK' in first.body
        assert b'"foo": "bar"' in first.body

        second = self._test_warc_write(recorder_app, 'httpbin.org',
                                       '/get?foo=bar',
                                       '&param.recorder.user=USER2&param.recorder.coll=COLL3')
        assert b'HTTP/1.1 200 OK' in second.body
        assert b'"foo": "bar"' in second.body

        self._test_all_warcs('/warcs2', 2)
Exemple #44
0
    def test_record_cookies_header(self):
        """Check that cookie headers survive both the response and the WARC.

        Requests httpbin's cookie-setting URL, parses the returned body as a
        record and checks both ``Set-Cookie`` headers. The stored response and
        request loaded back from disk are then checked for the same
        ``Set-Cookie`` headers and for the ``X-Other`` / ``Cookie`` request
        headers.
        """
        base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
        warc_writer = PerRecordWARCWriter(base_path)
        recorder_app = RecorderApp(self.upstream_url, warc_writer, accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
        assert b'HTTP/1.1 302' in resp.body

        name_cookie = ('Set-Cookie', 'name=value; Path=/')
        foo_cookie = ('Set-Cookie', 'foo=bar; Path=/')

        # Headers as seen in the record returned to the client
        record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
        assert name_cookie in record.http_headers.headers
        assert foo_cookie in record.http_headers.headers

        # Headers as written to the WARC on disk
        stored_req, stored_resp = self._load_resp_req(base_path)

        assert name_cookie in stored_resp.http_headers.headers
        assert foo_cookie in stored_resp.http_headers.headers

        assert ('X-Other', 'foo') in stored_req.http_headers.headers
        assert ('Cookie', 'boo=far') in stored_req.http_headers.headers

        self._test_all_warcs('/warcs/cookiecheck/', 1)
Exemple #45
0
    def test_record_param_user_coll_skip(self):
        """Check that a skip dupe policy leaves the collection unchanged.

        '/warcs/USER/COLL/' must hold two files both before and after a
        request made with ``SkipDupePolicy``, and the USER:COLL cdxj key must
        still have two httpbin entries.
        """
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
        dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

        warc_writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, warc_writer)

        # File count before the request
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        expected_agent = '"user-agent": "{0}"'.format(UA)
        assert expected_agent in resp.text

        # No new entries written
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')
        cdx_entries = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(cdx_entries) == 2
Exemple #46
0
    def setup_class(cls):
        """Create three collection index directories and the directory loaders.

        Collections A, B and C each get an 'indexes' directory populated with
        one CDXJ fixture. A stray plain file is written into the root
        directory, and both a plain and a caching directory index source are
        created over the 'colls' directory.
        """
        super(TestDirAgg, cls).setup_class()

        index_dirs = [
            to_path(cls.root_dir + '/colls/A/indexes'),
            to_path(cls.root_dir + '/colls/B/indexes'),
            to_path(cls.root_dir + '/colls/C/indexes'),
        ]
        for index_dir in index_dirs:
            os.makedirs(index_dir)

        # One fixture per collection, in A, B, C order
        cdx_files = ('example2.cdxj', 'iana.cdxj', 'dupes.cdxj')
        for cdx_file, index_dir in zip(cdx_files, index_dirs):
            shutil.copy(to_path(TEST_CDX_PATH + cdx_file), index_dir)

        with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
            fh.write('foo')

        loader_args = (os.path.join(cls.root_dir, 'colls'), '{coll}/indexes', 'colls')
        cls.dir_loader = DirectoryIndexSource(*loader_args)
        cls.cache_dir_loader = CacheDirectoryIndexSource(*loader_args)
Exemple #47
0
    def test_timemap_all_coll(self):
        """Fetch the link-format timemap from '/all' and check its lines.

        The response must have four lines, and the last one must mention
        collection "test2".
        """
        timemap_url = '/all/timemap/link/http://httpbin.org/get?C=D'
        res = self.testapp.get(timemap_url)

        link_lines = res.text.rstrip().split('\n')
        line_count = len(link_lines)
        assert line_count == 4

        expected_coll = to_path('collection="test2"')
        assert expected_coll in link_lines[3]
Exemple #48
0
    def test_record_multiple_writes_keep_open(self):
        """Write several records to one kept-open WARC and cross-check its index.

        Two records are written for collection FOO and must land in a single
        file. The entries stored under 'FOO:cdxj' in the (fake) Redis index
        are compared with a CDXJ index generated directly from that file.
        The file is then closed via ``close_key``, a further record is written,
        and both the directory and the 'FOO:warc' hash must show two files.
        """
        warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?boo=far', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in the index
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        # Join instead of concatenating so the path does not depend on
        # coll_dir ending with a separator
        fullname = os.path.join(coll_dir, files[0])

        # Re-index the WARC directly, using the same relative filename
        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout, fh, filename,
                            cdxj=True, append_post=True, sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        # Writing after the close must start a second file
        self._test_warc_write(recorder_app, 'httpbin.org',
                              '/get?boo=far', '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #49
0
    def setup_class(cls, extra_config_file='test_no_invites_config.yaml',
                    init_anon=True,
                    **kwargs):
        """Prepare directories, environment, config override and the test app.

        :param extra_config_file: name of a config file next to the tests; if
            truthy, its path is exported as ``WR_USER_CONFIG``.
        :param init_anon: when true, create an anonymous user through the API,
            store its username in ``cls.anon_user`` and check its session;
            otherwise ``cls.anon_user`` is set to None.
        :param kwargs: passed to ``cls.custom_init``; a truthy ``no_app``
            entry stops setup before the application is created.
        """
        super(BaseWRTests, cls).setup_class()

        cls.warcs_dir = to_path(cls.root_dir + '/warcs/')
        cls.storage_dir = to_path(cls.root_dir + '/storage/')

        # Only the warcs directory is created here; storage_dir is just exported
        os.makedirs(cls.warcs_dir)
        os.environ['RECORD_ROOT'] = cls.warcs_dir
        os.environ['STORAGE_ROOT'] = cls.storage_dir

        cls.storage_today = os.path.join(cls.storage_dir, today_str())

        os.environ['WR_CONFIG'] = 'pkg://webrecorder/config/wr.yaml'
        if extra_config_file:
            os.environ['WR_USER_CONFIG'] = os.path.join(cls.get_curr_dir(), extra_config_file)

        # NOTE(review): this URL is a scrubbed placeholder, not a usable Redis
        # address; restore the real value (and any settings that may have been
        # lost along with it) from the upstream source.
        os.environ['REDIS_BASE_URL'] = 'redis://*****:*****@localhost'
        # NOTE(review): the SMTP URL below also looks scrubbed -- confirm.
        cls.set_nx_env('EMAIL_SMTP_URL', 'smtp://[email protected]:test@localhost:25')

        cls.set_nx_env('NO_REMOTE_BROWSERS', '1')

        def load_wr_config():
            """Load the overlay config and replace the dynamic key templates."""
            config = load_overlay_config('WR_CONFIG', 'pkg://webrecorder/config/wr.yaml', 'WR_USER_CONFIG', '')
            config['dyn_stats_key_templ'] = {
                 'rec': 'r:{rec}:<sesh_id>:stats:',
                 'coll': 'c:{coll}:<sesh_id>:stats:'
            }

            config['dyn_ref_templ'] = {
                 'rec': 'r:{rec}:<sesh_id>:ref:',
                 'coll': 'c:{coll}:<sesh_id>:ref:',
            }
            return config

        # Swap in the patched loader on the module so the controller created
        # below picks up the overridden templates
        import webrecorder.maincontroller
        webrecorder.maincontroller.load_wr_config = load_wr_config

        cls.redis = FakeStrictRedis.from_url(os.environ['REDIS_BASE_URL'], decode_responses=True)
        # NOTE(review): REDIS_SESSION_URL is read here but not set in this
        # method -- presumably provided by the environment; confirm.
        cls.sesh_redis = FakeStrictRedis.from_url(os.environ['REDIS_SESSION_URL'], decode_responses=True)

        cls.custom_init(kwargs)

        if kwargs.get('no_app'):
            return

        cls.maincont = MainController()
        cls.testapp = webtest.TestApp(cls.maincont.app)

        if init_anon:
            res = cls.testapp.post('/api/v1/auth/anon_user')
            cls.anon_user = res.json['user']['username']
            cls.assert_temp_user_sesh(cls.anon_user)
        else:
            cls.anon_user = None
Exemple #50
0
    def test_record_multiple_writes_keep_open(self):
        """Check a kept-open WARC against its index across several writes.

        Two records written for collection FOO must share one file, and the
        'FOO:cdxj' entries in the (fake) Redis index must equal a CDXJ index
        generated directly from that file. After ``close_key`` empties the
        handle cache, one more write must produce a second file, visible both
        on disk and in the 'FOO:warc' hash.
        """
        warc_path = to_path(self.root_dir +
                            '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar',
                                     '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far',
                                     '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in the index
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        # Build the path with os.path.join so a missing trailing separator
        # on coll_dir cannot corrupt it
        fullname = os.path.join(coll_dir, files[0])

        # Index the WARC file itself for comparison with the stored entries
        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout,
                            fh,
                            filename,
                            cdxj=True,
                            append_post=True,
                            sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        # A write after closing must open a new, second file
        self._test_warc_write(recorder_app, 'httpbin.org',
                              '/get?boo=far',
                              '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0