Example #1
0
    def test_record_warc_2(self):
        """Record a live httpbin GET through a per-record writer.

        Verifies the proxied response body and then checks ``/warcs/`` via
        ``_test_all_warcs`` with an expected value of 2.
        """
        warc_dir = to_path(self.root_dir + '/warcs/')
        writer = PerRecordWARCWriter(warc_dir)
        recorder_app = RecorderApp(self.upstream_url, writer,
                                   accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar')
        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'"foo": "bar"' in body

        self._test_all_warcs('/warcs/', 2)
Example #2
0
    def test_error_url(self):
        """Expect a 400 with a non-empty JSON ``error`` for an altered upstream URL.

        The recorder is pointed at ``self.upstream_url`` with ``'01'``
        appended; afterwards ``/warcs/`` is checked with an expected value of 2.
        """
        altered_upstream = self.upstream_url + '01'
        writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
        recorder_app = RecorderApp(altered_upstream, writer,
                                   accept_colls='live')

        client = webtest.TestApp(recorder_app)
        # status=400 makes webtest itself fail the test on any other status
        resp = client.get('/live/resource?url=http://example.com/', status=400)

        assert resp.json['error'] != ''

        self._test_all_warcs('/warcs/', 2)
Example #3
0
    def test_record_param_user_coll_revisit(self):
        """Record ``/user-agent`` for USER/COLL again and inspect the revisit entry.

        Checks the WARC directory before (1) and after (2) the write, then
        verifies the ``warc/revisit`` CDX entry in the fake Redis index, the
        ``USER:COLL:warc`` hash, and the WARC headers of the stored record.
        """
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
        dedup_index = self._get_dedup_index()
        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        coll_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/user-agent', coll_params)

        expected_ua = '"user-agent": "{0}"'.format(UA)
        assert expected_ua in resp.text

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        redis_client = FakeStrictRedis.from_url('redis://localhost/2')
        entries = redis_client.zrangebylex('USER:COLL:cdxj',
                                           '[org,httpbin)/', '(org,httpbin,')
        assert len(entries) == 2

        # Use the first entry if it mentions warc/revisit, otherwise the second
        revisit_line = entries[0] if b'warc/revisit' in entries[0] else entries[1]
        cdx = CDXObject(revisit_line)

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'

        warc_name = cdx['filename']
        assert warc_name.startswith(to_path('USER/COLL/'))
        assert warc_name.endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', warc_name)

        warcs = redis_client.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[warc_name.encode('utf-8')] == fullwarc.encode('utf-8')

        target_uri = 'http://httpbin.org/user-agent'
        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert warc_headers.get_header('WARC-Type') == 'revisit'
            assert warc_headers.get_header('WARC-Target-URI') == target_uri
            assert warc_headers.get_header('WARC-Date') != ''
            assert warc_headers.get_header('WARC-Refers-To-Target-URI') == target_uri
            assert warc_headers.get_header('WARC-Refers-To-Date') != ''
Example #4
0
    def test_record_file_warc_keep_open(self):
        """Write one record to a fixed WARC file and check the handle cache.

        After the write the file must exist and ``writer.fh_cache`` must hold
        one entry; after ``writer.close()`` the cache must be empty.
        """
        warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
        writer = MultiFileWARCWriter(warc_file)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar')
        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'"foo": "bar"' in body

        assert os.path.isfile(warc_file)
        assert len(writer.fh_cache) == 1

        writer.close()
        assert len(writer.fh_cache) == 0
Example #5
0
    def test_record_multiple_writes_rollover_idle(self):
        """Write two records, let the file go idle, then write a third.

        The writer is created with ``max_idle_secs=0.9``; after sleeping one
        second and calling ``close_idle_files()`` the third write is followed
        by a ``/warcs/GOO/`` check with an expected value of 2 (1 before).
        """
        warc_path = to_path(self.root_dir +
                            '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        # not referenced below; kept so the to_path call is preserved
        rel_path = to_path(self.root_dir + '/warcs/')

        writer = MultiFileWARCWriter(
            warc_path,
            dedup_index=self._get_dedup_index(user=False),
            max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        def record(path_and_query):
            # every write in this test targets the GOO collection
            return self._test_warc_write(recorder_app, 'httpbin.org',
                                         path_and_query,
                                         '&param.recorder.coll=GOO')

        # First Record
        first = record('/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in first.body
        assert b'"foo": "bar"' in first.body

        # Second Record
        second = record('/get?boo=far')
        assert b'HTTP/1.1 200 OK' in second.body
        assert b'"boo": "far"' in second.body

        self._test_all_warcs('/warcs/GOO/', 1)

        # sleep past max_idle_secs before asking the writer to close idle files
        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        third = record('/get?goo=bar')
        assert b'HTTP/1.1 200 OK' in third.body
        assert b'"goo": "bar"' in third.body

        self._test_all_warcs('/warcs/GOO/', 2)

        writer.close()
        assert len(writer.fh_cache) == 0
Example #6
0
    def test_record_param_user_coll_same_dir(self):
        """Record the same URL for two collections into one shared directory.

        The writer uses a fixed ``/warcs2/`` directory with
        ``key_template='{user}:{coll}'``; ``/warcs2`` is then checked with an
        expected value of 2.
        """
        warc_path = to_path(self.root_dir + '/warcs2/')
        dedup_index = self._get_dedup_index()
        writer = PerRecordWARCWriter(warc_path,
                                     dedup_index=dedup_index,
                                     key_template='{user}:{coll}')
        recorder_app = RecorderApp(self.upstream_url, writer)

        # same user and URL, two different collections
        for coll_params in ('&param.recorder.user=USER2&param.recorder.coll=COLL2',
                            '&param.recorder.user=USER2&param.recorder.coll=COLL3'):
            resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                         '/get?foo=bar', coll_params)
            assert b'HTTP/1.1 200 OK' in resp.body
            assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs2', 2)
Example #7
0
    def init_recorder(self):
        """Create the WARC writer and recorder app and store them on the instance.

        Sets ``self.dedup_index`` (result of ``init_indexer()``),
        ``self.writer`` and ``self.recorder``.
        """
        self.dedup_index = self.init_indexer()

        writer_kwargs = dict(
            dir_template=self.warc_path_templ,
            filename_template=self.warc_name_templ,
            dedup_index=self.dedup_index,
            redis=self.redis,
            skip_key_templ=self.skip_key_templ,
            key_template=self.info_keys['rec'],
            header_filter=ExcludeHttpOnlyCookieHeaders(),
        )
        warc_writer = SkipCheckingMultiFileWARCWriter(**writer_kwargs)
        self.writer = warc_writer

        self.recorder = RecorderApp(self.upstream_url,
                                    warc_writer,
                                    accept_colls='(live|mount:)',
                                    create_buff_func=self.create_buffer)
Example #8
0
    def test_record_video_metadata(self):
        """Record youtube-dl metadata for a video and verify the stored record.

        Skipped when ``youtube_dl`` is not importable. The request is recorded
        for user ``USER`` and collection ``VIDEO``, matching the
        ``USER:VIDEO:warc`` Redis hash read back below. The single WARC listed
        there must contain a ``metadata`` record with the youtube-dl content
        type and equal block and payload digests.
        """
        pytest.importorskip('youtube_dl')
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        params = {
            # must be 'USER' so the write lands under the 'USER:VIDEO:warc'
            # key queried below (the previous '******' value never matched it)
            'param.recorder.user': 'USER',
            'param.recorder.coll': 'VIDEO',
            'content_type': 'application/vnd.youtube-dl_formats+json'
        }

        # the response itself is not inspected; only the stored record is
        self._test_warc_write(
            recorder_app,
            'www.youtube.com',
            '/v/BfBgWtAIbRc',
            '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        # hash values are the WARC paths (full paths in the sibling tests)
        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header(
            'Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header(
            'WARC-Block-Digest') == status_headers.get_header(
                'WARC-Payload-Digest')
Example #9
0
    def test_record_param_user_coll(self):
        """Record ``/user-agent`` for USER/COLL and verify the index entries.

        Checks ``/warcs/USER/COLL/`` before (``None``) and after (1) the write,
        then the single CDX entry and the ``USER:COLL:warc`` hash in the fake
        Redis index.
        """
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
        dedup_index = self._get_dedup_index()
        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        self._test_all_warcs('/warcs/USER/COLL/', None)

        coll_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/user-agent', coll_params)

        expected_ua = '"user-agent": "{0}"'.format(UA)
        assert expected_ua in resp.text

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        redis_client = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis_client.zrangebylex('USER:COLL:cdxj',
                                           '[org,httpbin)/',
                                           '(org,httpbin,')
        assert len(entries) == 1

        cdx = CDXObject(entries[0])
        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'application/json'
        assert cdx['offset'] == '0'

        warc_name = cdx['filename']
        assert warc_name.startswith(to_path('USER/COLL/'))
        assert warc_name.endswith('.warc.gz')

        warcs = redis_client.hgetall('USER:COLL:warc')
        full_path = to_path(self.root_dir + '/warcs/' + warc_name)
        # the hash must map exactly this one filename to its full path
        expected_warcs = {warc_name.encode('utf-8'): full_path.encode('utf-8')}
        assert warcs == expected_warcs
Example #10
0
    def test_record_cookies_header(self):
        """Check cookie headers in both the returned and the stored records.

        Both ``Set-Cookie`` headers must be present in the record parsed from
        the response body and in the stored response; the stored request must
        carry the ``X-Other`` and ``Cookie`` headers.
        """
        base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
        writer = PerRecordWARCWriter(base_path)
        recorder_app = RecorderApp(self.upstream_url, writer,
                                   accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
        assert b'HTTP/1.1 302' in resp.body

        expected_cookies = (('Set-Cookie', 'name=value; Path=/'),
                            ('Set-Cookie', 'foo=bar; Path=/'))

        # record as returned to the client
        record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
        for header in expected_cookies:
            assert header in record.http_headers.headers

        # records as loaded back via _load_resp_req
        stored_req, stored_resp = self._load_resp_req(base_path)
        for header in expected_cookies:
            assert header in stored_resp.http_headers.headers

        request_headers = stored_req.http_headers.headers
        assert ('X-Other', 'foo') in request_headers
        assert ('Cookie', 'boo=far') in request_headers

        self._test_all_warcs('/warcs/cookiecheck/', 1)
Example #11
0
    def init_recorder(self):
        """Create the WARC writer and recorder app and store them on the instance.

        Sets ``self.dedup_index`` (result of ``init_indexer()``),
        ``self.writer`` and ``self.recorder``. The recorder uses the writer's
        own ``create_write_buffer`` as its buffer factory.
        """
        self.dedup_index = self.init_indexer()

        writer_kwargs = dict(
            dir_template=self.warc_path_templ,
            dedup_index=self.dedup_index,
            redis=self.redis,
            key_template=self.info_keys['rec'],
            header_filter=ExcludeHttpOnlyCookieHeaders(),
            config=self.config,
        )
        warc_writer = SkipCheckingMultiFileWARCWriter(**writer_kwargs)
        self.writer = warc_writer

        filters = [SkipRangeRequestFilter(), ExtractPatchingFilter()]

        # NOTE: accept_colls is intentionally not passed (was commented out)
        self.recorder = RecorderApp(
            self.upstream_url,
            warc_writer,
            skip_filters=filters,
            create_buff_func=warc_writer.create_write_buffer)
Example #12
0
    def test_record_param_user_coll_skip(self):
        """Re-record ``/user-agent`` with a skip policy and expect no change.

        The dedup index is built with ``SkipDupePolicy()``; the
        ``/warcs/USER/COLL/`` check uses 2 both before and after the write,
        and the Redis CDX range must still hold two entries.
        """
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
        dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())
        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # No new entries written
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        coll_params = '&param.recorder.user=USER&param.recorder.coll=COLL'
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/user-agent', coll_params)

        expected_ua = '"user-agent": "{0}"'.format(UA)
        assert expected_ua in resp.text
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        redis_client = FakeStrictRedis.from_url('redis://localhost/2')
        entries = redis_client.zrangebylex('USER:COLL:cdxj',
                                           '[org,httpbin)/', '(org,httpbin,')
        assert len(entries) == 2
Example #13
0
    def test_record_multiple_writes_keep_open(self):
        """Write two records to one open WARC, compare indexes, then reopen.

        After two writes the Redis CDX entries must equal a CDXJ index built
        directly from the WARC file. The cached handle is then closed via
        ``close_key``; a further write is followed by a ``/warcs/FOO/`` check
        with an expected value of 2 and two entries in ``FOO:warc``.
        """
        warc_path = to_path(self.root_dir +
                            '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
        rel_path = to_path(self.root_dir + '/warcs/')

        writer = MultiFileWARCWriter(
            warc_path, dedup_index=self._get_dedup_index(user=False))
        recorder_app = RecorderApp(self.upstream_url, writer)

        def record(path_and_query):
            # every write in this test targets the FOO collection
            return self._test_warc_write(recorder_app, 'httpbin.org',
                                         path_and_query,
                                         '&param.recorder.coll=FOO')

        # First Record
        first = record('/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in first.body
        assert b'"foo": "bar"' in first.body

        # Second Record
        second = record('/get?boo=far')
        assert b'HTTP/1.1 200 OK' in second.body
        assert b'"boo": "far"' in second.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        redis_client = FakeStrictRedis.from_url('redis://localhost/2')
        redis_lines = redis_client.zrangebylex('FOO:cdxj', '[org,httpbin)/',
                                               '(org,httpbin,')
        assert len(redis_lines) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]
        # index the file under its path relative to the warcs root
        filename = os.path.relpath(fullname, rel_path)

        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            write_cdx_index(cdxout,
                            fh,
                            filename,
                            cdxj=True,
                            append_post=True,
                            sort=True)

        from_redis = [CDXObject(line) for line in redis_lines]
        file_lines = cdxout.getvalue().strip().split(b'\n')
        from_file = [CDXObject(line) for line in file_lines]

        assert from_file == from_redis

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        # writing after close() is followed by a check with expected value 2
        record('/get?boo=far')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = redis_client.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0
Example #14
0
    def init_recorder(self, recorder_config):
        """Initialize the recording functionality of pywb.

        If ``recorder_config`` is falsy, ``self.recorder`` and
        ``self.recorder_path`` are set to None and nothing else is done.
        A string config is treated as the source collection name with all
        other options left at their defaults; a dict config must contain
        ``source_coll``.

        :param str|dict|None recorder_config: The configuration for the recorder app
        :raises KeyError: if a dict config has no ``source_coll`` entry
        :raises Exception: if ``dedup_policy`` is set to an unsupported value
        :rtype: None
        """
        if not recorder_config:
            self.recorder = None
            self.recorder_path = None
            return

        if isinstance(recorder_config, str):
            # shorthand form: the string is the collection, no extra options
            recorder_coll = recorder_config
            recorder_config = {}
        else:
            recorder_coll = recorder_config['source_coll']

        # cache mode
        self.rec_cache_mode = recorder_config.get('cache', 'default')

        dedup_policy = recorder_config.get('dedup_policy')
        dedup_by_url = False

        # 'none' is normalized to a falsy value so it behaves like "not set"
        if dedup_policy == 'none':
            dedup_policy = ''

        # replace the policy name with a policy object; any other truthy
        # value is rejected
        if dedup_policy == 'keep':
            dedup_policy = WriteDupePolicy()
        elif dedup_policy == 'revisit':
            dedup_policy = WriteRevisitDupePolicy()
        elif dedup_policy == 'skip':
            dedup_policy = SkipDupePolicy()
            dedup_by_url = True
        elif dedup_policy:
            msg = 'Invalid option for dedup_policy: {0}'
            raise Exception(msg.format(dedup_policy))

        # a dedup index is only created when a policy was selected
        if dedup_policy:
            dedup_index = WritableRedisIndexer(
                redis_url=self.warcserver.dedup_index_url,
                dupe_policy=dedup_policy,
                rel_path_template=self.warcserver.root_dir + '/{coll}/archive')
        else:
            dedup_index = None

        warc_writer = MultiFileWARCWriter(
            self.warcserver.archive_paths,
            max_size=int(recorder_config.get('rollover_size', 1000000000)),
            max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
            filename_template=recorder_config.get('filename_template'),
            dedup_index=dedup_index,
            dedup_by_url=dedup_by_url)

        if dedup_policy:
            # the pending-counter key is derived from the dedup index URL by
            # swapping ':cdxj' for ':pending'
            pending_counter = self.warcserver.dedup_index_url.replace(
                ':cdxj', ':pending')
            pending_timeout = recorder_config.get('pending_timeout', 30)
            create_buff_func = lambda params, name: RedisPendingCounterTempBuffer(
                512 * 1024, pending_counter, params, name, pending_timeout)
        else:
            create_buff_func = None

        self.recorder = RecorderApp(
            self.RECORD_SERVER % str(self.warcserver_server.port),
            warc_writer,
            accept_colls=recorder_config.get('source_filter'),
            create_buff_func=create_buff_func)

        # NOTE(review): port=0 presumably lets the server pick a free port,
        # which is then read back from recorder_server.port - confirm
        recorder_server = GeventServer(self.recorder, port=0)

        self.recorder_path = self.RECORD_API % (recorder_server.port,
                                                recorder_coll)

        # enable PUT of custom data as 'resource' records
        if recorder_config.get('enable_put_custom_record'):
            self.put_custom_record_path = self.recorder_path + '&put_record={rec_type}&url={url}'
Example #15
0
    def init_recorder(self, recorder_config):
        """Initialize the recording functionality of pywb.

        If ``recorder_config`` is falsy, ``self.recorder`` and
        ``self.recorder_path`` are set to None and nothing else is done.
        A string config is treated as the source collection name with all
        other options left at their defaults; a dict config must contain
        ``source_coll``.

        :param str|dict|None recorder_config: The configuration for the recorder app
        :raises KeyError: if a dict config has no ``source_coll`` entry
        :raises Exception: if ``dedup_policy`` is set to an unsupported value
        :rtype: None
        """
        if not recorder_config:
            self.recorder = None
            self.recorder_path = None
            return

        if isinstance(recorder_config, str):
            # shorthand form: the string is the collection, no extra options
            recorder_coll = recorder_config
            recorder_config = {}
        else:
            recorder_coll = recorder_config['source_coll']

        # cache mode
        self.rec_cache_mode = recorder_config.get('cache', 'default')

        dedup_policy = recorder_config.get('dedup_policy')
        dedup_by_url = False

        # 'none' is normalized to a falsy value so it behaves like "not set"
        if dedup_policy == 'none':
            dedup_policy = ''

        # replace the policy name with a policy object; any other truthy
        # value is rejected
        if dedup_policy == 'keep':
            dedup_policy = WriteDupePolicy()
        elif dedup_policy == 'revisit':
            dedup_policy = WriteRevisitDupePolicy()
        elif dedup_policy == 'skip':
            dedup_policy = SkipDupePolicy()
            dedup_by_url = True
        elif dedup_policy:
            msg = 'Invalid option for dedup_policy: {0}'
            raise Exception(msg.format(dedup_policy))

        # a dedup index is only created when a policy was selected
        if dedup_policy:
            dedup_index = WritableRedisIndexer(
                redis_url=self.warcserver.dedup_index_url,
                dupe_policy=dedup_policy,
                rel_path_template=self.warcserver.root_dir + '/{coll}/archive')
        else:
            dedup_index = None

        warc_writer = MultiFileWARCWriter(
            self.warcserver.archive_paths,
            max_size=int(recorder_config.get('rollover_size', 1000000000)),
            max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
            filename_template=recorder_config.get('filename_template'),
            dedup_index=dedup_index,
            dedup_by_url=dedup_by_url)

        self.recorder = RecorderApp(
            self.RECORD_SERVER % str(self.warcserver_server.port),
            warc_writer,
            accept_colls=recorder_config.get('source_filter'))

        # NOTE(review): port=0 presumably lets the server pick a free port,
        # which is then read back from recorder_server.port - confirm
        recorder_server = GeventServer(self.recorder, port=0)

        self.recorder_path = self.RECORD_API % (recorder_server.port,
                                                recorder_coll)