Ejemplo n.º 1
0
    def test_record_custom_record(self):
        """PUT a custom ``resource`` record and verify the WARC that is written.

        Sends ``b'Some Data'`` together with a ``WARC-Custom`` header to the
        recorder, then re-reads the single WARC registered under the
        ``META:warc`` hash and checks its WARC headers, payload and HTTP
        headers.
        """
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain', 'WARC-Custom': 'foo'}

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        # Inspect the record while the file is still open: the record is
        # parsed from a reader wrapping ``fh``, so consuming its payload
        # after the ``with`` block would presumably only work when the data
        # had already been buffered.
        with open(warcs[warc_key], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(
                decomp, ensure_http_headers=True)

            # WARC-level headers of the stored record
            rec_headers = record.rec_headers
            assert len(rec_headers.headers) == 9
            assert rec_headers.get_header('WARC-Type') == 'resource'
            assert rec_headers.get_header(
                'WARC-Target-URI') == 'custom://httpbin.org'
            assert rec_headers.get_header('WARC-Record-ID') != ''
            assert rec_headers.get_header('WARC-Date') != ''
            assert rec_headers.get_header('WARC-Block-Digest') != ''
            assert rec_headers.get_header(
                'WARC-Block-Digest') == rec_headers.get_header(
                    'WARC-Payload-Digest')
            assert rec_headers.get_header('Content-Type') == 'text/plain'
            assert rec_headers.get_header('Content-Length') == str(len(buff))
            assert rec_headers.get_header('WARC-Custom') == 'foo'

            assert record.raw_stream.read() == buff

            # HTTP headers exposed by the loader (ensure_http_headers=True)
            http_headers = record.http_headers
            assert len(http_headers.headers) == 2

            assert http_headers.get_header('Content-Type') == 'text/plain'
            assert http_headers.get_header('Content-Length') == str(len(buff))

        writer.close()
        assert len(writer.fh_cache) == 0
Ejemplo n.º 2
0
    def test_record_custom_record(self):
        """PUT a custom ``resource`` record and verify the WARC that is written.

        Sends ``b'Some Data'`` together with a ``WARC-Custom`` header to the
        recorder, then re-reads the single WARC registered under the
        ``META:warc`` hash and checks its WARC headers, payload and HTTP headers.
        """
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain',
                   'WARC-Custom': 'foo'
                  }

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        # Inspect the record while the file is still open: the record is parsed
        # from a reader wrapping ``fh``, so consuming its payload after the
        # ``with`` block would presumably only work when the data had already
        # been buffered.
        with open(warcs[warc_key], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True)

            # WARC-level headers of the stored record
            rec_headers = record.rec_headers
            assert len(rec_headers.headers) == 9
            assert rec_headers.get_header('WARC-Type') == 'resource'
            assert rec_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
            assert rec_headers.get_header('WARC-Record-ID') != ''
            assert rec_headers.get_header('WARC-Date') != ''
            assert rec_headers.get_header('WARC-Block-Digest') != ''
            assert rec_headers.get_header('WARC-Block-Digest') == rec_headers.get_header('WARC-Payload-Digest')
            assert rec_headers.get_header('Content-Type') == 'text/plain'
            assert rec_headers.get_header('Content-Length') == str(len(buff))
            assert rec_headers.get_header('WARC-Custom') == 'foo'

            assert record.raw_stream.read() == buff

            # HTTP headers exposed by the loader (ensure_http_headers=True)
            http_headers = record.http_headers
            assert len(http_headers.headers) == 2

            assert http_headers.get_header('Content-Type') == 'text/plain'
            assert http_headers.get_header('Content-Length') == str(len(buff))

        writer.close()
        assert len(writer.fh_cache) == 0
Ejemplo n.º 3
0
    def test_record_file_warc_keep_open(self):
        """Record one response to a fixed WARC path and check the handle cache.

        After the write the target file must exist and the writer must hold
        exactly one cached file handle; ``close()`` must empty that cache.
        """
        warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
        warc_writer = MultiFileWARCWriter(warc_file)
        app = RecorderApp(self.upstream_url, warc_writer)

        response = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar')
        for expected in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
            assert expected in response.body

        # The file is written and its handle is kept open in the cache
        assert os.path.isfile(warc_file)
        assert len(warc_writer.fh_cache) == 1

        warc_writer.close()
        assert len(warc_writer.fh_cache) == 0
Ejemplo n.º 4
0
    def test_record_file_warc_keep_open(self):
        """Record one response to a fixed WARC path and check the handle cache.

        After the write the target file must exist and the writer must hold
        exactly one cached file handle; ``close()`` must empty that cache.
        """
        warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
        warc_writer = MultiFileWARCWriter(warc_file)
        app = RecorderApp(self.upstream_url, warc_writer)

        response = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar')
        for expected in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
            assert expected in response.body

        # The file is written and its handle is kept open in the cache
        assert os.path.isfile(warc_file)
        assert len(warc_writer.fh_cache) == 1

        warc_writer.close()
        assert len(warc_writer.fh_cache) == 0
Ejemplo n.º 5
0
def main():
    """Build a ``RecorderApp`` that records into a fresh temporary directory.

    A ``tmprec*`` directory is created and registered for removal at
    interpreter exit, the ``rec:cdxj`` and ``rec:warc`` keys are deleted from
    the Redis database at ``redis://localhost/2``, and the app is pointed at
    the upstream ``http://localhost:8080`` with a dedup index that uses
    ``SkipDupePolicy``.

    :return: the configured ``RecorderApp`` instance
    """
    upstream_url = 'http://localhost:8080'

    record_dir = tempfile.mkdtemp(prefix='tmprec') + '/'
    print('Recording to ' + record_dir)

    def _remove_record_dir():
        print('Removing ' + record_dir)
        shutil.rmtree(record_dir)

    atexit.register(_remove_record_dir)

    # Start from a clean index: drop keys left over from earlier runs
    local_redis = redis.StrictRedis.from_url('redis://localhost/2')
    for stale_key in ('rec:cdxj', 'rec:warc'):
        local_redis.delete(stale_key)

    indexer = WritableRedisIndexer(
        redis_url='redis://localhost/2/rec:cdxj',
        file_key_template='rec:warc',
        rel_path_template=record_dir,
        dupe_policy=SkipDupePolicy())

    warc_writer = MultiFileWARCWriter(record_dir, dedup_index=indexer)

    return RecorderApp(upstream_url, warc_writer, accept_colls='live')
Ejemplo n.º 6
0
    def init_recorder(self, recorder_config):
        """Set up the recorder app and its API path from ``recorder_config``.

        A falsy ``recorder_config`` disables recording: ``self.recorder`` and
        ``self.recorder_path`` are set to ``None`` and nothing else happens.
        A string is taken as the source collection name with every other
        setting left at its default; otherwise the collection name is read
        from the required ``'source_coll'`` key.
        """
        if not recorder_config:
            self.recorder = self.recorder_path = None
            return

        if isinstance(recorder_config, str):
            recorder_coll, recorder_config = recorder_config, {}
        else:
            recorder_coll = recorder_config['source_coll']

        archive_paths = self.warcserver.archive_paths
        rollover_size = int(recorder_config.get('rollover_size', 1000000000))
        rollover_idle = int(recorder_config.get('rollover_idle_secs', 600))
        name_template = recorder_config.get('filename_template')

        # TODO: support dedup
        warc_writer = MultiFileWARCWriter(archive_paths,
                                          max_size=rollover_size,
                                          max_idle_secs=rollover_idle,
                                          filename_template=name_template,
                                          dedup_index=None)

        upstream = self.RECORD_SERVER % str(self.warcserver_server.port)
        self.recorder = RecorderApp(
            upstream,
            warc_writer,
            accept_colls=recorder_config.get('source_filter'))

        # port=0 is passed to GeventServer; the port it reports is used below
        server = GeventServer(self.recorder, port=0)
        self.recorder_path = self.RECORD_API % (server.port, recorder_coll)
Ejemplo n.º 7
0
    def init_recorder(self, recorder_config):
        """Set up the recorder app and its API path from ``recorder_config``.

        A falsy ``recorder_config`` disables recording: ``self.recorder`` and
        ``self.recorder_path`` are set to ``None`` and nothing else happens.
        A string is taken as the source collection name with every other
        setting left at its default; otherwise the collection name is read
        from the required ``'source_coll'`` key.
        """
        if not recorder_config:
            self.recorder = self.recorder_path = None
            return

        if isinstance(recorder_config, str):
            recorder_coll, recorder_config = recorder_config, {}
        else:
            recorder_coll = recorder_config['source_coll']

        archive_paths = self.warcserver.archive_paths
        rollover_size = int(recorder_config.get('rollover_size', 1000000000))
        rollover_idle = int(recorder_config.get('rollover_idle_secs', 600))
        name_template = recorder_config.get('filename_template')

        # TODO: support dedup
        warc_writer = MultiFileWARCWriter(archive_paths,
                                          max_size=rollover_size,
                                          max_idle_secs=rollover_idle,
                                          filename_template=name_template,
                                          dedup_index=None)

        upstream = self.RECORD_SERVER % str(self.warcserver_server.port)
        self.recorder = RecorderApp(upstream, warc_writer)

        # port=0 is passed to GeventServer; the port it reports is used below
        server = GeventServer(self.recorder, port=0)
        self.recorder_path = self.RECORD_API % (server.port, recorder_coll)
Ejemplo n.º 8
0
    def test_record_multiple_writes_rollover_idle(self):
        """Check that closing idle files makes the next write use a new WARC.

        Two records are written back to back for collection ``GOO`` and
        ``_test_all_warcs`` is asked to confirm one WARC. After sleeping past
        ``max_idle_secs`` and calling ``close_idle_files()``, a third write is
        made and two WARCs are expected.
        """
        warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        def record_and_check(path, expected_json):
            # Record one httpbin.org request into GOO and verify the response
            resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                         path, '&param.recorder.coll=GOO')

            assert b'HTTP/1.1 200 OK' in resp.body
            assert expected_json in resp.body

        # First Record
        record_and_check('/get?foo=bar', b'"foo": "bar"')

        # Second Record
        record_and_check('/get?boo=far', b'"boo": "far"')

        self._test_all_warcs('/warcs/GOO/', 1)

        # Sleep longer than max_idle_secs (0.9) before closing idle files
        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        record_and_check('/get?goo=bar', b'"goo": "bar"')

        self._test_all_warcs('/warcs/GOO/', 2)

        writer.close()
        assert len(writer.fh_cache) == 0
Ejemplo n.º 9
0
    def test_record_multiple_writes_rollover_idle(self):
        """Check that closing idle files makes the next write use a new WARC.

        Two records are written back to back for collection ``GOO`` and
        ``_test_all_warcs`` is asked to confirm one WARC. After sleeping past
        ``max_idle_secs`` and calling ``close_idle_files()``, a third write is
        made and two WARCs are expected.
        """
        warc_path = to_path(self.root_dir +
                            '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path,
                                     dedup_index=dedup_index,
                                     max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        def record_and_check(path, expected_json):
            # Record one httpbin.org request into GOO and verify the response
            resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                         path,
                                         '&param.recorder.coll=GOO')

            assert b'HTTP/1.1 200 OK' in resp.body
            assert expected_json in resp.body

        # First Record
        record_and_check('/get?foo=bar', b'"foo": "bar"')

        # Second Record
        record_and_check('/get?boo=far', b'"boo": "far"')

        self._test_all_warcs('/warcs/GOO/', 1)

        # Sleep longer than max_idle_secs (0.9) before closing idle files
        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        record_and_check('/get?goo=bar', b'"goo": "bar"')

        self._test_all_warcs('/warcs/GOO/', 2)

        writer.close()
        assert len(writer.fh_cache) == 0
Ejemplo n.º 10
0
    def test_record_multiple_writes_keep_open(self):
        """Write two records to one open WARC, then write again after closing.

        Compares the ``FOO:cdxj`` entries in Redis with a CDXJ index built
        directly from the WARC file, checks that ``close_key`` empties the
        handle cache, and checks that one more write after closing leaves two
        entries in the ``FOO:warc`` hash.
        """
        coll_param = '&param.recorder.coll=FOO'

        warc_template = to_path(
            self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
        warcs_root = to_path(self.root_dir + '/warcs/')

        index = self._get_dedup_index(user=False)

        warc_writer = MultiFileWARCWriter(warc_template, dedup_index=index)
        app = RecorderApp(self.upstream_url, warc_writer)

        # First Record
        first = self._test_warc_write(app, 'httpbin.org',
                                      '/get?foo=bar', coll_param)
        assert b'HTTP/1.1 200 OK' in first.body
        assert b'"foo": "bar"' in first.body

        # Second Record
        second = self._test_warc_write(app, 'httpbin.org',
                                       '/get?boo=far', coll_param)
        assert b'HTTP/1.1 200 OK' in second.body
        assert b'"boo": "far"' in second.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        redis_client = FakeStrictRedis.from_url('redis://localhost/2')
        indexed = redis_client.zrangebylex(
            'FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(indexed) == 2

        warc_names, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        warc_fullname = coll_dir + warc_names[0]

        # Build a CDXJ index straight from the WARC for comparison
        cdx_buffer = BytesIO()
        with open(warc_fullname, 'rb') as warc_fh:
            relative_name = os.path.relpath(warc_fullname, warcs_root)
            write_cdx_index(cdx_buffer, warc_fh, relative_name,
                            cdxj=True, append_post=True, sort=True)

        from_redis = [CDXObject(line) for line in indexed]
        from_warc = [CDXObject(line)
                     for line in cdx_buffer.getvalue().strip().split(b'\n')]

        assert from_warc == from_redis

        assert len(warc_writer.fh_cache) == 1

        warc_writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(warc_writer.fh_cache) == 0

        warc_writer.close()

        # One more write after everything has been closed
        self._test_warc_write(app, 'httpbin.org',
                              '/get?boo=far', coll_param)

        self._test_all_warcs('/warcs/FOO/', 2)

        assert len(redis_client.hgetall('FOO:warc')) == 2

        warc_writer.close()
        assert len(warc_writer.fh_cache) == 0
Ejemplo n.º 11
0
    def init_recorder(self, recorder_config):
        """Configure the recorder app, or disable recording when no config is given.

        A falsy ``recorder_config`` sets ``self.recorder`` and
        ``self.recorder_path`` to ``None`` and returns immediately. A string
        is used as the source collection name with default settings; a dict
        must provide ``'source_coll'``.

        :param str|dict|None recorder_config: The configuration for the recorder app
        :raises Exception: if ``dedup_policy`` is not one of
            ``none``, ``keep``, ``revisit`` or ``skip`` (or falsy)
        :rtype: None
        """
        if not recorder_config:
            self.recorder = self.recorder_path = None
            return

        if isinstance(recorder_config, str):
            recorder_coll, recorder_config = recorder_config, {}
        else:
            recorder_coll = recorder_config['source_coll']

        # cache mode
        self.rec_cache_mode = recorder_config.get('cache', 'default')

        policy_name = recorder_config.get('dedup_policy')
        dedup_by_url = False

        # A missing/falsy value and the literal 'none' both disable dedup
        if not policy_name or policy_name == 'none':
            dupe_policy = None
        elif policy_name == 'keep':
            dupe_policy = WriteDupePolicy()
        elif policy_name == 'revisit':
            dupe_policy = WriteRevisitDupePolicy()
        elif policy_name == 'skip':
            dupe_policy = SkipDupePolicy()
            dedup_by_url = True
        else:
            raise Exception(
                'Invalid option for dedup_policy: {0}'.format(policy_name))

        dedup_index = None
        if dupe_policy:
            dedup_index = WritableRedisIndexer(
                redis_url=self.warcserver.dedup_index_url,
                dupe_policy=dupe_policy,
                rel_path_template=self.warcserver.root_dir + '/{coll}/archive')

        archive_paths = self.warcserver.archive_paths
        rollover_size = int(recorder_config.get('rollover_size', 1000000000))
        rollover_idle = int(recorder_config.get('rollover_idle_secs', 600))
        name_template = recorder_config.get('filename_template')

        warc_writer = MultiFileWARCWriter(archive_paths,
                                          max_size=rollover_size,
                                          max_idle_secs=rollover_idle,
                                          filename_template=name_template,
                                          dedup_index=dedup_index,
                                          dedup_by_url=dedup_by_url)

        if dupe_policy:
            # The pending counter key is derived from the dedup index URL
            pending_counter = self.warcserver.dedup_index_url.replace(
                ':cdxj', ':pending')
            pending_timeout = recorder_config.get('pending_timeout', 30)

            def create_buff_func(params, name):
                return RedisPendingCounterTempBuffer(
                    512 * 1024, pending_counter, params, name, pending_timeout)
        else:
            create_buff_func = None

        upstream = self.RECORD_SERVER % str(self.warcserver_server.port)
        self.recorder = RecorderApp(
            upstream,
            warc_writer,
            accept_colls=recorder_config.get('source_filter'),
            create_buff_func=create_buff_func)

        # port=0 is passed to GeventServer; the port it reports is used below
        server = GeventServer(self.recorder, port=0)
        self.recorder_path = self.RECORD_API % (server.port, recorder_coll)

        # enable PUT of custom data as 'resource' records
        if recorder_config.get('enable_put_custom_record'):
            self.put_custom_record_path = (
                self.recorder_path + '&put_record={rec_type}&url={url}')
Ejemplo n.º 12
0
    def init_recorder(self, recorder_config):
        """Configure the recorder app, or disable recording when no config is given.

        A falsy ``recorder_config`` sets ``self.recorder`` and
        ``self.recorder_path`` to ``None`` and returns immediately. A string
        is used as the source collection name with default settings; a dict
        must provide ``'source_coll'``.

        :param str|dict|None recorder_config: The configuration for the recorder app
        :raises Exception: if ``dedup_policy`` is not one of
            ``none``, ``keep``, ``revisit`` or ``skip`` (or falsy)
        :rtype: None
        """
        if not recorder_config:
            self.recorder = self.recorder_path = None
            return

        if isinstance(recorder_config, str):
            recorder_coll, recorder_config = recorder_config, {}
        else:
            recorder_coll = recorder_config['source_coll']

        # cache mode
        self.rec_cache_mode = recorder_config.get('cache', 'default')

        policy_name = recorder_config.get('dedup_policy')
        dedup_by_url = False

        # A missing/falsy value and the literal 'none' both disable dedup
        if not policy_name or policy_name == 'none':
            dupe_policy = None
        elif policy_name == 'keep':
            dupe_policy = WriteDupePolicy()
        elif policy_name == 'revisit':
            dupe_policy = WriteRevisitDupePolicy()
        elif policy_name == 'skip':
            dupe_policy = SkipDupePolicy()
            dedup_by_url = True
        else:
            raise Exception(
                'Invalid option for dedup_policy: {0}'.format(policy_name))

        dedup_index = None
        if dupe_policy:
            dedup_index = WritableRedisIndexer(
                redis_url=self.warcserver.dedup_index_url,
                dupe_policy=dupe_policy,
                rel_path_template=self.warcserver.root_dir + '/{coll}/archive')

        archive_paths = self.warcserver.archive_paths
        rollover_size = int(recorder_config.get('rollover_size', 1000000000))
        rollover_idle = int(recorder_config.get('rollover_idle_secs', 600))
        name_template = recorder_config.get('filename_template')

        warc_writer = MultiFileWARCWriter(archive_paths,
                                          max_size=rollover_size,
                                          max_idle_secs=rollover_idle,
                                          filename_template=name_template,
                                          dedup_index=dedup_index,
                                          dedup_by_url=dedup_by_url)

        upstream = self.RECORD_SERVER % str(self.warcserver_server.port)
        self.recorder = RecorderApp(
            upstream,
            warc_writer,
            accept_colls=recorder_config.get('source_filter'))

        # port=0 is passed to GeventServer; the port it reports is used below
        server = GeventServer(self.recorder, port=0)
        self.recorder_path = self.RECORD_API % (server.port, recorder_coll)
 def close_file(actual_self, filename):
     """Delegate to ``MultiFileWARCWriter.close_file`` and flag the result.

     After delegating, asserts that ``iter_open_files()`` yields nothing and
     sets the module-level ``all_closed`` flag to ``True``.
     """
     global all_closed
     MultiFileWARCWriter.close_file(actual_self, filename)
     still_open = list(actual_self.iter_open_files())
     assert still_open == []
     all_closed = True
Ejemplo n.º 14
0
    def test_record_multiple_writes_keep_open(self):
        """Write two records to one open WARC, then write again after closing.

        Compares the ``FOO:cdxj`` entries in Redis with a CDXJ index built
        directly from the WARC file, checks that ``close_key`` empties the
        handle cache, and checks that one more write after closing leaves two
        entries in the ``FOO:warc`` hash.
        """
        coll_param = '&param.recorder.coll=FOO'

        warc_template = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
        warcs_root = to_path(self.root_dir + '/warcs/')

        index = self._get_dedup_index(user=False)

        warc_writer = MultiFileWARCWriter(warc_template, dedup_index=index)
        app = RecorderApp(self.upstream_url, warc_writer)

        # First Record
        first = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar', coll_param)
        assert b'HTTP/1.1 200 OK' in first.body
        assert b'"foo": "bar"' in first.body

        # Second Record
        second = self._test_warc_write(app, 'httpbin.org', '/get?boo=far', coll_param)
        assert b'HTTP/1.1 200 OK' in second.body
        assert b'"boo": "far"' in second.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        redis_client = FakeStrictRedis.from_url('redis://localhost/2')
        indexed = redis_client.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(indexed) == 2

        warc_names, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        warc_fullname = coll_dir + warc_names[0]

        # Build a CDXJ index straight from the WARC for comparison
        cdx_buffer = BytesIO()
        with open(warc_fullname, 'rb') as warc_fh:
            relative_name = os.path.relpath(warc_fullname, warcs_root)
            write_cdx_index(cdx_buffer, warc_fh, relative_name,
                            cdxj=True, append_post=True, sort=True)

        from_redis = [CDXObject(line) for line in indexed]
        from_warc = [CDXObject(line) for line in cdx_buffer.getvalue().strip().split(b'\n')]

        assert from_warc == from_redis

        assert len(warc_writer.fh_cache) == 1

        warc_writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(warc_writer.fh_cache) == 0

        warc_writer.close()

        # One more write after everything has been closed
        self._test_warc_write(app, 'httpbin.org', '/get?boo=far', coll_param)

        self._test_all_warcs('/warcs/FOO/', 2)

        assert len(redis_client.hgetall('FOO:warc')) == 2

        warc_writer.close()
        assert len(warc_writer.fh_cache) == 0