コード例 #1
0
ファイル: test_writer.py プロジェクト: ibnesayeed/warcprox
def test_warc_writer_locking(tmpdir):
    """Check that WarcWriter keeps its WARC file locked while it is open.

    With ``no_warc_open_suffix=True`` the writer is expected to lock the
    file, so a separate process running ``lock_file`` must report
    ``'FAILED TO OBTAIN LOCK'``. Once the writer has been closed, the same
    attempt must report ``'OBTAINED LOCK'``.
    """
    def try_lock_in_subprocess(path):
        """Run ``lock_file`` on *path* in a child process; return its verdict."""
        results = Queue()
        worker = Process(target=lock_file, args=(results, path))
        worker.start()
        try:
            # Drain the queue before joining the child, as the
            # multiprocessing guidelines recommend. The timeout turns a
            # crashed child into a test failure instead of a hang.
            return results.get(timeout=30)
        finally:
            worker.join()

    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(url='http://example.com',
                               content_type='text/plain',
                               status=200,
                               client_ip='127.0.0.2',
                               request_data=b'abc',
                               response_recorder=recorder,
                               remote_ip='127.0.0.3',
                               timestamp=datetime.utcnow())

    # NOTE(review): dirname() of the new sub-directory is its parent, so the
    # WARC is written next to 'test-warc-writer', not inside it -- confirm
    # this is intended.
    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(
        Options(directory=dirname, no_warc_open_suffix=True, writer_threads=1))
    try:
        wwriter.write_records(recorded_url)
        warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
        assert warcs
        target_warc = os.path.join(dirname, warcs[0])
        # another process must not be able to lock the WARC file
        assert try_lock_in_subprocess(target_warc) == 'FAILED TO OBTAIN LOCK'
    finally:
        # Always close the writer, even when an assertion above fails, so
        # the file is not left open by a failing test.
        wwriter.close_writer()

    # locking must succeed after writer has closed the WARC file.
    assert try_lock_in_subprocess(target_warc) == 'OBTAINED LOCK'
コード例 #2
0
ファイル: test_writer.py プロジェクト: ibnesayeed/warcprox
def test_warc_writer_filename(tmpdir):
    """Check that WarcWriter honours a custom ``warc_filename`` template.

    The template combines ``timestamp17``, ``prefix``, ``timestamp14`` and
    ``serialno``; the path of the first available WARC must match it, with
    the ``.warc.open`` suffix of a file still being written.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(url='http://example.com',
                               content_type='text/plain',
                               status=200,
                               client_ip='127.0.0.2',
                               request_data=b'abc',
                               response_recorder=recorder,
                               remote_ip='127.0.0.3',
                               timestamp=datetime.utcnow())

    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(
        Options(
            directory=dirname,
            prefix='foo',
            warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}',
            writer_threads=1))
    try:
        wwriter.write_records(recorded_url)
        # something must have been written to the target directory
        assert os.listdir(dirname)
        # Dots are escaped so only a literal ".warc.open" suffix matches;
        # unescaped, "." would accept any character in those positions.
        assert re.search(r'\d{17}_foo_\d{14}_00000\.warc\.open',
                         wwriter._available_warcs.queue[0].path)
    finally:
        # Release the WARC file even if an assertion above fails.
        wwriter.close_writer()
コード例 #3
0
ファイル: test_writer.py プロジェクト: Babibubebon/warcprox
def test_do_not_archive():
    """Check that a URL flagged ``do_not_archive`` produces no WARC records.

    Two RecordedUrls are queued on a WarcWriterProcessor running inside a
    temporary working directory: the first uses the defaults and must come
    out with ``warc_records`` set, the second is created with
    ``do_not_archive=True`` and must come out without any.
    """
    # Remember where we started so the process is not left sitting in a
    # directory that TemporaryDirectory removes on exit.
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The cwd is already unusable (e.g. deleted); nothing to restore.
        previous_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()
                # to be written -- default do_not_archive False
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(
                    RecordedUrl(url='http://example.com/yes',
                                content_type='text/plain',
                                status=200,
                                client_ip='127.0.0.2',
                                request_data=b'abc',
                                response_recorder=recorder,
                                remote_ip='127.0.0.3',
                                timestamp=datetime.utcnow(),
                                payload_digest=recorder.block_digest))
                # not to be written -- do_not_archive set True
                # NOTE(review): this record also sets warc-prefix '-'; if
                # that prefix alone suppresses writing, the assertion below
                # does not isolate do_not_archive -- confirm.
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(
                    RecordedUrl(url='http://example.com/no',
                                content_type='text/plain',
                                status=200,
                                client_ip='127.0.0.2',
                                request_data=b'abc',
                                response_recorder=recorder,
                                remote_ip='127.0.0.3',
                                timestamp=datetime.utcnow(),
                                payload_digest=recorder.block_digest,
                                warcprox_meta={'warc-prefix': '-'},
                                do_not_archive=True))
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temp dir before it is deleted.
            if previous_cwd is not None:
                os.chdir(previous_cwd)
コード例 #4
0
def test_close_for_prefix(tmpdir):
    """Check that ``close_for_prefix`` closes only the targeted WARC.

    Records are written alternately to the default prefix and to
    ``some-prefix``, with ``close_for_prefix`` called in between. After
    each write the file names in *tmpdir* are inspected: a closed WARC
    ends in ``.warc``, one still being written ends in ``.warc.open``.
    """
    wwp = warcprox.writerthread.WarcWriterProcessor(
        Options(directory=str(tmpdir)))
    wwp.inq = queue.Queue(maxsize=1)
    wwp.outq = queue.Queue(maxsize=1)

    def archive(url, **extra):
        """Queue one RecordedUrl for *url* and return it once processed."""
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(
            RecordedUrl(url=url,
                        content_type='text/plain',
                        status=200,
                        client_ip='127.0.0.2',
                        request_data=b'abc',
                        response_recorder=recorder,
                        remote_ip='127.0.0.3',
                        timestamp=datetime.utcnow(),
                        payload_digest=recorder.block_digest,
                        **extra))
        time.sleep(0.5)
        # wait for it to finish; the timeout keeps a dead writer thread
        # from hanging the test run forever
        return wwp.outq.get(timeout=10)

    def warc_names():
        """Return the sorted base names of the files currently in tmpdir."""
        return sorted(f.basename for f in tmpdir.listdir())

    try:
        wwp.start()

        # write a record to the default prefix
        rurl = archive('http://example.com/1')

        assert rurl.url == b'http://example.com/1'
        basenames = warc_names()
        assert len(basenames) == 1
        assert basenames[0].startswith('warcprox-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[0] == (
            wwp.writer_pool.default_warc_writer.finalname + '.open')

        # request close of default warc
        wwp.close_for_prefix()

        # write a record to some other prefix
        rurl = archive('http://example.com/2',
                       warcprox_meta={'warc-prefix': 'some-prefix'})

        assert rurl.url == b'http://example.com/2'
        basenames = warc_names()
        assert len(basenames) == 2
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')

        # request close of warc with prefix
        wwp.close_for_prefix('some-prefix')

        # write another record to the default prefix
        rurl = archive('http://example.com/3')

        assert rurl.url == b'http://example.com/3'
        # now some-prefix warc is closed and a new default prefix warc is open
        basenames = warc_names()
        assert len(basenames) == 3
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')
        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00001.warc.open')

        # write another record with prefix "some-prefix"
        rurl = archive('http://example.com/4',
                       warcprox_meta={'warc-prefix': 'some-prefix'})

        assert rurl.url == b'http://example.com/4'
        # new some-prefix warc will have a new random token and start over at
        # serial 00000
        basenames = warc_names()
        assert len(basenames) == 4
        assert basenames[0].startswith('some-prefix-')
        assert basenames[1].startswith('some-prefix-')
        # order of these two warcs depends on random token so we don't know
        # which is which; their last five characters must differ
        assert basenames[0][-5:] != basenames[1][-5:]
        assert '-00000.' in basenames[0]
        assert '-00000.' in basenames[1]

        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00000.warc')
        assert basenames[3].startswith('warcprox-')
        assert basenames[3].endswith('-00001.warc.open')

    finally:
        wwp.stop.set()
        wwp.join()
コード例 #5
0
def test_special_dont_write_prefix():
    """Check the special ``'-'`` WARC prefix and the ``blackout_period`` option.

    Part one runs a processor whose default prefix is ``'-'``: a URL using
    the default must not be written, while one overriding the prefix via
    ``warcprox_meta`` must be. Part two runs a processor with prefix
    ``'foo'`` and ``blackout_period=60``: the default-prefix URL is written,
    the one requesting prefix ``'-'`` is not, and of two duplicate URLs only
    the one whose dedup date lies outside the blackout period is written.
    """
    # Remember where we started so the process is not left sitting in a
    # directory that TemporaryDirectory removes on exit.
    try:
        original_cwd = os.getcwd()
    except OSError:
        # The cwd is already unusable (e.g. deleted); nothing to restore.
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(prefix='-'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # not to be written due to default prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(
                    RecordedUrl(url='http://example.com/no',
                                content_type='text/plain',
                                status=200,
                                client_ip='127.0.0.2',
                                request_data=b'abc',
                                response_recorder=recorder,
                                remote_ip='127.0.0.3',
                                timestamp=datetime.utcnow(),
                                payload_digest=recorder.block_digest))
                # to be written due to warcprox-meta prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(
                    RecordedUrl(
                        url='http://example.com/yes',
                        content_type='text/plain',
                        status=200,
                        client_ip='127.0.0.2',
                        request_data=b'abc',
                        response_recorder=recorder,
                        remote_ip='127.0.0.3',
                        timestamp=datetime.utcnow(),
                        payload_digest=recorder.block_digest,
                        warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(blackout_period=60, prefix='foo'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # to be written due to default prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(
                    RecordedUrl(url='http://example.com/yes',
                                content_type='text/plain',
                                status=200,
                                client_ip='127.0.0.2',
                                request_data=b'abc',
                                response_recorder=recorder,
                                remote_ip='127.0.0.3',
                                timestamp=datetime.utcnow(),
                                payload_digest=recorder.block_digest))
                # not to be written due to warcprox-meta prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(
                    RecordedUrl(url='http://example.com/no',
                                content_type='text/plain',
                                status=200,
                                client_ip='127.0.0.2',
                                request_data=b'abc',
                                response_recorder=recorder,
                                remote_ip='127.0.0.3',
                                timestamp=datetime.utcnow(),
                                payload_digest=recorder.block_digest,
                                warcprox_meta={'warc-prefix': '-'}))
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()

                # test blackout_period option. Write first revisit record
                # because it's outside the blackout_period (60). Do not
                # write the second because it's inside the blackout_period.
                recorder = ProxyingRecorder(io.BytesIO(b'test1'), None)
                recorder.read()
                old = datetime.utcnow() - timedelta(0, 3600)
                ru = RecordedUrl(url='http://example.com/dup',
                                 content_type='text/plain',
                                 status=200,
                                 client_ip='127.0.0.2',
                                 request_data=b'abc',
                                 response_recorder=recorder,
                                 remote_ip='127.0.0.3',
                                 timestamp=datetime.utcnow(),
                                 payload_digest=recorder.block_digest)
                ru.dedup_info = dict(
                    id=b'1',
                    url=b'http://example.com/dup',
                    date=old.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                # checked right away so the assertion sits next to the
                # result it is about
                assert recorded_url.warc_records

                recorder = ProxyingRecorder(io.BytesIO(b'test2'), None)
                recorder.read()
                recent = datetime.utcnow() - timedelta(0, 5)
                ru = RecordedUrl(url='http://example.com/dup',
                                 content_type='text/plain',
                                 status=200,
                                 client_ip='127.0.0.2',
                                 request_data=b'abc',
                                 response_recorder=recorder,
                                 remote_ip='127.0.0.3',
                                 timestamp=datetime.utcnow(),
                                 payload_digest=recorder.block_digest)
                ru.dedup_info = dict(
                    id=b'2',
                    url=b'http://example.com/dup',
                    date=recent.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()

            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temp dir before it is deleted.
            if original_cwd is not None:
                os.chdir(original_cwd)
コード例 #6
0
def test_special_dont_write_prefix():
    """Check the special ``'-'`` WARC prefix and the ``blackout_period`` option.

    First a processor with default prefix ``'-'`` is exercised: a URL using
    the default must not be written, one overriding the prefix through
    ``warcprox_meta`` must be. Then a processor with prefix ``'foo'`` and
    ``blackout_period=60`` is exercised: the default-prefix URL is written,
    the one asking for prefix ``'-'`` is not, and of two duplicate URLs only
    the one whose dedup date is outside the blackout period is written.
    """
    def recorded(url, payload=b'some payload', **extra):
        """Build a RecordedUrl for *url* whose recorder has read *payload*."""
        recorder = ProxyingRecorder(io.BytesIO(payload), None)
        recorder.read()
        return RecordedUrl(
            url=url, content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra)

    def duplicate(payload, dedup_id, seen_at):
        """Build the '/dup' RecordedUrl carrying dedup info dated *seen_at*."""
        ru = recorded('http://example.com/dup', payload)
        ru.dedup_info = dict(
            id=dedup_id, url=b'http://example.com/dup',
            date=seen_at.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))
        return ru

    # Remember where we started so the process is not left sitting in a
    # directory that TemporaryDirectory removes on exit.
    try:
        starting_cwd = os.getcwd()
    except OSError:
        # The cwd is already unusable (e.g. deleted); nothing to restore.
        starting_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(prefix='-'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # not to be written due to default prefix
                wwt.inq.put(recorded('http://example.com/no'))
                # to be written due to warcprox-meta prefix
                wwt.inq.put(recorded(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
                skipped = wwt.outq.get(timeout=10)
                assert not skipped.warc_records
                written = wwt.outq.get(timeout=10)
                assert written.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(blackout_period=60, prefix='foo'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # to be written due to default prefix
                wwt.inq.put(recorded('http://example.com/yes'))
                # not to be written due to warcprox-meta prefix
                wwt.inq.put(recorded(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))
                written = wwt.outq.get(timeout=10)
                assert written.warc_records
                skipped = wwt.outq.get(timeout=10)
                assert not skipped.warc_records
                assert wwt.outq.empty()

                # test blackout_period option. Write first revisit record
                # because it's outside the blackout_period (60). Do not
                # write the second because it's inside the blackout_period.
                old = datetime.utcnow() - timedelta(0, 3600)
                wwt.inq.put(duplicate(b'test1', b'1', old))
                outside_blackout = wwt.outq.get(timeout=10)
                # checked right away so the assertion sits next to the
                # result it is about
                assert outside_blackout.warc_records

                recent = datetime.utcnow() - timedelta(0, 5)
                wwt.inq.put(duplicate(b'test2', b'2', recent))
                inside_blackout = wwt.outq.get(timeout=10)
                assert not inside_blackout.warc_records
                assert wwt.outq.empty()

            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temp dir before it is deleted.
            if starting_cwd is not None:
                os.chdir(starting_cwd)
コード例 #7
0
ファイル: test_writer.py プロジェクト: d235j/warcprox
import os
import fcntl
from multiprocessing import Process, Queue
from datetime import datetime
import pytest
from warcprox.mitmproxy import ProxyingRecorder
from warcprox.warcproxy import RecordedUrl
from warcprox.writer import WarcWriter
from warcprox import Options

# Module-level fixtures: a recorder built without any underlying streams
# (both leading arguments are None) and a RecordedUrl that refers to it.
recorder = ProxyingRecorder(
    None, None, 'sha1', url='http://example.com')

recorded_url = RecordedUrl(
    url='http://example.com', content_type='text/plain', status=200,
    client_ip='127.0.0.2', request_data=b'abc',
    response_recorder=recorder, remote_ip='127.0.0.3',
    # evaluated once, when this module is imported
    timestamp=datetime.utcnow())


def lock_file(queue, filename):
    """Try to lock file and return 1 if successful, else return 0.
    It is necessary to run this method in a different process to test locking.
    """
    try:
        fi = open(filename, 'ab')
        fcntl.lockf(fi, fcntl.LOCK_EX | fcntl.LOCK_NB)
        fi.close()
        queue.put('OBTAINED LOCK')
    except IOError:
コード例 #8
0
ファイル: test_writer.py プロジェクト: leonirlopes/warcprox
def test_special_dont_write_prefix():
    """Check that the special WARC prefix ``'-'`` suppresses writing.

    A listener collects ``(recorded_url, records)`` pairs from a
    WarcWriterThread. With ``'-'`` as the default prefix, the URL using the
    default gets no records while the one overriding the prefix through
    ``warcprox_meta`` does; with default options the outcome is reversed
    for a URL that explicitly requests prefix ``'-'``.
    """
    class NotifyMe:
        """Listener that stores every notification it receives, in order."""
        def __init__(self):
            self.the_list = []
        def notify(self, recorded_url, records):
            self.the_list.append((recorded_url, records))

    # Remember where we started so the process is not left sitting in a
    # directory that TemporaryDirectory removes on exit.
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The cwd is already unusable (e.g. deleted); nothing to restore.
        previous_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, options=Options(prefix='-'),
                    listeners=[listener])
            try:
                wwt.start()
                # not to be written due to default prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                q.put(RecordedUrl(
                    url='http://example.com/no', content_type='text/plain',
                    status=200, client_ip='127.0.0.2', request_data=b'abc',
                    response_recorder=recorder, remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest))
                # to be written due to warcprox-meta prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                q.put(RecordedUrl(
                    url='http://example.com/yes', content_type='text/plain',
                    status=200, client_ip='127.0.0.2', request_data=b'abc',
                    response_recorder=recorder, remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest,
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
                # give the writer thread time to notify for both urls
                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert not listener.the_list[0][1]
                assert listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()

            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, listeners=[listener])
            try:
                wwt.start()
                # to be written due to default prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                q.put(RecordedUrl(
                    url='http://example.com/yes', content_type='text/plain',
                    status=200, client_ip='127.0.0.2', request_data=b'abc',
                    response_recorder=recorder, remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest))
                # not to be written due to warcprox-meta prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                q.put(RecordedUrl(
                    url='http://example.com/no', content_type='text/plain',
                    status=200, client_ip='127.0.0.2', request_data=b'abc',
                    response_recorder=recorder, remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest,
                    warcprox_meta={'warc-prefix': '-'}))
                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert listener.the_list[0][1]
                assert not listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temp dir before it is deleted.
            if previous_cwd is not None:
                os.chdir(previous_cwd)