Ejemplo n.º 1
0
def check_helper(args, capsys, expected_exit_value):
    """Run the CLI ``main`` with ``args`` and return the captured stdout.

    :param args: argument list passed to ``main`` as ``args``.
    :param capsys: pytest ``capsys`` fixture used to read the captured output.
    :param expected_exit_value: the ``SystemExit.code`` that ``main`` is
        expected to exit with, or ``None`` if it should return normally.
    :returns: everything written to stdout during the call.
    :raises AssertionError: if the observed exit code differs from
        ``expected_exit_value``.
    """
    try:
        main(args=args)
    except SystemExit as exc:
        exit_value = exc.code
    else:
        # main() returned without calling sys.exit().
        exit_value = None

    # Checked after the try statement rather than in a ``finally`` clause, so
    # an unexpected exception raised by main() propagates as-is instead of
    # being overshadowed by an AssertionError about the exit code.
    assert exit_value == expected_exit_value

    return capsys.readouterr().out
Ejemplo n.º 2
0
    def check_helper(self, args, expected_exit_value, capsys):
        """Run the CLI ``main`` with ``args`` and return the captured stdout.

        :param args: argument list passed to ``main`` as ``args``.
        :param expected_exit_value: the ``SystemExit.code`` that ``main`` is
            expected to exit with, or ``None`` if it should return normally.
        :param capsys: pytest ``capsys`` fixture used to read captured output.
        :returns: the first element of ``capsys.readouterr()`` (stdout).
        :raises AssertionError: if the observed exit code differs from
            ``expected_exit_value``.
        """
        try:
            main(args=args)
        except SystemExit as exc:
            exit_value = exc.code
        else:
            # main() returned without calling sys.exit().
            exit_value = None

        # Checked after the try statement rather than in a ``finally`` clause,
        # so an unexpected exception raised by main() propagates as-is instead
        # of being overshadowed by an AssertionError about the exit code.
        assert exit_value == expected_exit_value

        return capsys.readouterr()[0]  # index access (not .out) for py33 support
Ejemplo n.º 3
0
def test_recompress_arc2warc_verbose(capsys):
    """Verbose recompress of an ARC file prints index lines and a summary."""
    expected_fragments = (
        '{"offset": "0", "warc-type": "warcinfo"}',
        '"warc-target-uri": "http://example.com/"',
        'No Errors Found!',
        '2 records read',
    )

    with named_temp() as temp:
        source_path = get_test_file('example.arc.gz')

        # Recompress in verbose mode, writing into the temporary file.
        main(args=['recompress', '-v', source_path, temp.name])

        captured = capsys.readouterr().out
        for fragment in expected_fragments:
            assert fragment in captured
Ejemplo n.º 4
0
def test_index():
    """Index the WARC and ARC fixtures and compare the emitted JSON lines."""
    fixture_names = (
        'example.warc.gz', 'example.warc', 'example.arc.gz', 'example.arc'
    )

    args = ['index', '-f', 'warc-type,warc-target-uri,warc-filename']
    args += [get_test_file(name) for name in fixture_names]

    # Lines expected for each of the two WARC fixtures.
    warc_records = """\
{"warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"warc-type": "response", "warc-target-uri": "http://example.com/"}
{"warc-type": "request", "warc-target-uri": "http://example.com/"}
{"warc-type": "revisit", "warc-target-uri": "http://example.com/"}
{"warc-type": "request", "warc-target-uri": "http://example.com/"}
"""

    # Lines expected for each of the two ARC fixtures.
    arc_records = """\
{"warc-type": "warcinfo", "warc-filename": "live-web-example.arc.gz"}
{"warc-type": "response", "warc-target-uri": "http://example.com/"}
"""

    # The gzipped and plain variant of a fixture yield identical lines, so
    # each group appears twice in a row.
    expected = warc_records * 2 + arc_records * 2

    with patch_stdout() as buff:
        main(args=args)
        assert buff.getvalue().decode('utf-8') == expected
Ejemplo n.º 5
0
def test_extract_warc_response(capsysbinary):
    """Extract the record addressed by '784' from example.warc.gz.

    Three invocations are checked against the raw bytes written to stdout:
    the plain ``extract`` (WARC + HTTP headers followed by the still
    gzip-encoded body), ``--headers`` (headers only) and ``--payload``
    (the HTML document).
    """
    # WARC record headers followed by the HTTP response headers.
    expected_headers = (
        b'WARC/1.0\r\n'
        b'WARC-Target-URI: http://example.com/\r\n'
        b'WARC-Date: 2017-03-06T04:02:06Z\r\n'
        b'WARC-Type: response\r\n'
        b'WARC-Record-ID: <urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>\r\n'
        b'WARC-IP-Address: 93.184.216.34\r\n'
        b'WARC-Block-Digest: sha1:DR5MBP7OD3OPA7RFKWJUD4CTNUQUGFC5\r\n'
        b'WARC-Payload-Digest: sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK\r\n'
        b'Content-Type: application/http; msgtype=response\r\n'
        b'Content-Length: 975\r\n'
        b'\r\n'
        b'HTTP/1.1 200 OK\r\n'
        b'Content-Encoding: gzip\r\n'
        b'Accept-Ranges: bytes\r\n'
        b'Cache-Control: max-age=604800\r\n'
        b'Content-Type: text/html\r\n'
        b'Date: Mon, 06 Mar 2017 04:02:06 GMT\r\n'
        b'Etag: "359670651+gzip"\r\n'
        b'Expires: Mon, 13 Mar 2017 04:02:06 GMT\r\n'
        b'Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT\r\n'
        b'Server: ECS (iad/182A)\r\n'
        b'Vary: Accept-Encoding\r\n'
        b'X-Cache: HIT\r\n'
        b'Content-Length: 606\r\n'
        b'Connection: close\r\n'
        b'\r\n'
    )

    # Raw gzip-encoded HTTP body: 606 bytes, matching the inner
    # Content-Length header above. Kept as one literal expression so that no
    # escape sequence is ever split across physical lines.
    expected_gzip_body = (
        b'\x1f\x8b\x08\x00;\x81\x05R\x00\x03\x8dTA\xaf\xd30\x0c\xbe\xefW\x98r\x01i]\xf7\x80\x07S\xd7V @\xe2\x02\x1c\xe0\xc21k\xdc\xd5Z\x93\x94$\xed6\xa1\xf7'
        b'\xdfq\xdb\xbd\xae\xe5\xed@+\xb5\x8e\x1d\x7f\xfel\xc7I\x9eI\x93\xfbs\x8dPzUe\x8b\xe4\xf1\x87Bf\x0b\xe0\'\xf1\xe4+\xcc>\x9f\x84\xaa+\x84OF\t\xd2'
        b'I4h\x17\xc3\x16\x85^@^\n\xeb\xd0\xa7A\xe3\x8bp\x13@\x94M\x8c\xa5\xf7u\x88\xbf\x1bj\xd3\xe0\xa3\xd1\x1e\xb5\x0f\xbb\xb0\x01\xe4\xc3*\r<\x9e|\xd4\x85\xdf'
        b'\x8eP\xb7\x90\xb4P\x98\x06-\xe1\xb16\xd6O\xfc\x8f$}\x99Jl)\xc7\xb0_,\x814y\x12U\xe8rQazw\x85r\xfe\xcc\xc9t\x0c.\x81s\xe7\x82\xc1'
        b'\xb63\xf2\x0c\x7fz\xb1_\x8a\xfc\xb0\xb7\xa6\xd12\xccMel\x0c\xcf\x8b5\xbf\xaf\xb6\xe3\x16%\xec\x9et\x0c\xeb\xab\xaa\x16R\x92\xde\xcft\x053\r\x0b\xa1\xa8:\xc7'
        b'\x10|\xafQ\xc3\x0f\xa1]\xb0\x84\xe0\x0bV-z\xca\x05|\xc3\x06Y3*\x96\xf0\xc1r\x06Kp\xbc5th\xa9\xb8"\xf6\xc2C\xff\x95\xd4NH\xf7\xe9\xc7\xf0v'
        b'\xbd\xaeOOy\xde\xa3\x02\xd1xs\x83\xee\xfd\xcc\xe1V\xee\xc5$\xfe\xceX\x896\xb4BR\xe3b\xb8C\xb5\x9dP\x12qE\xfa\xb0\xe4\x7fK\x8e<\xca\t\xc1G\xb8'
        b'\xd7\x9b7\x9b\xcd\x04\xb1\xebE(17Vx2\xccU\x1b\x8dS\xd0\xf7\n%\tx\xa1\xc4)\xbcd\xf9\xae\xcb\xf2\xe5\xb4e\xf3\x0e\xfeO&\x0f\xa34/\xe4\xa4\x98'
        b'\xf3\x8a\xcd\xfa~\xc3\xf6Oi\xd6s\xebX\xef\xb1dW\x12\xc37\x89\xfa#\x9au\xf2"\x89\x86y\\$]j<\x9eL\xf2r\x90\xcb\xbb\'\xa3\xc9\xaa\xc1Vg?'
        b'Kr {=\xb0\x84\xce\x8b]E\xae\xe4^x\x03;\x84\xc6\xb1X\x18\x0bTU\x8d\xf3]\xd5[\x04\x1c\x10\x1d\xcf\x0f{\xe7\x8d\xe2\x01s+\xf8e\x1a\xce\xf9\xdc9'
        b'\x81g\xe4\xe1\xe0]\xd0\xf5\xd5\xebH\xbe4\x8d\x87\xda\x12#\xe7\x86KA\xba\xef\'\xf0Z\xb8\x03\xa7\xde\x07\xad\xd1*r\x8e\r\xab$\xaaG\xd6\t\xdf\x17\x16\x8b4\xe8'
        b'n\x8d8\x8a\x8e\xc7\xe3\x8a\x84\x16+c\xf7\xd1\x10\xcfE\x97hA\xf6\xd5X\xe4\xf0\x8c\xa7\xfa\x18\xab\x15\x83\x89\xac\x07L\xa2\xbeRIt\xa9[4\\o\x7f\x01\x08\x95\xaa\x8b\xf6\x04\x00\x00'
    )

    # Output expected from ``--payload``.
    expected_payload = b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 50px;\n        background-color: #fff;\n        border-radius: 1em;\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        body {\n            background-color: #fff;\n        }\n        div {\n            width: auto;\n            margin: 0 auto;\n            border-radius: 0;\n            padding: 1em;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <h1>Example Domain</h1>\n    <p>This domain is established to be used for illustrative examples in documents. You may use this\n    domain in examples without prior coordination or asking for permission.</p>\n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>\n</body>\n</html>\n'

    cases = (
        ([], expected_headers + expected_gzip_body),
        (['--headers'], expected_headers),
        (['--payload'], expected_payload),
    )
    for flags, expected in cases:
        main(args=['extract'] + flags +
             [get_test_file('example.warc.gz'), '784'])
        # readouterr() also resets the capture, so every case only sees the
        # output of its own invocation.
        assert capsysbinary.readouterr().out == expected
Ejemplo n.º 6
0
def test_extract_warcinfo(capsys):
    """Extract the record addressed by '0' from example.warc.gz.

    Checks the plain ``extract`` output (headers followed by payload), then
    ``--headers`` and ``--payload`` on their own.
    """
    headers = (
        'WARC/1.0\r\n'
        'WARC-Date: 2017-03-06T04:03:53Z\r\n'
        'WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n'
        'WARC-Filename: temp-20170306040353.warc.gz\r\n'
        'WARC-Type: warcinfo\r\n'
        'Content-Type: application/warc-fields\r\n'
        'Content-Length: 249\r\n'
        '\r\n'
    )
    payload = (
        'software: Webrecorder Platform v3.7\r\n'
        'format: WARC File Format 1.0\r\n'
        'creator: temp-MJFXHZ4S\r\n'
        'isPartOf: Temporary%20Collection\r\n'
        'json-metadata: {"title": "Temporary Collection", "size": 2865, "created_at": 1488772924, "type": "collection", "desc": ""}\r\n'
    )

    cases = (
        ([], headers + payload),
        (['--headers'], headers),
        (['--payload'], payload),
    )
    for flags, expected in cases:
        main(args=['extract'] + flags +
             [get_test_file('example.warc.gz'), '0'])
        # readouterr() also resets the capture between invocations.
        assert capsys.readouterr().out == expected
Ejemplo n.º 7
0
def test_index(capsys):
    """Index four fixtures with length/offset fields and compare stdout."""
    fixture_names = (
        'example.warc.gz', 'example.warc', 'example.arc.gz', 'example.arc'
    )
    fields = ('length,offset,warc-type,warc-target-uri,'
              'warc-filename,http:content-type')

    args = ['index', '-f', fields]
    args += [get_test_file(name) for name in fixture_names]

    # One JSON object per record, in the order the fixtures were passed.
    expected = """\
{"length": "353", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "431", "offset": "353", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "1228", "offset": "784", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "609", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "586", "offset": "2621", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "609", "offset": "3207", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "484", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "705", "offset": "488", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "1365", "offset": "1197", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "800", "offset": "2566", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "942", "offset": "3370", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "800", "offset": "4316", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "171", "offset": "0", "warc-type": "warcinfo", "warc-filename": "live-web-example.arc.gz"}
{"length": "856", "offset": "171", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "150", "offset": "0", "warc-type": "warcinfo", "warc-filename": "live-web-example.arc.gz"}
{"length": "1656", "offset": "151", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
"""

    main(args=args)
    captured = capsys.readouterr()
    assert captured.out == expected
Ejemplo n.º 8
0
def test_recompress_arc2warc(capsys):
    """Recompress an ARC fixture, then index the output and check digests."""
    index_fields = 'warc-type,warc-block-digest,warc-payload-digest'
    expected_index = (
        '{"warc-type": "warcinfo", "warc-block-digest": "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"}\n'
        '{"warc-type": "response", "warc-block-digest": "sha1:PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "warc-payload-digest": "sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A"}\n'
    )

    with named_temp() as temp:
        source_path = get_test_file('example.arc.gz')

        # Recompress into the temporary file; the summary goes to stdout.
        main(args=['recompress', source_path, temp.name])
        recompress_output = capsys.readouterr().out
        assert "No Errors" in recompress_output

        # Index the freshly written file and compare the digest fields.
        main(args=['index', temp.name, '-f', index_fields])
        index_output = capsys.readouterr().out
        assert index_output == expected_index
Ejemplo n.º 9
0
def test_recompress_wrong_chunks(capsys):
    """Indexing the wrongly chunked fixture fails; verbose recompress fixes it."""
    expected_index = (
        '{"offset": "0", "warc-type": "response", "warc-target-uri": "http://example.com/"}\n'
        '{"offset": "1061", "warc-type": "request", "warc-target-uri": "http://example.com/"}\n'
    )

    with named_temp() as temp:
        broken_path = get_test_file('example-wrong-chunks.warc.gz')

        # Indexing the fixture directly must raise.
        with pytest.raises(ArchiveLoadFailed):
            main(args=['index', broken_path, '-f', 'warc-type'])

        # Recompress in verbose mode, writing into the temporary file.
        main(args=['recompress', '-v', broken_path, temp.name])

        captured = capsys.readouterr().out
        assert '2 records read' in captured
        assert 'Compression Errors Found and Fixed!' in captured
        assert 'No Errors Found!' not in captured

        assert expected_index in captured
Ejemplo n.º 10
0
    def recompress(self):
        """Recompress ``self.filename`` into ``self.output`` and print a report.

        ``self.load_and_write`` is tried first. If it raises any
        ``Exception``, the error is printed (verbose mode only) and
        ``self.decompress_and_recompress`` is used on the same stream as a
        fallback. In verbose mode the output file is then indexed through the
        CLI ``main``. A summary line and a status message are always printed
        on success.

        :raises SystemExit: with code 1 if the whole operation fails; a
            failure message (and, in verbose mode, the traceback) is printed
            first.
        """
        # Imported here rather than at module level, presumably to avoid a
        # circular import with the CLI module -- TODO confirm.
        from warcio.cli import main
        try:
            count = 0
            msg = ''
            with open(self.filename, 'rb') as stream:
                try:
                    count = self.load_and_write(stream, self.output)
                    msg = 'No Errors Found!'
                except Exception as e:
                    if self.verbose:
                        print('Parsing Error(s) Found:')
                        print(
                            str(e) if isinstance(e, ArchiveLoadFailed
                                                 ) else repr(e))
                        print()

                    # Fallback path for input the first pass could not load.
                    count = self.decompress_and_recompress(stream, self.output)
                    msg = 'Compression Errors Found and Fixed!'

                if self.verbose:
                    print('Records successfully read and compressed:')
                    main(['index', self.output])
                    print('')

                print('{0} records read and recompressed to file: {1}'.format(
                    count, self.output))
                print(msg)

        # ``Exception`` rather than a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit propagate unchanged instead of
        # being reported as an unreadable input file.
        except Exception:
            if self.verbose:
                print('Exception Details:')
                traceback.print_exc()
                print('')

            print('Recompress Failed: {0} could not be read as a WARC or ARC'.
                  format(self.filename))
            sys.exit(1)
Ejemplo n.º 11
0
def test_index_2(capsys):
    """Index example.warc.gz with offset/length/status/filename fields."""
    args = ['index', '-f', 'offset,length,http:status,warc-type,filename']
    args += [get_test_file(name) for name in ['example.warc.gz']]

    # One JSON object per record; fields without a value are left out.
    expected = """\
{"offset": "0", "length": "353", "warc-type": "warcinfo", "filename": "example.warc.gz"}
{"offset": "353", "length": "431", "warc-type": "warcinfo", "filename": "example.warc.gz"}
{"offset": "784", "length": "1228", "http:status": "200", "warc-type": "response", "filename": "example.warc.gz"}
{"offset": "2012", "length": "609", "warc-type": "request", "filename": "example.warc.gz"}
{"offset": "2621", "length": "586", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"}
{"offset": "3207", "length": "609", "warc-type": "request", "filename": "example.warc.gz"}
"""

    main(args=args)
    captured = capsys.readouterr()
    assert captured.out == expected
Ejemplo n.º 12
0
def test_recompress_non_chunked(capsys):
    """Indexing the bad non-chunked fixture fails; recompressing repairs it."""
    type_only = ['-f', 'warc-type']
    expected_index = (
        '{"warc-type": "warcinfo"}\n'
        '{"warc-type": "warcinfo"}\n'
        '{"warc-type": "response"}\n'
        '{"warc-type": "request"}\n'
        '{"warc-type": "revisit"}\n'
        '{"warc-type": "request"}\n'
    )

    with named_temp() as temp:
        broken_path = get_test_file('example-bad-non-chunked.warc.gz')

        # Indexing the fixture directly must raise.
        with pytest.raises(ArchiveLoadFailed):
            main(args=['index', broken_path] + type_only)

        # Something must have been printed before the failure; reading it
        # also clears the capture for the next check.
        partial_output = capsys.readouterr().out
        assert partial_output

        # Recompress into the temporary file.
        main(args=['recompress', broken_path, temp.name])
        recompress_output = capsys.readouterr().out
        assert 'Compression Errors Found and Fixed!' in recompress_output

        # The recompressed file must now index cleanly.
        main(args=['index', temp.name] + type_only)
        index_output = capsys.readouterr().out
        assert index_output == expected_index