Example #1
0
    def test_transclusions_and_conversions(self, capsys):
        transclusions = """
transclusions:
  http://www.example.com/videos/barsandtone.flv:
    - url: http://www.example.com/containing/page.html
      timestamp: 20190103020000
      selector: object, embed
"""

        transclusions_file = os.path.join(self.root_dir, 'transclu2.yaml')
        with open(transclusions_file, 'wt') as fh:
            fh.write(transclusions)

        source_dir = os.path.join(self.test_root, 'convert-test')

        res = main(['-o', '-v', '-n', 'test-transc2.warc', '--transclusions', transclusions_file,
                    '--conversions', self.conversion_results, 'http://www.example.com/', source_dir])

        warcio_main(['index', '-f', 'warc-type,warc-target-uri', 'test-transc2.warc.gz'])

        out, err = capsys.readouterr()

        expected = """\
{"warc-type": "warcinfo"}
{"warc-type": "resource", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.png"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.webm"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.mp4"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.mkv"}
{"warc-type": "resource", "warc-target-uri": "urn:embeds:http://www.example.com/containing/page.html"}
"""
        assert out == expected
Example #2
0
    def test_warcit_new(self, caplog):
        res = main(['http://www.iana.org/', self.test_dir])
        assert res == 0

        assert 'Wrote 24 resources to www.iana.org.warc.gz' in caplog.text
        assert os.path.isfile(
            os.path.join(self.root_dir, 'www.iana.org.warc.gz'))
Example #3
0
 def test_with_magic(self, caplog):
     pytest.importorskip('magic')
     res = main([
         '-q', '-o', '--use-magic', '-n', 'test', 'http://www.iana.org/',
         self.test_dir
     ])
     assert res == 0
Example #4
0
    def test_warcit_no_such_file_2(self, caplog):
        res = main([
            '-o', '-v', 'http://www.iana.org/', self.zip_filename + '_nosuch'
        ])
        assert res == 0

        assert 'www.iana.org.zip_nosuch" not a valid' in caplog.text
Example #5
0
    def test_warcit_new_zip_file_path(self, caplog):
        res = main(['-o', '-v', 'http://www.iana.org/', self.zip_filename + '/www.iana.org/'])
        assert res == 0

        assert 'Wrote 24 resources to www.iana.org.warc.gz' in caplog.text
        assert 'Writing "http://www.iana.org/index.html" (text/html) @ "2017-10-17T14:30:26Z"' in caplog.text
        assert 'www.iana.org.zip/www.iana.org/index.html"' in caplog.text
        assert os.path.isfile(os.path.join(self.root_dir, 'www.iana.org.warc.gz'))
Example #6
0
    def test_warcit_fixed_date(self, capsys):
        res = main(['-q', '-n', 'test', '-d', '2010-12-26T10:11:12', 'http://www.iana.org/', self.test_dir])
        assert res == 0

        warcio_main(['index', '-f', 'warc-target-uri,warc-date,content-type', 'test.warc.gz'])
        out, err = capsys.readouterr()

        assert '"warc-target-uri": "http://www.iana.org/index.html", "warc-date": "2010-12-26T10:11:12Z", "content-type": "text/html"' in out
Example #7
0
    def test_no_magic(self, caplog):
        import sys
        sys.modules['magic'] = None

        res = main(['-q', '--use-magic', 'magic', '-n', 'test', 'http://www.iana.org/', self.test_dir])
        assert res == 1
        assert "python-magic or libmagic is not available" in caplog.text

        del sys.modules['magic']
Example #8
0
    def test_warcit_no_such_zip_prefix(self, caplog):
        res = main([
            '-o', '-v', 'http://www.iana.org/',
            self.zip_filename + '/www.example.com/'
        ])
        assert res == 0

        assert 'Wrote 0 resources to www.example.com.warc.gz' in caplog.text
        assert os.path.isfile(
            os.path.join(self.root_dir, 'www.example.com.warc.gz'))
Example #9
0
    def test_warcit_mime_override(self, capsys):
        res = main(['-q', '-n', 'test2', '--mime-overrides=*/index.html=custom/mime', 'http://www.iana.org/', self.test_dir])
        assert res == 0

        warcio_main(['index', '-f', 'warc-target-uri,content-type', 'test2.warc.gz'])

        out, err = capsys.readouterr()

        assert '"warc-target-uri": "http://www.iana.org/index.html", "content-type": "custom/mime"' in out
        assert '"warc-target-uri": "http://www.iana.org/about/index.html", "content-type": "custom/mime"' in out
Example #10
0
    def test_warcit_no_revisit(self, capsys):
        res = main(['-q', '-o', '--name', 'test', '--index-files', '', '--no-gzip', 'http://www.iana.org/', self.test_dir])
        assert res == 0

        warcio_main(['index', '-f', 'warc-type,warc-target-uri,warc-date', 'test.warc'])

        out, err = capsys.readouterr()

        assert '"warc-type": "warcinfo"' in out
        assert '"warc-type": "revisit", "warc-target-uri": "http://www.iana.org/"' not in out
Example #11
0
    def test_warcit_use_charset_auto_detect(self, capsys):
        res = main(['-q', '-n', 'test3', '--charset', 'cchardet', 'http://www.iana.org/', self.test_dir])
        assert res == 0

        warcio_main(['index', '-f', 'warc-target-uri,content-type', 'test3.warc.gz'])

        out, err = capsys.readouterr()
        out = out.lower() # charset names might be uppercase or lowercase
        assert '"warc-target-uri": "http://www.iana.org/index.html", "content-type": "text/html; charset=windows-1252"' in out
        assert '"warc-target-uri": "http://www.iana.org/_css/2015.1/print.css", "content-type": "text/css; charset=utf-8"' in out
Example #12
0
    def test_warcit_use_charset_custom(self, capsys):
        res = main(['-q', '-o', '-n', 'test3', '--charset', 'custom', 'http://www.iana.org/', self.test_dir])
        assert res == 0

        warcio_main(['index', '-f', 'warc-target-uri,content-type', 'test3.warc.gz'])

        out, err = capsys.readouterr()

        assert '"warc-target-uri": "http://www.iana.org/index.html", "content-type": "text/html; charset=custom"' in out
        assert '"warc-target-uri": "http://www.iana.org/_css/2015.1/print.css", "content-type": "text/css; charset=custom"' in out
Example #13
0
    def test_warcit_single_file_and_no_warcinfo(self, caplog, capsys):
        res = main(['-v', '--no-warcinfo', 'http://www.iana.org/', os.path.join(self.test_dir, 'index.html')])
        assert res == 0

        assert 'Wrote 2 resources to index.html.warc.gz' in caplog.text
        assert os.path.isfile(os.path.join(self.root_dir, 'index.html.warc.gz'))

        warcio_main(['index', '-f', 'warc-type,warc-target-uri', 'index.html.warc.gz'])

        out, err = capsys.readouterr()
        assert '"warc-type": "warcinfo"' not in out
        assert '"warc-target-uri": "http://www.iana.org/index.html"' in out
        assert '"warc-target-uri": "http://www.iana.org/"' in out
Example #14
0
    def test_warcit_diff_file(self, caplog, capsys):
        res = main(['-v', '--name', 'test', '--no-gzip', 'http://www.iana.org/', self.test_dir])
        assert res == 0

        assert 'Wrote 24 resources to test.warc' in caplog.text
        assert os.path.isfile(os.path.join(self.root_dir, 'test.warc'))

        warcio_main(['index', '-f', 'warc-type,warc-target-uri,warc-date', 'test.warc'])

        out, err = capsys.readouterr()

        assert '"warc-type": "warcinfo"' in out
        assert '"warc-type": "revisit", "warc-target-uri": "http://www.iana.org/"' in out
Example #15
0
    def test_warcit_use_charset_auto_detect(self, capsys):
        res = main([
            '-q', '-n', 'test3', '--charset', 'auto', 'http://www.iana.org/',
            self.test_dir
        ])
        assert res == 0

        warcio_main(
            ['index', '-f', 'warc-target-uri,content-type', 'test3.warc.gz'])

        out, err = capsys.readouterr()
        assert '"warc-target-uri": "http://www.iana.org/index.html", "content-type": "text/html; charset=windows-1252"' in out
        assert '"warc-target-uri": "http://www.iana.org/_css/2015.1/print.css", "content-type": "text/css; charset=utf-8"' in out
        assert '"warc-target-uri": "http://www.iana.org/_img/bookmark_icon.ico", "content-type": "image/x-icon"' in out
Example #16
0
    def test_transclusions(self, capsys):
        transclusions = """
transclusions:
  http://www.iana.org/_img/bookmark_icon.ico:
    - url: http://www.example.com/containing/page.html
      timestamp: 20190102030000
"""

        transclusions_file = os.path.join(self.root_dir, 'transclusions.yaml')
        with open(transclusions_file, 'wt') as fh:
            fh.write(transclusions)

        res = main(['-o', '-v', '-n', 'test-transc.warc', '--transclusions', transclusions_file, 'http://www.iana.org/', self.test_dir])

        warcio_main(['index', '-f', 'warc-type,warc-target-uri,warc-date', 'test-transc.warc.gz'])

        out, err = capsys.readouterr()

        assert '"warc-type": "resource", "warc-target-uri": "urn:embeds:http://www.example.com/containing/page.html, "warc-date": "2019-01-02T03:00:00Z"' not in out
Example #17
0
    def test_conversion_records(self, capsys):
        source_dir = os.path.join(self.test_root, 'convert-test')

        res = main(['-o', '-v', '-n', 'test-convert.warc',
                    '--conversions', self.conversion_results, 'http://www.example.com/', source_dir])

        warcio_main(['index', '-f', 'warc-type,warc-target-uri', 'test-convert.warc.gz'])

        out, err = capsys.readouterr()

        expected = """\
{"warc-type": "warcinfo"}
{"warc-type": "resource", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.png"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.webm"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.mp4"}
{"warc-type": "conversion", "warc-target-uri": "http://www.example.com/videos/barsandtone.flv.mkv"}
"""
        assert out == expected
Example #18
0
    def test_warcit_overwrite_with_excludes(self, caplog):
        res = main(['http://www.iana.org/', '-o', '--exclude', '*.js', self.test_dir])
        assert res == 0

        assert 'Wrote 22 resources to www.iana.org.warc.gz' in caplog.text
        assert os.path.isfile(os.path.join(self.root_dir, 'www.iana.org.warc.gz'))
Example #19
0
 def test_warcit_append(self):
     res = main(['-a', 'http://www.iana.org/', '-q', self.test_dir])
     assert res == 0
Example #20
0
    def test_warcit_already_exists(self, caplog):
        res = main(['http://www.iana.org/', '-q', self.test_dir])
        assert res == 1

        assert 'File exists' in caplog.text
Example #21
0
    def test_warcit_no_such_file(self, caplog):
        res = main(['-o', '-v', 'http://www.iana.org/', './foo'])
        assert res == 0

        assert '"./foo" not a valid' in caplog.text