コード例 #1
0
ファイル: test_warc_to_zim.py プロジェクト: openzim/warc2zim
    def verify_warc_and_zim(self, warcfile, zimfile):
        """Assert that the records of a WARC file are mirrored in a ZIM file.

        Every ``response``/``resource``/``revisit`` record with a URL is
        checked once per URL. The ZIM entry ``H/<url without scheme>`` must
        parse back to the same WARC and HTTP headers as the record, and the
        item at ``A/<url without scheme>`` must carry the record body. For
        items whose mimetype is ``HTML_RAW`` the rendered ``sw_check.html``
        insert is stripped before comparing. Revisit records and records
        with ``Content-Length: 0`` are expected to have no payload.

        Args:
            warcfile: path of the WARC file to read records from.
            zimfile: path of the ZIM file to check against.

        Raises:
            AssertionError: if either file is missing or any check fails.
        """
        assert os.path.isfile(warcfile)
        assert os.path.isfile(zimfile)

        # autoescape=False to allow injecting html entities from translated text
        env = Environment(
            loader=PackageLoader("warc2zim", "templates"),
            extensions=["jinja2.ext.i18n"],
            autoescape=False,
        )

        head_insert = env.get_template("sw_check.html").render().encode("utf-8")

        # track to avoid checking duplicates, which are not written to ZIM
        warc_urls = set()

        zim_fh = Archive(zimfile)
        for record in iter_warc_records([warcfile]):
            url = get_record_url(record)
            if not url or url in warc_urls:
                continue

            if record.rec_type not in ("response", "resource", "revisit"):
                continue

            # ignore revisit records that are to the same url
            if (record.rec_type == "revisit" and
                    record.rec_headers["WARC-Refers-To-Target-URI"] == url):
                continue

            # parse headers as record, ensure headers match
            # NOTE(review): with maxsplit=2, anything after a second "//" in
            # the URL is dropped -- confirm this matches how the converter
            # derives ZIM paths.
            url_no_scheme = url.split("//", 2)[1]
            print(url_no_scheme)
            parsed_record = next(
                ArchiveIterator(
                    BytesIO(zim_fh.get_content("H/" + url_no_scheme))))

            assert record.rec_headers == parsed_record.rec_headers
            assert record.http_headers == parsed_record.http_headers

            # ensure payloads match; a missing A/ entry is recorded as None
            try:
                payload = zim_fh.get_item("A/" + url_no_scheme)
            except KeyError:
                payload = None

            if record.rec_type == "revisit" or (
                    record.http_headers
                    and record.http_headers.get("Content-Length") == "0"):
                assert not payload
            else:
                # A record with a body needs a payload entry: fail with a
                # clear message instead of an AttributeError on None below.
                assert payload is not None, (
                    "missing ZIM payload entry A/" + url_no_scheme)
                payload_content = payload.content.tobytes()

                # if HTML_RAW, still need to account for the head insert,
                # otherwise should have exact match
                if payload.mimetype == HTML_RAW:
                    assert head_insert in payload_content
                    assert (payload_content.replace(
                        head_insert, b"") == record.buffered_stream.read())
                else:
                    assert payload_content == record.buffered_stream.read()

            warc_urls.add(url)
コード例 #2
0
ファイル: test_warc_to_zim.py プロジェクト: openzim/warc2zim
 def get_article(self, zimfile, path):
     """Open ``zimfile`` as an Archive and return ``get_content(path)``."""
     return Archive(zimfile).get_content(path)