def verify_warc_and_zim(self, warcfile, zimfile):
    """Assert that the records of a WARC file are mirrored in a ZIM file.

    For every ``response``, ``resource`` or ``revisit`` record that has a
    URL (each URL is checked once, and revisits pointing back at their own
    URL are skipped), this checks that:

    * the entry at ``"H/" + <url without scheme>`` parses as a WARC record
      whose WARC and HTTP headers equal those of the source record;
    * the entry at ``"A/" + <url without scheme>`` is absent/empty for
      revisit records and for records with ``Content-Length: 0``, and
      otherwise holds the record payload (ignoring the ``sw_check.html``
      head insert when the entry's mimetype is ``HTML_RAW``).

    Args:
        warcfile: path to the source WARC file.
        zimfile: path to the ZIM file produced from it.

    Raises:
        AssertionError: if either file is missing or any check fails.
    """
    assert os.path.isfile(warcfile), f"WARC file not found: {warcfile}"
    assert os.path.isfile(zimfile), f"ZIM file not found: {zimfile}"

    # autoescape=False to allow injecting html entities from translated text
    env = Environment(
        loader=PackageLoader("warc2zim", "templates"),
        extensions=["jinja2.ext.i18n"],
        autoescape=False,
    )
    head_insert = env.get_template("sw_check.html").render().encode("utf-8")

    # track to avoid checking duplicates, which are not written to ZIM
    warc_urls = set()

    zim_fh = Archive(zimfile)
    for record in iter_warc_records([warcfile]):
        url = get_record_url(record)
        if not url:
            continue

        if url in warc_urls:
            continue

        if record.rec_type not in ("response", "resource", "revisit"):
            continue

        # ignore revisit records that are to the same url
        if (
            record.rec_type == "revisit"
            and record.rec_headers["WARC-Refers-To-Target-URI"] == url
        ):
            continue

        # NOTE(review): with maxsplit=2 a URL containing a second "//" is cut
        # at that point; presumably this mirrors how the converter derives ZIM
        # paths -- confirm before changing.
        url_no_scheme = url.split("//", 2)[1]

        # parse headers as record, ensure headers match
        parsed_record = next(
            ArchiveIterator(BytesIO(zim_fh.get_content("H/" + url_no_scheme)))
        )
        assert (
            record.rec_headers == parsed_record.rec_headers
        ), f"WARC headers differ for {url}"
        assert (
            record.http_headers == parsed_record.http_headers
        ), f"HTTP headers differ for {url}"

        # ensure payloads match
        try:
            payload = zim_fh.get_item("A/" + url_no_scheme)
        except KeyError:
            payload = None

        if record.rec_type == "revisit" or (
            record.http_headers
            and record.http_headers.get("Content-Length") == "0"
        ):
            assert not payload, f"unexpected payload stored for {url}"
        else:
            # fail with a readable message instead of an AttributeError on
            # None when the payload entry is missing from the ZIM
            assert payload is not None, f"missing payload entry for {url}"
            payload_content = payload.content.tobytes()

            # if HTML_RAW, still need to account for the head insert,
            # otherwise should have exact match
            if payload.mimetype == HTML_RAW:
                assert (
                    head_insert in payload_content
                ), f"head insert missing from {url}"
                assert (
                    payload_content.replace(head_insert, b"")
                    == record.buffered_stream.read()
                ), f"payload differs for {url}"
            else:
                assert (
                    payload_content == record.buffered_stream.read()
                ), f"payload differs for {url}"

        warc_urls.add(url)

def get_article(self, zimfile, path):
    """Return the content stored at ``path`` in the ZIM file ``zimfile``.

    Opens ``zimfile`` with ``Archive`` and returns whatever its
    ``get_content(path)`` call yields.
    """
    return Archive(zimfile).get_content(path)