def test_urlitem_staticcontent(tmp_path, gzip_nonhtml_url):
    """A URLItem given explicit ``content`` must store that content.

    The item is built from ``gzip_nonhtml_url`` together with
    ``content="hello"``; the entry read back at ``robots.txt`` has to be
    exactly ``b"hello"``.
    """
    zim_path = tmp_path / "test.zim"

    with Creator(zim_path) as creator:
        item = URLItem(url=gzip_nonhtml_url, content="hello")
        creator.add_item(item)

    archive = Archive(zim_path)
    stored = bytes(archive.get_item("robots.txt").content)
    assert stored == b"hello"
def test_filelikeprovider_nosize(tmp_path, png_image_url):
    """Round-trip an in-memory file object through FileLikeProviderItem.

    The PNG at ``png_image_url`` is streamed into a ``BytesIO`` which is then
    handed to the item without any size information; the stored entry must
    equal the buffer's bytes.
    """
    buffer = io.BytesIO()
    stream_file(png_image_url, byte_stream=buffer)

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        item = FileLikeProviderItem(fileobj=buffer, path="one.png")
        creator.add_item(item)

    archive = Archive(zim_path)
    stored = bytes(archive.get_item("one.png").content)
    assert stored == buffer.getvalue()
def test_urlitem_html(tmp_path, gzip_html_url):
    """An HTML URLItem must store the same bytes a plain download yields.

    The reference copy is fetched with ``save_large_file``; the ZIM entry is
    then looked up at ``wiki/Main_Page``.
    """
    # reference download, used as the expected payload
    reference = tmp_path / "file.html"
    save_large_file(gzip_html_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=gzip_html_url))

    archive = Archive(zim_path)
    stored = bytes(archive.get_item("wiki/Main_Page").content)
    assert stored == expected
def test_urlitem_nonhtmlgzip(tmp_path, gzip_nonhtml_url):
    """A non-HTML URLItem must store the same bytes a plain download yields.

    The same URL is added twice: once with default options and once with
    ``use_disk=True``. The entry at ``robots.txt`` is compared against a
    reference copy fetched with ``save_large_file``.
    """
    # reference download, used as the expected payload
    reference = tmp_path / "file.txt"
    save_large_file(gzip_nonhtml_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        default_item = URLItem(url=gzip_nonhtml_url)
        creator.add_item(default_item)
        disk_item = URLItem(url=gzip_nonhtml_url, use_disk=True)
        creator.add_item(disk_item)

    archive = Archive(zim_path)
    stored = bytes(archive.get_item("robots.txt").content)
    assert stored == expected
def test_urlprovider(tmp_path, png_image_url):
    """SpecialURLProviderItem must store the bytes served at the URL.

    The expected payload is a reference copy of ``png_image_url`` fetched
    with ``save_large_file``; the item is stored under ``one.png``.
    """
    # reference download, used as the expected payload
    reference = tmp_path / "file.png"
    save_large_file(png_image_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        item = SpecialURLProviderItem(url=png_image_url, path="one.png")
        creator.add_item(item)

    archive = Archive(zim_path)
    stored = bytes(archive.get_item("one.png").content)
    assert stored == expected
def test_urlitem_binary(tmp_path, png_image_url):
    """A binary URLItem must store the same bytes a plain download yields.

    No path is given to the item; the entry is looked up at
    ``static/images/project-logos/commonswiki.png``.
    """
    # reference download, used as the expected payload
    reference = tmp_path / "file.png"
    save_large_file(png_image_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=png_image_url))

    archive = Archive(zim_path)
    entry_path = "static/images/project-logos/commonswiki.png"
    stored = bytes(archive.get_item(entry_path).content)
    assert stored == expected
def test_add_item_for_delete(tmp_path, html_file):
    """``add_item_for(..., delete_fpath=True)`` must remove the source file.

    A throw-away copy of ``html_file`` is added under ``index``; once the
    Creator is closed the copy must be gone and the entry must be readable.
    """
    zim_path = tmp_path / "test.zim"

    # work on a copy so the fixture file itself is never deleted
    source_copy = pathlib.Path(tmp_path / "somefile.html")
    shutil.copyfile(html_file, source_copy)

    with Creator(zim_path, "welcome", "") as creator:
        creator.add_item_for(fpath=source_copy, path="index", delete_fpath=True)

    assert not source_copy.exists()

    archive = Archive(zim_path)
    assert archive.get_item("index")
def test_noindexlanguage(tmp_path):
    """Check index flags when the Creator is opened with an empty language.

    The language metadata is set afterwards through ``update_metadata`` and
    must read back as ``"bam"``; only the title index is expected to exist.
    """
    zim_path = tmp_path / "test.zim"

    with Creator(zim_path, "welcome", "") as creator:
        welcome = StaticItem(path="welcome", content="hello")
        creator.add_item(welcome)
        creator.update_metadata(language="bam")
        creator.add_item_for("index", "Index", content="-", mimetype="text/html")

    archive = Archive(zim_path)
    stored_language = archive.get_metadata("Language").decode(UTF8)
    assert stored_language == "bam"

    # HTML content would normally feed both the title and the full-text
    # xapian indexes; with indexing disabled only the title index is expected.
    assert archive.has_title_index
    assert not archive.has_fulltext_index
def test_add_item_for_delete_fail(tmp_path, png_image):
    """Add an item whose source file is removed by the item's own callback.

    The StaticItem carries ``remove_source`` as its callback while
    ``add_item`` is also given ``(delete_callback, local_path)``.
    NOTE(review): presumably this verifies that ``delete_callback`` copes
    with a file that is already gone -- confirm against its implementation.
    """
    zim_path = tmp_path / "test.zim"

    # work on a copy so the fixture file itself is never deleted
    source_copy = pathlib.Path(tmp_path / "somefile.png")
    shutil.copyfile(png_image, source_copy)

    def remove_source(item):
        """Delete the file backing *item* and leave a trace on stdout."""
        print("##########", "remove_source")
        os.remove(item.filepath)

    with Creator(zim_path, "welcome", "") as creator:
        item = StaticItem(filepath=source_copy, path="index", callback=remove_source)
        creator.add_item(item, callback=(delete_callback, source_copy))

    assert not source_copy.exists()

    archive = Archive(zim_path)
    assert archive.get_item("index")
def get_metadata(self, zimfile, name):
    """Open *zimfile* as an Archive and return its metadata value for *name*."""
    return Archive(zimfile).get_metadata(name)
def list_articles(self, zimfile):
    """Lazily yield every entry of *zimfile*, by id from 0 to ``entry_count - 1``."""
    archive = Archive(zimfile)
    yield from map(archive.get_entry_by_id, range(archive.entry_count))
def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
    """Run warc2zim with explicit options and check entries and metadata.

    Verifies the set of entry paths/titles, the list of metadata keys, the
    presence of both indexes, and the Description, Tags and Title values.
    """
    zim_name = "zim-out-filename.zim"

    # NOTE(review): the "[email protected]" segment in the -r URL looks like an
    # e-mail obfuscation artifact -- confirm the intended package@version.
    cli_args = [
        "-v",
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "--name",
        "example-response",
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "-r",
        "https://cdn.jsdelivr.net/npm/@webrecorder/[email protected]/dist/",
        "--tags",
        "some",
        "--tags",
        "foo",
        "--desc",
        "test zim",
        "--tags",
        "bar",
        "--title",
        "Some Title",
    ]
    warc2zim(cli_args)

    zim_output = tmp_path / zim_name
    assert os.path.isfile(zim_output)

    # map every entry path to its title
    found_articles = {}
    for article in self.list_articles(zim_output):
        found_articles[article.path] = article.title

    expected_articles = {
        # entries from WARC
        "A/example.com/": "Example Domain",
        "H/example.com/": "H/example.com/",
        # replay system files
        "A/index.html": "A/index.html",
        "A/load.js": "A/load.js",
        "A/404.html": "A/404.html",
        "A/sw.js": "A/sw.js",
        "A/topFrame.html": "A/topFrame.html",
    }
    assert found_articles == expected_articles

    archive = Archive(zim_output)

    # ZIM metadata
    expected_keys = [
        "Counter",
        "Creator",
        "Date",
        "Description",
        "Language",
        "Name",
        "Publisher",
        "Scraper",
        "Source",
        "Tags",
        "Title",
    ]
    assert list(archive.metadata.keys()) == expected_keys

    assert archive.has_fulltext_index
    assert archive.has_title_index

    assert self.get_metadata(zim_output, "Description") == b"test zim"
    expected_tags = b"_ftindex:yes;_category:other;_sw:yes;some;foo;bar"
    assert self.get_metadata(zim_output, "Tags") == expected_tags
    assert self.get_metadata(zim_output, "Title") == b"Some Title"
def verify_warc_and_zim(self, warcfile, zimfile):
    """Assert that every relevant record of *warcfile* is mirrored in *zimfile*.

    For each response/resource/revisit record with a URL not seen before,
    the headers stored under ``H/<url without scheme>`` must parse back to the
    record's headers, and the payload under ``A/<url without scheme>`` must
    match the record body (ignoring the ``sw_check.html`` head insert for
    HTML_RAW items). Revisit records and records with
    ``Content-Length: 0`` must have no payload.
    """
    assert os.path.isfile(warcfile)
    assert os.path.isfile(zimfile)

    # autoescape=False to allow injecting html entities from translated text
    jinja_env = Environment(
        loader=PackageLoader("warc2zim", "templates"),
        extensions=["jinja2.ext.i18n"],
        autoescape=False,
    )
    rendered = jinja_env.get_template("sw_check.html").render()
    head_insert = rendered.encode("utf-8")

    # URLs already verified; duplicates are not written to the ZIM
    seen_urls = set()

    archive = Archive(zimfile)
    for record in iter_warc_records([warcfile]):
        url = get_record_url(record)
        if not url or url in seen_urls:
            continue

        if record.rec_type not in ("response", "resource", "revisit"):
            continue

        is_revisit = record.rec_type == "revisit"

        # ignore revisit records that are to the same url
        if is_revisit and record.rec_headers["WARC-Refers-To-Target-URI"] == url:
            continue

        # parse the stored headers back into a record and compare them
        url_no_scheme = url.split("//", 2)[1]
        print(url_no_scheme)
        stored_headers = BytesIO(archive.get_content("H/" + url_no_scheme))
        parsed_record = next(ArchiveIterator(stored_headers))

        assert record.rec_headers == parsed_record.rec_headers
        assert record.http_headers == parsed_record.http_headers

        # ensure payloads match
        try:
            payload = archive.get_item("A/" + url_no_scheme)
        except KeyError:
            payload = None

        if is_revisit or (
            record.http_headers
            and record.http_headers.get("Content-Length") == "0"
        ):
            assert not payload
        else:
            payload_content = payload.content.tobytes()

            # HTML_RAW items carry the head insert on top of the original
            # body; strip it so that an exact match can be required below
            if payload.mimetype == HTML_RAW:
                assert head_insert in payload_content
                payload_content = payload_content.replace(head_insert, b"")

            assert payload_content == record.buffered_stream.read()

        seen_urls.add(url)
def get_article_raw(self, zimfile, path):
    """Open *zimfile* as an Archive and return the item stored at *path*."""
    return Archive(zimfile).get_item(path)
def get_article(self, zimfile, path):
    """Open *zimfile* as an Archive and return the content stored at *path*."""
    return Archive(zimfile).get_content(path)
def test_zim_creator(tmp_path, png_image, html_file, html_str):
    """Build a ZIM with items, redirects and an illustration, then read it back.

    Checks the Title/Language/Tags metadata, the main entry, the image item,
    the redirects, title suggestions, full-text search, that HTML content is
    stored verbatim, and that the default illustration matches the PNG.
    """
    fpath = tmp_path / "test.zim"
    main_path, language, title = "welcome", "fra", "My Title"
    tags = ";".join(["toto", "tata"])
    with open(png_image, "rb") as fh:
        png_data = fh.read()

    with Creator(fpath, main_path, language, title=title, tags=tags) as creator:
        # verbatim HTML from string
        creator.add_item_for("welcome", "wel", content=html_str, is_front=True)
        # verbatim HTML from file
        creator.add_item_for("welcome3", "wel3", fpath=html_file)
        creator.add_item_for("welcome4", "wel4", fpath=html_file)
        # single binary image
        creator.add_item_for(
            "images/yahoo.png", "Home !!", fpath=png_image, is_front=True
        )
        # redirect to our main page (no title)
        creator.add_redirect("home", "welcome")
        # redirect to our main page (with a custom title)
        creator.add_redirect("home2", "welcome", "Home !!")
        creator.add_redirect("home3", "welcome3", "Home !!", is_front=True)
        creator.add_redirect("home4", "welcome4", "Home !!", is_front=False)

        # ensure args requirement are checked
        with pytest.raises(ValueError, match="One of fpath or content is required"):
            creator.add_item_for("images/yahoo.png")

        # the PNG bytes were already read above; no need to reopen the file
        creator.add_default_illustration(png_data)

    assert fpath.exists()

    reader = Archive(fpath)
    assert reader.get_metadata("Title").decode(UTF8) == title
    assert reader.get_metadata("Language").decode(UTF8) == language
    assert reader.get_metadata("Tags").decode(UTF8) == tags
    assert reader.main_entry.get_item().path == main_path
    # make sure we have our image
    assert reader.get_item("images/yahoo.png")
    # make sure we have our redirects
    assert reader.get_entry_by_path("home2").is_redirect
    assert reader.get_entry_by_path("home2").get_redirect_entry().path == main_path
    # make sure titles were indexed (html with title for xapian)
    # see https://github.com/openzim/python-libzim/issues/125
    # the query is the same for every check, so run it only once
    suggestions = list(reader.get_suggestions("Home !!"))
    assert "home2" in suggestions
    assert "home3" in suggestions
    assert "home4" in suggestions
    assert "images/yahoo.png" not in suggestions
    # make sure full text was indexed
    assert reader.get_search_results_count("PDF doc") >= 1

    # ensure non-rewritten articles have not been rewritten
    assert bytes(reader.get_item("welcome").content).decode(UTF8) == html_str
    assert bytes(reader.get_item("welcome3").content).decode(UTF8) == html_str

    # ensure illustration is present and correct
    assert reader.has_illustration()
    assert bytes(reader.get_illustration_item().content) == png_data
def test_urlprovider_nolength(tmp_path, png_image_url, png_image):
    """URL-based items must work when the server sends no Content-Length.

    A reference copy of ``png_image_url`` is saved locally and served by a
    throw-away ``http.server`` subprocess whose handler never emits a
    Content-Length header. Three items are fetched from that server; the
    entries at ``home.png`` and ``B`` must equal the reference bytes.

    The ``png_image`` fixture is requested by the signature but its value is
    not used by the test body.
    """
    # save url's content locally using external tool
    original_png = tmp_path / "original.png"
    save_large_file(png_image_url, original_png)
    png_image_bytes = original_png.read_bytes()

    # create and start an http server without Content-Length support
    server_fpath = tmp_path / "httpd.py"
    port = random.randint(10000, 20000)
    server_code = """
from http.server import BaseHTTPRequestHandler, HTTPServer

class handler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header("Content-type", "image/png")
        if "gzip" in self.path:
            self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        with open("{src}", "rb") as fh:
            self.wfile.write(fh.read())


with HTTPServer(('', {port}), handler) as server:
    server.serve_forever()

"""
    # plain str.replace (not str.format) because the script may not be
    # format-safe once arbitrary paths are substituted in
    server_fpath.write_text(
        server_code.replace("{port}", str(port)).replace("{src}", str(original_png))
    )

    httpd = subprocess.Popen([sys.executable, server_fpath])
    time.sleep(2)  # allow http server to start

    fpath = tmp_path / "test.zim"
    base_url = f"http://localhost:{port}"
    try:
        with tempfile.TemporaryDirectory() as tmp_dir_name, Creator(fpath) as creator:
            tmp_dir = pathlib.Path(tmp_dir_name)

            creator.add_item(
                URLItem(
                    url=f"{base_url}/hoho.png",
                    path="B",
                    tmp_dir=tmp_dir,
                    use_disk=True,
                )
            )

            creator.add_item(URLItem(url=f"{base_url}/home.png", tmp_dir=tmp_dir))

            creator.add_item(
                SpecialURLProviderItem(
                    url=f"{base_url}/home.png", mimetype="image/png"
                )
            )
    finally:
        httpd.terminate()
        # reap the child so it does not linger as a zombie nor trigger a
        # ResourceWarning when the Popen object is garbage-collected
        httpd.wait()

    zim = Archive(fpath)
    assert bytes(zim.get_item("home.png").content) == png_image_bytes
    assert bytes(zim.get_item("B").content) == png_image_bytes