def test_creator_compression(fpath, lipsum_item):
    """Create ZIM files with every available compression algorithm.

    Each algorithm is configured twice, once through its member name and
    once through the enum member itself; both files must have the same
    size. The resulting sizes must differ between algorithms, and a
    Creator without explicit compression must match the "zstd" size.
    """
    sizes_by_algo = {}
    for algo_name in libzim.writer.Compression.__members__.keys():
        # compression selected by its name
        named_path = fpath.with_name(f"{fpath.name}_{algo_name}_str.zim")
        with Creator(named_path).config_compression(algo_name) as creator:
            creator.add_item(lipsum_item)

        # compression selected by the enum member
        member_path = fpath.with_name(f"{fpath.name}_{algo_name}_val.zim")
        algo_member = getattr(libzim.writer.Compression, algo_name)
        with Creator(member_path).config_compression(algo_member) as creator:
            creator.add_item(lipsum_item)

        assert Archive(named_path).checksum
        assert Archive(named_path).filesize == Archive(member_path).filesize
        sizes_by_algo[algo_name] = Archive(named_path).filesize

    # every pair of algorithms must have produced a different file size
    for first, second in itertools.combinations(sizes_by_algo, 2):
        assert sizes_by_algo[first] != sizes_by_algo[second]

    # now don't specify
    with Creator(fpath) as creator:
        creator.add_item(lipsum_item)

    # default should be zstd
    assert Archive(fpath).filesize == sizes_by_algo["zstd"]
def test_open_badfile(tmpdir):
    """Opening a missing path or a non-ZIM file raises RuntimeError."""
    missing_path = tmpdir / "not-exist.zim"
    with pytest.raises(RuntimeError):
        Archive(missing_path)

    # an existing file whose content is plain text
    text_path = tmpdir / "not-zim.zim"
    with open(text_path, "w") as handle:
        handle.write("text file")

    with pytest.raises(RuntimeError):
        Archive(text_path)
def test_reader_has_index(
    all_zims, filename, has_fulltext_index, has_title_index
):
    """The archive's index flags match the expected values for the file."""
    archive = Archive(all_zims / filename)

    # both flags are compared by identity against the expected values
    assert archive.has_fulltext_index is has_fulltext_index
    assert archive.has_title_index is has_title_index
def test_reader_by_id(all_zims, filename):
    """Entries fetched by numeric id report that same id as their index."""
    archive = Archive(all_zims / filename)

    # test index access
    # NOTE(review): the upper bound leaves the last entry id unchecked;
    # confirm whether that is intentional.
    for entry_id in range(archive.entry_count - 1):
        fetched = archive._get_entry_by_id(entry_id)
        assert fetched._index == entry_id

        resolved_item = archive._get_entry_by_id(entry_id).get_item()
        assert resolved_item._index >= 0
def test_creator_indexing(fpath, lipsum_item, indexing, language, expected):
    """Create a ZIM with the given indexing settings and check the flag.

    The ZIM is written to the ``fpath`` fixture. Previously ``fpath`` was
    overwritten with a hard-coded relative name, so the file was created
    in the current working directory.

    ``expected`` is part of the test's parameters but is not checked here.
    """
    with Creator(fpath).config_indexing(indexing, language) as creator:
        creator.add_item(lipsum_item)

    archive = Archive(fpath)
    assert archive.has_fulltext_index == indexing
def test_content_ref_keep(all_zims):
    """Keep a memoryview on some content while losing the entry/item.

    Many other entries are then loaded to detect a possible use of a
    dangling pointer behind the memoryview.
    """
    archive = Archive(all_zims / "zimfile.zim")

    def fetch_content():
        # entry and item are locals, so only the returned content
        # remains referenced once this function returns
        entry = archive.get_entry_by_path("A/That_Lucky_Old_Sun")
        item = entry.get_item()
        assert isinstance(item.content, memoryview)
        return item.content

    content = fetch_content()
    # Now we have a content but no reference to the entry/item.
    gc.collect()

    # Load a lot of content
    for entry_id in range(0, archive.entry_count, 2):
        other = archive._get_entry_by_id(entry_id)
        if other.is_redirect:
            continue
        _ = other.get_item().content

    # Check everything is ok
    expected_head = (
        b'<!DOCTYPE html>\n<html class="client-js"><head>\n '
        b'<meta charset="UTF-8">\n <title>That Lucky Old Sun<'  # noqa
    )
    assert len(content) == 3559
    assert bytes(content[:100]) == expected_head
def test_fileprovider(fpath, lipsum):
    """A file-backed StaticItem round-trips its content through a ZIM."""
    source_path = fpath.with_name("lipsum.html")
    with open(source_path, "w") as handle:
        for _ in range(10):
            handle.write(lipsum)

    item = StaticItem(
        path=HOME_PATH, filepath=source_path, mimetype="text/html"
    )
    assert HOME_PATH in str(item)
    assert item.get_title() in str(item)

    with Creator(fpath) as creator:
        creator.add_item(item)

    archive = Archive(fpath)
    with open(source_path, "rb") as handle:
        stored = bytes(archive.get_entry_by_path(HOME_PATH).get_item().content)
        assert stored == handle.read()

    # test feed streaming: keep feeding until a zero-sized blob comes back
    provider = item.get_contentprovider()
    while True:
        blob = provider.feed()
        if not blob.size():
            break
        assert isinstance(blob, Blob)
def test_creator_metadata_nooverwrite(fpath, lipsum_item):
    """Adding the same metadata key twice keeps the first value."""
    with Creator(fpath) as creator:
        creator.add_item(lipsum_item)
        creator.add_metadata("Key", "first")
        # re-setting a value prints a warning and ignores it
        creator.add_metadata("Key", "second")

    stored = Archive(fpath).get_metadata("Key")
    assert stored.decode("UTF-8") == "first"
def test_creator_metadata(fpath, lipsum_item):
    """Metadata added through the Creator can be read back from the ZIM.

    All values are added as strings except "Date", which is added as a
    ``datetime.date``; every value is then compared to its string form.
    """
    metadata = {
        # kiwix-mandatory
        "Name": "wikipedia_fr_football",
        "Title": "English Wikipedia",
        "Creator": "English speaking Wikipedia contributors",
        "Publisher": "Wikipedia user Foobar",
        "Date": "2009-11-21",
        "Description": "All articles (without images) from the english Wikipedia",
        "Language": "eng",
        # optional
        "Longdescription": (
            "This ZIM file contains all articles (without images) "
            "from the english Wikipedia by 2009-11-10."
            " The topics are ..."
        ),
        "Licence": "CC-BY",
        "Tags": (
            "wikipedia;_category:wikipedia;_pictures:no;"
            "_videos:no;_details:yes;_ftindex:yes"
        ),
        "Flavour": "nopic",
        "Source": "https://en.wikipedia.org/",
        "Counter": "image/jpeg=5;image/gif=3;image/png=2",
        "Scraper": "sotoki 1.2.3",
    }

    # ensure we can't add if not started
    idle_creator = Creator(fpath)
    with pytest.raises(RuntimeError, match="not started"):
        first_key = next(iter(metadata))
        idle_creator.add_metadata(first_key, metadata[first_key])
    del idle_creator

    with Creator(fpath) as creator:
        creator.add_item(lipsum_item)
        for name, value in metadata.items():
            if name != "Date":
                creator.add_metadata(name, value)

        # the date is passed as a date object instead of a string
        year, month, day = (int(part) for part in metadata["Date"].split("-"))
        creator.add_metadata("Date", datetime.date(year, month, day))

    archive = Archive(fpath)
    for name, value in metadata.items():
        assert archive.get_metadata(name).decode("UTF-8") == value
def test_reader_checksum(all_zims, filename, has_checksum, is_valid):
    """Check the checksum properties and the integrity check of a ZIM.

    Asserts that ``has_checksum`` matches the expectation, that
    ``checksum`` is a string of 32 characters when a checksum is expected
    and empty otherwise, that it differs from the archive's uuid, and
    that ``check()`` returns ``is_valid``.
    """
    archive = Archive(all_zims / filename)

    # verify checksum
    assert archive.has_checksum is has_checksum
    assert isinstance(archive.checksum, str)

    # The expected length is computed first: written inline as
    # ``len(...) == 32 if has_checksum else 0`` the conditional wraps the
    # whole comparison, which turns into ``assert 0`` when has_checksum
    # is falsy.
    expected_length = 32 if has_checksum else 0
    assert len(archive.checksum) == expected_length

    # Compare against the uuid's hex string; comparing the str checksum
    # with the uuid object itself could never be equal.
    assert archive.checksum != archive.uuid.hex
    assert archive.check() is is_valid
def test_creator_mainpath(fpath, lipsum_item):
    """A main entry exists only when the Creator was given a main path."""
    with Creator(fpath).set_mainpath(HOME_PATH) as creator:
        creator.add_item(lipsum_item)

    with_main = Archive(fpath)
    assert with_main.has_main_entry is True
    assert with_main.main_entry.path == "mainPage"
    assert with_main.main_entry.get_item().path == HOME_PATH

    # start over with the same file, this time without a main path
    fpath.unlink()
    with Creator(fpath) as creator:
        creator.add_item(lipsum_item)

    without_main = Archive(fpath)
    assert without_main.has_main_entry is False
    with pytest.raises(RuntimeError):
        assert without_main.main_entry
def test_reader_metadata(
    all_zims, filename, metadata_keys, test_metadata, test_metadata_value
):
    """Metadata keys, and optionally one metadata value, match expectations."""
    archive = Archive(all_zims / filename)

    # the list of keys must match exactly
    assert archive.metadata_keys == metadata_keys

    # a single value is only checked when a key to test was supplied
    if not test_metadata:
        return
    raw_value = archive.get_metadata(test_metadata)
    assert raw_value.decode("UTF-8") == test_metadata_value
def test_creator_faviconpath(fpath, favicon_data):
    """A favicon entry exists only when the Creator was given its path."""
    icon_item = StaticItem(
        mimetype="image/png", path=HOME_PATH, content=favicon_data
    )

    with Creator(fpath).set_faviconpath(HOME_PATH) as creator:
        creator.add_item(icon_item)

    with_icon = Archive(fpath)
    assert with_icon.has_favicon_entry is True
    assert with_icon.favicon_entry.path == "favicon"
    assert with_icon.favicon_entry.get_item().path == HOME_PATH

    # start over with the same file, this time without a favicon path
    fpath.unlink()
    with Creator(fpath) as creator:
        creator.add_item(icon_item)

    without_icon = Archive(fpath)
    assert without_icon.has_favicon_entry is False
    with pytest.raises(RuntimeError):
        assert without_icon.favicon_entry
def test_archive_equality(all_zims):
    """Check Archive equality against archives, subclasses and other objects.

    An archive must equal another Archive (or a plain subclass) opened on
    the same file. It must differ from an archive on another file, from a
    non-Archive object that has a ``filename`` attribute, and from a
    subclass whose ``filename`` property returns something else.
    """

    class Different:
        # not an Archive, but exposes a filename attribute
        def __init__(self, filename):
            self.filename = filename

    class Sub(Archive):
        pass

    class Sub2(Archive):
        # Archive subclass reporting an unrelated filename
        @property
        def filename(self):
            return 1

    zimfile_path = all_zims / "zimfile.zim"
    example_path = all_zims / "example.zim"

    reference = Archive(zimfile_path)
    assert reference != Archive(example_path)
    assert reference == Archive(zimfile_path)
    assert reference != Different(zimfile_path)
    assert reference == Sub(zimfile_path)
    assert reference != Sub2(zimfile_path)
def test_reader_redirect(all_zims, filename, test_redirect, test_redirect_to):
    """Redirect entries are flagged and resolve to the expected target."""
    archive = Archive(all_zims / filename)

    if test_redirect:
        assert archive.get_entry_by_path(test_redirect).is_redirect

        if test_redirect_to:
            redirect = archive.get_entry_by_path(test_redirect)
            assert redirect.get_redirect_entry().path == test_redirect_to
            # make sure get_item resolves it
            assert redirect.get_item().path == test_redirect_to
            # should be last redirect
            assert redirect.get_redirect_entry().is_redirect is False
            # following a redirect from a non-redirect entry must fail
            with pytest.raises(RuntimeError):
                redirect.get_redirect_entry().get_redirect_entry()
def test_stringprovider(fpath, lipsum):
    """A string-backed StaticItem round-trips its content through a ZIM."""
    item = StaticItem(path=HOME_PATH, content=lipsum, mimetype="text/html")
    assert HOME_PATH in str(item)
    assert item.get_title() in str(item)

    with Creator(fpath) as creator:
        creator.add_item(item)

    archive = Archive(fpath)
    stored = bytes(archive.get_entry_by_path(HOME_PATH).get_item().content)
    assert stored == lipsum.encode("UTF-8")

    # test feed streaming: keep feeding until a zero-sized blob comes back
    provider = item.get_contentprovider()
    while True:
        blob = provider.feed()
        if not blob.size():
            break
        assert isinstance(blob, Blob)
def test_reader_archive(all_zims, filename, filesize, new_ns, mutlipart, zim_uuid):
    """Basic archive properties match the file on disk and the expectations."""
    zim_path = all_zims / filename
    archive = Archive(zim_path)

    # check externally verifiable data
    assert archive.filename == zim_path
    assert archive.filesize == os.path.getsize(zim_path)
    if filesize is not None:
        assert archive.filesize == filesize
    assert archive.has_new_namespace_scheme is new_ns
    assert archive.is_multipart is mutlipart
    assert str(zim_path) in str(archive)

    # ensure uuid is returned
    assert isinstance(archive.uuid, uuid.UUID)
    assert len(archive.uuid.hex) == 32
    if zim_uuid:
        assert archive.uuid.hex == zim_uuid
def test_reader_suggest_search(
    all_zims,
    filename,
    entry_count,
    suggestion_string,
    suggestion_count,
    suggestion_result,
    search_string,
    search_count,
    search_result,
):
    """Entry count, suggestions and search results match the expectations."""
    archive = Archive(all_zims / filename)

    assert archive.entry_count == entry_count

    # suggestions: estimated count first, then the actual results
    estimated_suggestions = archive.get_estimated_suggestions_results_count(
        suggestion_string
    )
    assert estimated_suggestions == suggestion_count
    assert list(archive.suggest(suggestion_string)) == suggestion_result

    # search: estimated count first, then the actual results
    estimated_hits = archive.get_estimated_search_results_count(search_string)
    assert estimated_hits == search_count
    assert list(archive.search(search_string)) == search_result
def test_reader_get_entries(
    all_zims,
    filename,
    test_path,
    test_title,
    test_mimetype,
    test_size,
    test_content_includes,
):
    """Look entries up by path and by title and check their properties.

    Unknown paths and titles must raise KeyError. When ``test_path`` is
    given, the entry and its item are checked against the expected title,
    path, mimetype, size and content. When ``test_title`` is given, the
    lookup by title is checked too; its path is compared with the entry
    found by path only if such an entry was fetched.
    """
    zim = Archive(all_zims / filename)

    # entries
    with pytest.raises(KeyError):
        zim.get_entry_by_path("___missing")

    # Stays None when no path is supplied, so the title check below does
    # not read an unbound name.
    entry = None
    if test_path:
        assert zim.has_entry_by_path(test_path)
        entry = zim.get_entry_by_path(test_path)
        assert entry.title == test_title
        assert entry.path == test_path
        assert test_path in str(entry)
        assert test_title in str(entry)

        item = entry.get_item()
        assert item.title == test_title
        assert item.path == test_path
        assert test_path in str(item)
        assert test_title in str(item)
        assert item.mimetype == test_mimetype
        assert item.size == test_size
        assert isinstance(item.content, memoryview)
        assert test_content_includes in bytes(item.content).decode("UTF-8")

    with pytest.raises(KeyError):
        zim.get_entry_by_title("___missing")

    if test_title:
        assert zim.has_entry_by_title(test_title)
        titled_entry = zim.get_entry_by_title(test_title)
        if entry is not None:
            assert titled_entry.path == entry.path
def test_creator_redirection(fpath, lipsum_item):
    """Redirections added via the Creator resolve to their target entry."""
    # ensure we can't add if not started
    idle_creator = Creator(fpath)
    with pytest.raises(RuntimeError, match="not started"):
        idle_creator.add_redirection("home", "hello", HOME_PATH)
    del idle_creator

    with Creator(fpath) as creator:
        creator.add_item(lipsum_item)
        creator.add_redirection("home", "hello", HOME_PATH)
        creator.add_redirection("accueil", "bonjour", HOME_PATH)

    archive = Archive(fpath)
    # one item plus two redirections
    assert archive.entry_count == 3
    assert archive.has_entry_by_path("home") is True
    assert archive.has_entry_by_path("accueil") is True
    assert archive.get_entry_by_path("home").is_redirect

    redirect_target = archive.get_entry_by_path("home").get_redirect_entry()
    assert redirect_target.path == archive.get_entry_by_path(HOME_PATH).path
    assert archive.get_entry_by_path("accueil").get_item().path == HOME_PATH

    # the redirections are suggested for their titles
    assert "home" in list(archive.suggest("hello"))
    assert "accueil" in list(archive.suggest("bonjour"))
def test_reader_main_favicon_entries(
    all_zims, filename, new_ns, has_main_entry, has_favicon_entry
):
    """Main and favicon entries are present or absent as expected.

    Accessing an absent entry must raise RuntimeError. For a present entry
    the path is additionally checked when ``new_ns`` is truthy.
    """
    archive = Archive(all_zims / filename)

    # main entry
    assert archive.has_main_entry is has_main_entry
    if has_main_entry is not False:
        assert archive.main_entry
        if new_ns:
            assert archive.main_entry.path == "mainPath"
    else:
        with pytest.raises(RuntimeError):
            assert archive.main_entry

    # favicon entry
    assert archive.has_favicon_entry is has_favicon_entry
    if has_favicon_entry is not False:
        assert archive.favicon_entry
        if new_ns:
            assert archive.favicon_entry.path == "-/favicon"
    else:
        with pytest.raises(RuntimeError):
            assert archive.favicon_entry
def test_creator_filename(fpath):
    """Creator and Archive both report the path they were created with."""
    with Creator(fpath) as creator:
        assert creator.filename == fpath

    # read the file back once the creator context has exited
    archive = Archive(fpath)
    assert archive.filename == fpath
# Two items to store in the example ZIM file.
item = TestItem("Monadical_SAS", "Monadical", content)
item2 = TestItem("Monadical_2", "Monadical 2", content2)

# A uuid1-based name gives each run its own output file.
zim_file_path = f"kiwix-test-{uuid.uuid1()}.zim"

print(f"Testing writer for {zim_file_path}")
with (
    Creator(zim_file_path)
    .config_indexing(True, "eng")
    .config_minclustersize(512)
) as zc:
    zc.set_mainpath("Monadical")
    zc.add_item(item)
    zc.add_item(item2)
    # metadata names are title-cased and values are encoded to bytes
    for name, value in (
        ("creator", "python-libzim"),
        ("description", "Created in python"),
        ("name", "Hola"),
        ("publisher", "Monadical"),
        ("title", "Test Zim"),
    ):
        zc.add_metadata(name.title(), value.encode("UTF-8"))

# Read the file back and print the entry that was just written.
print("Testing reader")
zim = Archive(zim_file_path)
entry = zim.get_entry_by_path("Monadical")

print(f"Main entry is at {zim.main_entry.get_item().path}")
print(f"Entry {entry.title} at {entry.path} is {entry.get_item().size}b:")
print(bytes(entry.get_item().content).decode("UTF-8"))
# -*- coding:utf-8 -*-
# Open a ZIM archive, then print its main entry path and one entry's content.
from libzim.reader import Archive

# NOTE(review): absolute path under one user's home directory; adjust it
# before running elsewhere.
zim = Archive("/home/xiaohe/wikiZIM/wikipedia_zh_all.zim")
print(f"Main entry is at {zim.main_entry.get_item().path}")

# look a single entry up by its path and dump it as UTF-8 text
entry = zim.get_entry_by_path("性交")
print(
    f"Entry {entry.title} at {entry.path} is {entry.get_item().size}b:"
)
print(
    bytes(entry.get_item().content).decode("UTF-8")
)