def test_double_close(tmp_path):
    """Check that calling ``close()`` on an already closed Creator is accepted."""
    zim_path = tmp_path / "test.zim"

    with Creator(zim_path, "welcome", "fra", "My Title") as creator:
        welcome = StaticArticle(url="A/welcome", content="hello")
        creator.add_zim_article(welcome)

    # leaving the ``with`` block above closed the creator once already;
    # this extra call must be accepted as well
    creator.close()
def setup():
    """Create the shared objects of the scraper and store them on ``Global``.

    Sets, in this order: ``Global.database``, ``Global.executor``,
    ``Global.img_executor``, ``Global.imager``, ``Global.rewriter``,
    ``Global.renderer`` and ``Global.creator``.

    Raises:
        Global.DatabaseException: if ``get_database()`` fails. The original
            exception is passed as argument and attached as the cause.
    """
    # order matters as there are references between them

    from .database import get_database

    try:
        Global.database = get_database()
    except Exception as exc:
        # chain explicitly so the traceback shows the database error as the
        # direct cause rather than as an incidental "during handling" context
        raise Global.DatabaseException(exc) from exc

    # all tasks added to a bound queue processed by workers
    from .executor import SotokiExecutor

    # mostly transforms HTML and sends to zim.
    # tests show no speed improv. beyond 3 workers.
    Global.executor = SotokiExecutor(
        queue_size=10,
        nb_workers=3,
    )

    # images handled on a different queue.
    # mostly network I/O to retrieve and/or upload image.
    # if not in S3 bucket, resize/optimize webp image
    # we should consider using coroutines instead of threads
    Global.img_executor = SotokiExecutor(
        queue_size=200,
        nb_workers=100,
        prefix="IMG-T-",
    )

    from .imager import Imager

    Global.imager = Imager()

    from .html import Rewriter

    Global.rewriter = Rewriter()

    from ..renderer import Renderer

    Global.renderer = Renderer()

    # Global.creator holds whatever config_verbose() returns
    Global.creator = Creator(
        filename=Global.conf.output_dir.joinpath(Global.conf.fname),
        main_path="questions",
        favicon_path="illustration",
        language=Global.conf.iso_lang_3,
        title=Global.conf.title,
        description=Global.conf.description,
        creator=Global.conf.author,
        publisher=Global.conf.publisher,
        name=Global.conf.name,
        tags=";".join(Global.conf.tags),
        date=datetime.date.today(),
    ).config_verbose(True)
def test_create_without_workaround(tmp_path):
    """Check the error raised when adding a plain string without the workaround.

    With ``workaround_nocancel=False``, passing ``"hello"`` (a ``str``, not an
    article) to ``add_zim_article`` is expected to raise a ``RuntimeError``
    whose message matches ``AttributeError``.
    """
    zim_path = tmp_path / "test.zim"
    creator_ctx = Creator(
        zim_path,
        "welcome",
        "fra",
        title="My Title",
        workaround_nocancel=False,
    )

    with creator_ctx as creator:
        print("creator", creator, creator.workaround_nocancel)

        expected_failure = pytest.raises(RuntimeError, match="AttributeError")
        with expected_failure:
            creator.add_zim_article("hello")
def test_noindexlanguage(tmp_path):
    """Check a ZIM created with an empty language then updated to ``bam``.

    The resulting file must expose ``bam`` as its Language metadata and a
    title index, while looking up the full-text index must raise ``KeyError``.
    """
    zim_path = tmp_path / "test.zim"

    # created with an empty language; the metadata is only set afterwards
    with Creator(zim_path, "welcome", "", "My Title") as creator:
        welcome = StaticArticle(url="A/welcome", content="hello")
        creator.add_zim_article(welcome)
        creator.update_metadata(language="bam")

    with libzim.reader.File(zim_path) as reader:
        stored_language = reader.get_metadata("Language").decode(UTF8)
        assert stored_language == "bam"

        # title index is present...
        assert reader.get_article("X/title/xapian")
        # ...but the full-text one is not
        with pytest.raises(KeyError):
            reader.get_article("X/fulltext/xapian")
def run(self):
    """Run the whole scrape: download the database, build and finish the ZIM.

    Steps, as performed below: check the Optimization Cache credentials,
    download the database, sanitize inputs, start the ZIM ``Creator``, add
    favicon/about/CSS/assets, queue nodes (thread pool) and videos (process
    pool), wait for every future, then finish the creator and optionally
    remove the build folder.

    Returns:
        int: ``0`` when processing completed with no pending future and no
        exception, ``1`` otherwise.

    Raises:
        ValueError: if an S3 URL with credentials is configured but
            ``s3_credentials_ok()`` reports a failure.
    """
    if self.s3_url_with_credentials and not self.s3_credentials_ok():
        raise ValueError("Unable to connect to Optimization Cache. Check its URL.")

    s3_msg = (
        f" using cache: {self.s3_storage.url.netloc} "
        f"with bucket: {self.s3_storage.bucket_name}"
        if self.s3_storage
        else ""
    )
    logger.info(
        f"Starting scraper with:\n"
        f" channel_id: {self.channel_id}\n"
        f" build_dir: {self.build_dir}\n"
        f" output_dir: {self.output_dir}\n"
        f" using webm : {self.use_webm}\n"
        f" low_quality : {self.low_quality}\n"
        f"{s3_msg}"
    )

    logger.info("Download database")
    self.download_db()

    self.sanitize_inputs()
    # display basic stats
    logger.info(
        f" Starting ZIM creation with:\n"
        f" filename: {self.fname}\n"
        f" title: {self.title}\n"
        f" description: {self.description}\n"
        f" creator: {self.author}\n"
        f" publisher: {self.publisher}\n"
        f" tags: {';'.join(self.tags)}"
    )

    logger.info("Setup Zim Creator")
    self.output_dir.mkdir(parents=True, exist_ok=True)

    self.creator_lock = threading.Lock()
    self.creator = Creator(
        filename=self.output_dir.joinpath(self.fname),
        main_path=self.root_id,
        favicon_path="favicon.png",
        language="eng",
        title=self.title,
        description=self.description,
        creator=self.author,
        publisher=self.publisher,
        name=self.name,
        tags=";".join(self.tags),
    ).start()

    succeeded = False
    try:
        self.add_favicon()
        self.add_custom_about_and_css()

        # add static files
        logger.info("Adding local files (assets)")
        self.add_local_files("assets", self.templates_dir.joinpath("assets"))

        # setup queue for nodes processing
        self.nodes_futures = {}  # future: node_id
        self.nodes_executor = cf.ThreadPoolExecutor(max_workers=self.nb_threads)

        # setup a dedicated queue for videos to convert
        self.videos_futures = {}  # future: src_fname, dst_fpath, path
        self.pending_upload = {}  # path: filepath, key, checksum
        self.videos_executor = cf.ProcessPoolExecutor(max_workers=self.nb_processes)

        logger.info("Starting nodes processing")
        self.populate_nodes_executor()

        # await completion of all futures (nodes and videos)
        result = cf.wait(
            self.videos_futures.keys() | self.nodes_futures.keys(),
            return_when=cf.FIRST_EXCEPTION,
        )
        self.nodes_executor.shutdown()
        # properly shutting down the executor should allow processing
        # futures's callbacks (zim addition) as the wait() function
        # only awaits future completion and doesn't include callbacks
        self.videos_executor.shutdown()

        # wait() returned early on first exception if anything is still pending
        succeeded = not result.not_done
    except KeyboardInterrupt:
        self.creator.can_finish = False
        logger.error("KeyboardInterrupt, exiting.")
    except Exception as exc:
        # request Creator not to create a ZIM file on finish
        self.creator.can_finish = False
        # must be an f-string, otherwise the literal "{exc}" is logged
        logger.error(f"Interrupting process due to error: {exc}")
        logger.exception(exc)
    finally:
        if succeeded:
            logger.info("Finishing ZIM file…")
        # we need to release libzim's resources.
        # currently does nothing but crash if can_finish=False but that's awaiting
        # impl. at libkiwix level
        with self.creator_lock:
            self.creator.finish()

    if not self.keep_build_dir:
        logger.info("Removing build folder")
        shutil.rmtree(self.build_dir, ignore_errors=True)

    return 0 if succeeded else 1
def run(self):
    """Convert the WARC inputs into a ZIM file.

    Resolves the language, prepares the head/CSS inserts, starts the ZIM
    ``Creator``, adds the replayer and static templates, then every WARC
    record, the not-yet-indexed revisits and the illustration (when a favicon
    URL is known) before finishing the ZIM.

    Returns:
        int | None: ``100`` when there is no input to process; otherwise the
        function ends without an explicit return value.
    """
    if not self.inputs:
        logger.info(
            "Arguments valid, no inputs to process. Exiting with error code 100"
        )
        return 100

    self.find_main_page_metadata()

    # make sure Language metadata is ISO-639-3 and setup translations
    language_resolved = False
    try:
        lang_data = get_language_details(self.language)
        self.language = lang_data["iso-639-3"]
        language_resolved = True
        setlocale(pathlib.Path(__file__).parent, lang_data.get("iso-639-1"))
    except Exception:
        logger.error(f"Invalid language setting `{self.language}`. Using `eng`.")
        if not language_resolved:
            # actually apply the announced fallback: without this the invalid
            # value would be handed to the Creator as-is. A language that was
            # resolved (failure came from setlocale) is left untouched.
            self.language = "eng"

    self.env = self.init_env()

    # init head insert
    template = self.env.get_template(HEAD_INSERT_FILE)
    self.head_insert = ("<head>" + template.render()).encode("utf-8")
    if self.custom_css:
        self.css_insert = (
            f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
            'rel="Stylesheet" />\n</head>'
        ).encode("utf-8")
    else:
        self.css_insert = None

    self.creator = Creator(
        self.full_filename,
        main_path="A/index.html",
        language=self.language or "eng",
        title=self.title,
        date=datetime.date.today(),
        **self.metadata,
    ).start()

    self.add_replayer()

    for filename in pkg_resources.resource_listdir("warc2zim", "templates"):
        # these two templates are skipped here and not added as static articles
        if filename in (HEAD_INSERT_FILE, SW_JS):
            continue

        self.creator.add_item(StaticArticle(self.env, filename, self.main_url))

    for record in self.iter_all_warc_records():
        self.add_items_for_warc_record(record)

    # process revisits, headers only
    for url, record in self.revisits.items():
        if url not in self.indexed_urls:
            target = record.rec_headers["WARC-Refers-To-Target-URI"]
            logger.debug(f"Adding revisit {url} -> {target}")
            self.creator.add_item(WARCHeadersItem(record))
            self.indexed_urls.add(url)

    if self.favicon_url:
        self.add_illustration()

    logger.debug(f"Found {self.total_records} records in WARCs")

    self.creator.finish()
def test_zim_creator(tmp_path, png_image, css_file, html_file, css_str, html_str):
    """Build a ZIM with CSS, HTML, binary and redirect entries, then read it back.

    The second half re-opens the file and checks metadata, main page,
    redirects, indexes, and that link rewriting happened only for the entries
    added with ``rewrite_links=True``.
    """
    zim_path = tmp_path / "test.zim"
    main_page = "welcome"
    language = "fra"
    title = "My Title"
    tags = ";".join(("toto", "tata"))
    redir_url = "A/ola"
    missing_source_msg = "One of fpath or content is required"

    with Creator(zim_path, main_page, language, title=title, tags=tags) as creator:
        # CSS: from string (rewritten, then verbatim)
        creator.add_css("test.css", content=css_str, rewrite_links=True)
        creator.add_css("test2.css", content=css_str, rewrite_links=False)
        # CSS: from file (rewritten, then verbatim)
        creator.add_css("test3.css", fpath=css_file, rewrite_links=True)
        creator.add_css("test4.css", fpath=css_file, rewrite_links=False)

        # HTML: from string (rewritten, then verbatim)
        creator.add_article(
            "welcome", "Welcome", content=html_str, rewrite_links=True
        )
        creator.add_article(
            "welcome2", "wel2", content=html_str, rewrite_links=False
        )
        # HTML: from file (rewritten, then verbatim)
        creator.add_article(
            "welcome3", "Wel3", fpath=html_file, rewrite_links=True
        )
        creator.add_article(
            "welcome4", "wel4", fpath=html_file, rewrite_links=False
        )

        # single binary image
        creator.add_binary("images/yahoo.png", fpath=png_image)

        # redirects to our main page: without title, then with a custom one
        creator.add_redirect("A/home", "A/welcome")
        creator.add_redirect("A/home2", "A/welcome", "Home !!")

        # redirect using StaticArticle
        redirect_article = StaticArticle(url=redir_url, redirect_url="A/welcome")
        creator.add_zim_article(redirect_article)

        # ensure args requirement are checked
        with pytest.raises(ValueError, match=missing_source_msg):
            creator.add_binary("images/yahoo.png")

        with pytest.raises(ValueError, match=missing_source_msg):
            # private method
            creator._add_rewriten(
                "-", "test3.css", "", "text/css", False, False, False, None
            )

    assert zim_path.exists()

    with libzim.reader.File(zim_path) as reader:
        # metadata, checked in this order
        expected_metadata = (
            ("Title", title),
            ("Language", language),
            ("Tags", tags),
        )
        for key, expected in expected_metadata:
            assert reader.get_metadata(key).decode(UTF8) == expected

        main_page_url = f"A/{main_page}"
        assert reader.main_page_url == main_page_url

        # make sure we have our image
        assert reader.get_article("I/images/yahoo.png")

        # make sure we have our redirects
        assert reader.get_article(redir_url).is_redirect
        redirect_target = reader.get_article(redir_url).get_redirect_article()
        assert redirect_target.longurl == main_page_url

        # make sure we have full text and title indexes
        for index_url in ("X/title/xapian", "X/fulltext/xapian"):
            assert reader.get_article(index_url)

        # make sure titles were indexed
        assert "A/home2" in list(reader.suggest("Home !!"))

        # make sure full text was indexed
        assert reader.get_search_results_count("PDF doc") >= 1

        # CSS: rewritten entries first, then the verbatim ones
        css_link_counts = (
            ("-/test.css", r"../I", 24),
            ("-/test3.css", r"../I", 24),
            ("-/test2.css", r"../I", 0),
            ("-/test4.css", r"../I", 0),
        )
        for url, pattern, expected in css_link_counts:
            assert count_links(reader.get_article(url), pattern) == expected

        # verbatim CSS must be stored exactly as provided
        for url in ("-/test2.css", "-/test4.css"):
            assert bytes(reader.get_article(url).content).decode(UTF8) == css_str

        # HTML: rewritten entries first, then the verbatim ones
        html_link_counts = (
            ("A/welcome", r"../A", 2),
            ("A/welcome", r"../-", 2),
            ("A/welcome", r"dest.html", 1),
            ("A/welcome3", r"../I", 2),
            ("A/welcome3", r"../-", 2),
            ("A/welcome3", r"../A", 2),
            ("A/welcome3", r"dest.html", 1),
            ("A/welcome2", r"../I", 0),
            ("A/welcome4", r"../I", 0),
        )
        for url, pattern, expected in html_link_counts:
            assert count_links(reader.get_article(url), pattern) == expected

        # verbatim HTML must be stored exactly as provided
        for url in ("A/welcome2", "A/welcome4"):
            assert bytes(reader.get_article(url).content).decode(UTF8) == html_str