Example #1
0
def test_double_close(tmp_path):
    """Check that close() can be called on a creator whose context has exited."""
    zim_path = tmp_path / "test.zim"
    with Creator(zim_path, "welcome", "fra", "My Title") as creator:
        article = StaticArticle(url="A/welcome", content="hello")
        creator.add_zim_article(article)

    # the creator is already closed at this point; closing again must not fail
    creator.close()
Example #2
0
    def setup():
        """Instantiate the shared components and attach them to ``Global``.

        Sets, in this order: ``Global.database``, ``Global.executor``,
        ``Global.img_executor``, ``Global.imager``, ``Global.rewriter``,
        ``Global.renderer`` and ``Global.creator``. Imports are done locally,
        right before each component is built.

        Raises:
            Global.DatabaseException: if ``get_database()`` raises; the
                original exception is attached as the explicit cause.
        """
        # order matters as there are references between them

        from .database import get_database

        try:
            Global.database = get_database()
        except Exception as exc:
            # chain explicitly so the traceback reports the original error
            # as the direct cause of the DatabaseException
            raise Global.DatabaseException(exc) from exc

        # all tasks added to a bound queue processed by workers
        from .executor import SotokiExecutor

        # mostly transforms HTML and sends to zim.
        # tests show no speed improv. beyond 3 workers.
        Global.executor = SotokiExecutor(
            queue_size=10,
            nb_workers=3,
        )

        # images handled on a different queue.
        # mostly network I/O to retrieve and/or upload image.
        # if not in S3 bucket, resize/optimize webp image
        # we should consider using coroutines instead of threads
        Global.img_executor = SotokiExecutor(
            queue_size=200,
            nb_workers=100,
            prefix="IMG-T-",
        )

        from .imager import Imager

        Global.imager = Imager()

        from .html import Rewriter

        Global.rewriter = Rewriter()

        from ..renderer import Renderer

        Global.renderer = Renderer()

        # Global.creator holds whatever config_verbose() returns
        Global.creator = Creator(
            filename=Global.conf.output_dir.joinpath(Global.conf.fname),
            main_path="questions",
            favicon_path="illustration",
            language=Global.conf.iso_lang_3,
            title=Global.conf.title,
            description=Global.conf.description,
            creator=Global.conf.author,
            publisher=Global.conf.publisher,
            name=Global.conf.name,
            tags=";".join(Global.conf.tags),
            date=datetime.date.today(),
        ).config_verbose(True)
Example #3
0
def test_create_without_workaround(tmp_path):
    """With workaround_nocancel=False, adding a non-article raises RuntimeError."""
    zim_path = tmp_path / "test.zim"
    creator_ctx = Creator(
        zim_path,
        "welcome",
        "fra",
        title="My Title",
        workaround_nocancel=False,
    )

    with creator_ctx as creator:
        print("creator", creator, creator.workaround_nocancel)
        # a plain string is not an article: the error is expected to surface
        # as a RuntimeError whose message mentions AttributeError
        expected_error = pytest.raises(RuntimeError, match="AttributeError")
        with expected_error:
            creator.add_zim_article("hello")
Example #4
0
def test_noindexlanguage(tmp_path):
    """A creator built with an empty language yields no full-text index.

    The Language metadata is set afterwards through update_metadata() and
    must be readable from the resulting file; the title index must exist
    while the full-text one must not.
    """
    zim_path = tmp_path / "test.zim"
    with Creator(zim_path, "welcome", "", "My Title") as creator:
        article = StaticArticle(url="A/welcome", content="hello")
        creator.add_zim_article(article)
        creator.update_metadata(language="bam")

    with libzim.reader.File(zim_path) as reader:
        stored_language = reader.get_metadata("Language").decode(UTF8)
        assert stored_language == "bam"
        # title index is present…
        assert reader.get_article("X/title/xapian")
        # …but looking up the full-text index fails
        with pytest.raises(KeyError):
            reader.get_article("X/fulltext/xapian")
Example #5
0
    def run(self):
        """Run the whole scrape: download DB, build the ZIM, clean up.

        Steps, as performed below: optional cache credentials check, database
        download, input sanitization, Creator start, addition of favicon /
        about / CSS / local assets, then node and video processing through
        two executors, and finally ``creator.finish()``.

        Returns:
            int: 0 when every future completed and no exception was caught,
            1 otherwise.

        Raises:
            ValueError: if a cache URL with credentials is configured but
                ``s3_credentials_ok()`` returns a falsy value.
        """
        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        # the conditional applies to the whole (implicitly concatenated) f-string
        s3_msg = (f"  using cache: {self.s3_storage.url.netloc} "
                  f"with bucket: {self.s3_storage.bucket_name}"
                  if self.s3_storage else "")
        logger.info(f"Starting scraper with:\n"
                    f"  channel_id: {self.channel_id}\n"
                    f"  build_dir: {self.build_dir}\n"
                    f"  output_dir: {self.output_dir}\n"
                    f"  using webm : {self.use_webm}\n"
                    f"  low_quality : {self.low_quality}\n"
                    f"{s3_msg}")

        logger.info("Download database")
        self.download_db()

        self.sanitize_inputs()
        # display basic stats
        logger.info(f"  Starting ZIM creation with:\n"
                    f"  filename: {self.fname}\n"
                    f"  title: {self.title}\n"
                    f"  description: {self.description}\n"
                    f"  creator: {self.author}\n"
                    f"  publisher: {self.publisher}\n"
                    f"  tags: {';'.join(self.tags)}")

        logger.info("Setup Zim Creator")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.creator_lock = threading.Lock()
        self.creator = Creator(
            filename=self.output_dir.joinpath(self.fname),
            main_path=self.root_id,
            favicon_path="favicon.png",
            language="eng",
            title=self.title,
            description=self.description,
            creator=self.author,
            publisher=self.publisher,
            name=self.name,
            tags=";".join(self.tags),
        ).start()

        succeeded = False
        try:
            self.add_favicon()
            self.add_custom_about_and_css()

            # add static files
            logger.info("Adding local files (assets)")
            self.add_local_files("assets",
                                 self.templates_dir.joinpath("assets"))

            # setup queue for nodes processing
            self.nodes_futures = {}  # future: node_id
            self.nodes_executor = cf.ThreadPoolExecutor(
                max_workers=self.nb_threads)

            # setup a dedicated queue for videos to convert
            self.videos_futures = {}  # future: src_fname, dst_fpath, path
            self.pending_upload = {}  # path: filepath, key, checksum
            self.videos_executor = cf.ProcessPoolExecutor(
                max_workers=self.nb_processes)

            logger.info("Starting nodes processing")
            self.populate_nodes_executor()

            # await completion of all futures (nodes and videos)
            result = cf.wait(
                self.videos_futures.keys() | self.nodes_futures.keys(),
                return_when=cf.FIRST_EXCEPTION,
            )
            self.nodes_executor.shutdown()
            # properly shutting down the executor should allow processing
            # futures's callbacks (zim addition) as the wait() function
            # only awaits future completion and doesn't include callbacks
            self.videos_executor.shutdown()

            # wait() returned early (first exception) if anything is pending
            succeeded = not result.not_done
        except KeyboardInterrupt:
            self.creator.can_finish = False
            logger.error("KeyboardInterrupt, exiting.")
        except Exception as exc:
            # request Creator not to create a ZIM file on finish
            self.creator.can_finish = False
            # f-string so the actual exception is interpolated in the message
            logger.error(f"Interrupting process due to error: {exc}")
            logger.exception(exc)
        finally:
            if succeeded:
                logger.info("Finishing ZIM file…")
            # we need to release libzim's resources.
            # currently does nothing but crash if can_finish=False but that's awaiting
            # impl. at libkiwix level
            with self.creator_lock:
                self.creator.finish()

        if not self.keep_build_dir:
            logger.info("Removing build folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        return 0 if succeeded else 1
Example #6
0
    def run(self):
        """Convert the configured inputs into a ZIM file.

        Resolves the language, prepares the templating environment and the
        ``<head>`` / CSS inserts, starts the Creator, then adds the replayer,
        the static templates, every WARC record, the pending revisits and the
        illustration (when a favicon URL is known) before finishing the file.

        Returns:
            100 when there are no inputs to process; otherwise the method
            falls through without an explicit return value.
        """
        if not self.inputs:
            logger.info(
                "Arguments valid, no inputs to process. Exiting with error code 100"
            )
            return 100

        self.find_main_page_metadata()

        # make sure Language metadata is ISO-639-3 and setup translations
        language_resolved = False
        try:
            lang_data = get_language_details(self.language)
            self.language = lang_data["iso-639-3"]
            language_resolved = True
            setlocale(
                pathlib.Path(__file__).parent, lang_data.get("iso-639-1"))
        except Exception:
            logger.error(
                f"Invalid language setting `{self.language}`. Using `eng`.")
            if not language_resolved:
                # actually apply the fallback announced in the log message;
                # a language that resolved fine (failure came from setlocale)
                # is kept as-is
                self.language = "eng"

        self.env = self.init_env()

        # init head insert
        template = self.env.get_template(HEAD_INSERT_FILE)
        self.head_insert = ("<head>" + template.render()).encode("utf-8")
        if self.custom_css:
            self.css_insert = (
                f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
                'rel="Stylesheet" />\n</head>').encode("utf-8")
        else:
            self.css_insert = None

        self.creator = Creator(
            self.full_filename,
            main_path="A/index.html",
            language=self.language or "eng",
            title=self.title,
            date=datetime.date.today(),
            **self.metadata,
        ).start()

        self.add_replayer()

        for filename in pkg_resources.resource_listdir("warc2zim",
                                                       "templates"):
            # these two templates are not added as static articles here
            if filename == HEAD_INSERT_FILE or filename == SW_JS:
                continue

            self.creator.add_item(
                StaticArticle(self.env, filename, self.main_url))

        for record in self.iter_all_warc_records():
            self.add_items_for_warc_record(record)

        # process revisits, headers only
        for url, record in self.revisits.items():
            if url not in self.indexed_urls:
                logger.debug("Adding revisit {0} -> {1}".format(
                    url, record.rec_headers["WARC-Refers-To-Target-URI"]))
                self.creator.add_item(WARCHeadersItem(record))
                self.indexed_urls.add(url)

        if self.favicon_url:
            self.add_illustration()

        logger.debug(f"Found {self.total_records} records in WARCs")

        self.creator.finish()
Example #7
0
def test_zim_creator(tmp_path, png_image, css_file, html_file, css_str,
                     html_str):
    """Create a ZIM with CSS, HTML, binary and redirect entries and read it back.

    Checks metadata, main page, redirects, indexes, and that link rewriting
    only happened on the entries added with rewrite_links=True.
    """
    zim_path = tmp_path / "test.zim"
    main_page = "welcome"
    language = "fra"
    title = "My Title"
    tags = ";".join(["toto", "tata"])
    redir_url = "A/ola"
    missing_source = "One of fpath or content is required"

    # (url, source keyword, rewrite_links): string then file, rewritten then verbatim
    css_entries = (
        ("test.css", {"content": css_str}, True),
        ("test2.css", {"content": css_str}, False),
        ("test3.css", {"fpath": css_file}, True),
        ("test4.css", {"fpath": css_file}, False),
    )
    # (url, title, source keyword, rewrite_links), same layout as for CSS
    html_entries = (
        ("welcome", "Welcome", {"content": html_str}, True),
        ("welcome2", "wel2", {"content": html_str}, False),
        ("welcome3", "Wel3", {"fpath": html_file}, True),
        ("welcome4", "wel4", {"fpath": html_file}, False),
    )

    with Creator(zim_path, main_page, language, title=title,
                 tags=tags) as creator:
        for css_url, css_source, rewrite in css_entries:
            creator.add_css(css_url, rewrite_links=rewrite, **css_source)
        for html_url, html_title, html_source, rewrite in html_entries:
            creator.add_article(html_url, html_title,
                                rewrite_links=rewrite, **html_source)

        # single binary image
        creator.add_binary("images/yahoo.png", fpath=png_image)
        # redirects to our main page: without, then with a custom title
        creator.add_redirect("A/home", "A/welcome")
        creator.add_redirect("A/home2", "A/welcome", "Home !!")

        # redirect using StaticArticle
        redirect_article = StaticArticle(url=redir_url,
                                         redirect_url="A/welcome")
        creator.add_zim_article(redirect_article)

        # ensure args requirement are checked
        with pytest.raises(ValueError, match=missing_source):
            creator.add_binary("images/yahoo.png")
        with pytest.raises(ValueError, match=missing_source):
            # private method
            creator._add_rewriten("-", "test3.css", "", "text/css", False,
                                  False, False, None)

    assert zim_path.exists()

    with libzim.reader.File(zim_path) as reader:

        def metadata(name):
            return reader.get_metadata(name).decode(UTF8)

        def links(path, pattern):
            return count_links(reader.get_article(path), pattern)

        def text(path):
            return bytes(reader.get_article(path).content).decode(UTF8)

        assert metadata("Title") == title
        assert metadata("Language") == language
        assert metadata("Tags") == tags
        assert reader.main_page_url == f"A/{main_page}"
        # make sure we have our image
        assert reader.get_article("I/images/yahoo.png")
        # make sure we have our redirects
        assert reader.get_article(redir_url).is_redirect
        redirect_target = reader.get_article(redir_url).get_redirect_article()
        assert redirect_target.longurl == f"A/{main_page}"
        # make sure we have full text and title indexes
        for index_path in ("X/title/xapian", "X/fulltext/xapian"):
            assert reader.get_article(index_path)
        # make sure titles were indexed
        assert "A/home2" in list(reader.suggest("Home !!"))
        # make sure full text was indexed
        assert reader.get_search_results_count("PDF doc") >= 1

        # ensure CSS rewriting is OK
        for rewritten_css in ("-/test.css", "-/test3.css"):
            assert links(rewritten_css, r"../I") == 24

        # ensure non-rewritten stylesheets have not been rewritten
        for verbatim_css in ("-/test2.css", "-/test4.css"):
            assert links(verbatim_css, r"../I") == 0
        for verbatim_css in ("-/test2.css", "-/test4.css"):
            assert text(verbatim_css) == css_str

        # ensure HTML rewriting is OK
        html_expectations = (
            ("A/welcome", r"../A", 2),
            ("A/welcome", r"../-", 2),
            ("A/welcome", r"dest.html", 1),
            ("A/welcome3", r"../I", 2),
            ("A/welcome3", r"../-", 2),
            ("A/welcome3", r"../A", 2),
            ("A/welcome3", r"dest.html", 1),
        )
        for article_path, pattern, expected in html_expectations:
            assert links(article_path, pattern) == expected

        # ensure non-rewritten articles have not been rewritten
        for verbatim_html in ("A/welcome2", "A/welcome4"):
            assert links(verbatim_html, r"../I") == 0
        for verbatim_html in ("A/welcome2", "A/welcome4"):
            assert text(verbatim_html) == html_str