Beispiel #1
0
def test_double_close(tmp_path):
    """Calling close() on a Creator that was already closed must not fail."""
    zim_path = tmp_path / "test.zim"
    with Creator(zim_path, "welcome", "fra", "My Title") as creator:
        article = StaticArticle(url="A/welcome", content="hello")
        creator.add_zim_article(article)

    # the creator is already closed at this point: closing again must be harmless
    creator.close()
Beispiel #2
0
    def setup():
        """Create the shared components and store them on `Global`.

        Sets, in this order: `Global.database`, `Global.executor`,
        `Global.img_executor`, `Global.imager`, `Global.rewriter`,
        `Global.renderer` and `Global.creator`.

        Raises:
            Global.DatabaseException: if `get_database()` fails; the original
                exception is attached as its cause.
        """
        # order matters as there are references between them

        from .database import get_database

        try:
            Global.database = get_database()
        except Exception as exc:
            # chain explicitly so the traceback shows the underlying failure
            # as the direct cause of the DatabaseException
            raise Global.DatabaseException(exc) from exc

        # all tasks added to a bound queue processed by workers
        from .executor import SotokiExecutor

        # mostly transforms HTML and sends to zim.
        # tests show no speed improv. beyond 3 workers.
        Global.executor = SotokiExecutor(
            queue_size=10,
            nb_workers=3,
        )

        # images handled on a different queue.
        # mostly network I/O to retrieve and/or upload image.
        # if not in S3 bucket, resize/optimize webp image
        # we should consider using coroutines instead of threads
        Global.img_executor = SotokiExecutor(
            queue_size=200,
            nb_workers=100,
            prefix="IMG-T-",
        )

        from .imager import Imager

        Global.imager = Imager()

        from .html import Rewriter

        Global.rewriter = Rewriter()

        from ..renderer import Renderer

        Global.renderer = Renderer()

        # Global.creator holds whatever config_verbose() returns
        Global.creator = Creator(
            filename=Global.conf.output_dir.joinpath(Global.conf.fname),
            main_path="questions",
            favicon_path="illustration",
            language=Global.conf.iso_lang_3,
            title=Global.conf.title,
            description=Global.conf.description,
            creator=Global.conf.author,
            publisher=Global.conf.publisher,
            name=Global.conf.name,
            tags=";".join(Global.conf.tags),
            date=datetime.date.today(),
        ).config_verbose(True)
Beispiel #3
0
def test_create_without_workaround(tmp_path):
    """Without the no-cancel workaround, adding a non-article raises RuntimeError."""
    zim_path = tmp_path / "test.zim"
    creator_cm = Creator(
        zim_path,
        "welcome",
        "fra",
        title="My Title",
        workaround_nocancel=False,
    )

    with creator_cm as creator:
        print("creator", creator, creator.workaround_nocancel)
        # a plain string is not an article; the error surfaces as a RuntimeError
        # whose message mentions AttributeError
        with pytest.raises(RuntimeError, match="AttributeError"):
            creator.add_zim_article("hello")
Beispiel #4
0
def test_noindexlanguage(tmp_path):
    """A Creator given an empty language builds a title index but no fulltext one.

    The Language metadata set afterwards through update_metadata() is still
    written to the file.
    """
    zim_path = tmp_path / "test.zim"
    with Creator(zim_path, "welcome", "", "My Title") as creator:
        page = StaticArticle(url="A/welcome", content="hello")
        creator.add_zim_article(page)
        creator.update_metadata(language="bam")

    with libzim.reader.File(zim_path) as reader:
        language = reader.get_metadata("Language").decode(UTF8)
        assert language == "bam"
        # title index is present…
        assert reader.get_article("X/title/xapian")
        # …but the fulltext index is not
        with pytest.raises(KeyError):
            reader.get_article("X/fulltext/xapian")
Beispiel #5
0
class Kolibri2Zim:
    def __init__(self, **kwargs):
        """Store the scraper options and prepare its working environment

        Every name listed in the module-level `options` must be present in
        {kwargs} (its value may be None).

        Side effects: creates `tmp_dir` if one is given, creates a fresh
        temporary build folder inside it and sets up the jinja2 environment.

        Raises ValueError if an expected option is missing from {kwargs}"""

        for option in options:
            if option not in kwargs:
                raise ValueError(f"Missing parameter `{option}`")

        def go(option):
            return kwargs.get(option)

        self.channel_id = go("channel_id")
        self.root_id = go("root_id")

        # video-encoding info
        self.use_webm = go("use_webm")
        self.low_quality = go("low_quality")
        self.autoplay = go("autoplay")

        # zim params
        self.fname = go("fname")
        raw_tags = go("tags")
        # comma-separated list; blank entries are dropped as "".split(",")
        # returns [""] and "a,,b" contains an empty item
        self.tags = (
            []
            if raw_tags is None
            else [tag.strip() for tag in raw_tags.split(",") if tag.strip()]
        )
        self.title = go("title")
        self.description = go("description")
        self.author = go("creator")
        self.publisher = go("publisher")
        self.name = go("name")

        # customization
        self.favicon = go("favicon")
        self.about = go("about")
        self.css = go("css")

        # directory setup
        self.output_dir = Path(go("output_dir")).expanduser().resolve()
        if go("tmp_dir"):
            Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))

        # performances options
        self.nb_threads = go("threads")
        self.nb_processes = go("processes")
        self.s3_url_with_credentials = go("s3_url_with_credentials")
        # set by s3_credentials_ok() when a cache URL was provided
        self.s3_storage = None

        # debug/developer options
        self.keep_build_dir = go("keep_build_dir")
        self.debug = go("debug")
        self.only_topics = go("only_topics")

        # jinja2 environment setup
        self.jinja2_env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)),
            autoescape=True,
        )

    @property
    def templates_dir(self):
        """ path to the `templates` folder located in ROOT_DIR """
        return ROOT_DIR / "templates"

    def add_local_files(self, root_path, folder):
        """ recursively add local files from {folder} starting at {path} """
        non_front = ("viewer.html", "epub_embed.html")
        for fpath in folder.iterdir():
            path = "/".join([root_path, fpath.name])
            if fpath.is_file():
                mimetype = "text/html;raw=true" if fpath.name in non_front else None
                self.creator.add_item_for(path=path,
                                          title="",
                                          fpath=fpath,
                                          mimetype=mimetype)
                logger.debug(f"Adding {path}")
            else:
                self.add_local_files(path, fpath)

    def populate_nodes_executor(self):
        """Loop on content nodes to create zim entries from kolibri DB"""
        def schedule_node(item):
            future = self.nodes_executor.submit(self.add_node, item=item)
            self.nodes_futures.update({future: item[0]})

        # schedule root-id
        schedule_node((self.db.root["id"], self.db.root["kind"]))

        # fill queue with (node_id, kind) tuples for all root node's descendants
        for node in self.db.get_node_descendants(self.root_id):
            schedule_node((node["id"], node["kind"]))

    def add_node(self, item):
        """ process a content node from the tuple in queue """
        node_id, kind = item
        # check if we have a handler for this {kind} of node
        handler = getattr(self, f"add_{kind}_node", None)

        # debug espace
        if self.only_topics and kind != "topic":
            return

        if handler:
            # add thumbnail to zim if there's one for this node
            thumbnail = self.db.get_node_thumbnail(node_id)
            if thumbnail:
                self.funnel_file(thumbnail["id"], thumbnail["ext"])
            # fire the add_{kind}_node() method which will actually process it
            handler(node_id)

    def funnel_file(self, fid, fext):
        """ add a Kolibri file to the ZIM straight from its URL, keeping its name """
        source_url, filename = get_kolibri_url_for(fid, fext)
        with self.creator_lock:
            item = URLItem(url=source_url, path=filename)
            self.creator.add_item(item)
        logger.debug(f"Added {filename} from Studio")

    def download_to_disk(self, file_id, ext):
        """ fetch a Kolibri file into the build-dir (named as on Kolibri), return its path """
        source_url, filename = get_kolibri_url_for(file_id, ext)
        target = self.build_dir.joinpath(filename)
        stream_file(source_url, target)
        return target

    def funnel_from_s3(self, file_id, path, checksum, preset):
        """whether it could fetch and add the file from S3 cache

        - checks if a cache is configured
        - checks if file is present
        - checks if file is valid (corresponds to same original file)
        - downloads and add to zim

        returns True is all this succeeded, False otherwise"""
        if not self.s3_storage:
            return False

        key = self.s3_key_for(file_id, preset)

        # exit early if we don't have this object in bucket
        if not self.s3_storage.has_object_matching(
                key,
                meta={
                    "checksum": checksum,
                    "encoder_version": str(preset.VERSION)
                }):
            return False

        # download file into memory
        fileobj = io.BytesIO()
        try:
            self.s3_storage.download_fileobj(key, fileobj)
        except Exception as exc:
            logger.error(f"failed to download {key} from cache: {exc}")
            logger.exception(exc)
            # make sure we fallback to re-encode
            return False

        # add to zim
        with self.creator_lock:
            self.creator.add_item(
                StaticItem(
                    path=path,
                    fileobj=fileobj,
                    mimetype=preset.mimetype,
                ))
        logger.debug(f"Added {path} from S3::{key}")
        return True

    def s3_key_for(self, file_id, preset):
        """compute in-bucket key for file"""
        return f"{file_id[0]}/{file_id[1]}/{file_id}/{type(preset).__name__.lower()}"

    def upload_to_s3(self, key, fpath, **meta):
        """ whether it successfully uploaded to cache """
        if not self.s3_storage:
            return

        logger.debug(f"Uploading {fpath.name} to S3::{key} with {meta}")
        try:
            self.s3_storage.upload_file(fpath, key, meta=meta)
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        return True

    def add_topic_node(self, node_id):
        """Render the `topic.html` page of a topic node and add it to the zim

        Topic nodes carry no file: they only hold metadata and give the
        hierarchy its structure"""

        # parents are requested for the breadcrumb, children to link to them
        node = self.db.get_node(node_id, with_parents=True, with_children=True)

        template = self.jinja2_env.get_template("topic.html")
        page = template.render(node_id=node_id, **node)

        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=page,
                mimetype="text/html",
            )
        logger.debug(f"Added topic #{node_id}")

    def add_video_node(self, node_id):
        """Add content from this `video` node to zim

        video node is composed of (1) or (2) videos files and optional subtitle files
        video files are at most one of each `high_res_video` or `low_res_video`
        subtitle files (`video_subtitle`) are VTT files and are only limited by the
        number of language to select from in kolibri studio"""

        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(files, key=lambda f: f["prio"])
        it = filter(lambda f: f["supp"] == 0, files)

        try:
            # find main video file
            video_file = next(it)
        except StopIteration:
            # we have no video file
            return

        try:
            alt_video_file = next(it)
        except StopIteration:
            # we have no supplementary video file (which is OK)
            alt_video_file = None

        # now decide which file to keep and what to do with it

        # content_file has a 1:1 rel with content_localfile which is thre
        # *implementation* of the file. We use that local file ID (its checksum)
        # everywhere BUT as S3 cache ID as we want to overwrite the same key
        # should a new version of the localfile for the same file arrives.
        vid = video_file["id"]  # the local file ID (current version)
        vfid = video_file["fid"]  # the file ID in DB (version agnostic)
        vchk = video_file["checksum"]

        # we'll reencode, using the best file with appropriate preset
        if self.use_webm:
            preset = VideoWebmLow() if self.low_quality else VideoWebmHigh()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):

                # download original video
                src = self.download_to_disk(vid, video_file["ext"])
                dst = src.with_suffix(".webm")

                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want low-q but no webm yet don't have low_res file, let's reencode
        elif self.low_quality and alt_video_file is None:
            preset = VideoMp4Low()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):

                # download original video
                src = self.download_to_disk(vid, video_file["ext"])

                # move source file to a new name and swap variables so our target will
                # be the previously source one
                src_ = src.with_suffix(f"{src.suffix}.orig")
                shutil.move(src, src_)
                dst = src
                src = src_

                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want mp4, either in high-q or we have a low_res file to use
        else:
            video_file = alt_video_file if self.low_quality else video_file
            self.funnel_file(video_file["id"], video_file["ext"])
            video_filename = filename_for(video_file)
            video_filename_ext = video_file["ext"]

        # prepare list of subtitles for template
        subtitles = []
        for file in filter(lambda f: f["preset"] == "video_subtitle", files):
            self.funnel_file(file["id"], file["ext"])
            try:
                local, english = find_language_names(file["lang"])
            except Exception:
                english = file["lang"]
            finally:
                subtitles.append({
                    "code": file["lang"],
                    "name": english,
                    "filename": filename_for(file),
                })

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("video.html").render(
            node_id=node_id,
            video_filename=video_filename,
            video_filename_ext=video_filename_ext,
            subtitles=sorted(subtitles, key=lambda i: i["code"]),
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_video_upon_completion(self, future):
        """adds the converted video inside this future to the zim

        logs error in case of failure"""
        if future.cancelled():
            return
        src_fname, dst_fpath, path = self.videos_futures.get(future)

        try:
            future.result()
        except Exception as exc:
            logger.error(f"Error re-encoding {src_fname}: {exc}")
            logger.exception(exc)
            return

        logger.debug(f"Re-encoded {src_fname} successfuly")

        kwargs = {
            "path": path,
            "filepath": dst_fpath,
            "mimetype": get_file_mimetype(dst_fpath),
        }
        # we shall request s3 upload on the threads pool, only once item has been
        # added to ZIM so it can be removed altogether
        if self.s3_storage:
            kwargs.update({"callback": self.request_s3_upload_and_removal})

        # simply add the item, autodeleting the file
        else:
            kwargs.update({"remove": True})

        with self.creator_lock:
            self.creator.add_item(StaticItem(**kwargs))
        logger.debug(f"Added {path} from re-encoded file")

    def convert_and_add_video_aside(self, file_id, src_fpath, src_checksum,
                                    dest_fpath, path, preset):
        """Submit the re-encoding of a video to the videos executor

        Also records what is needed once it completes: source name, target
        file and zim path (`videos_futures`) and the S3 key and metadata to
        upload it with (`pending_upload`)"""

        future = self.videos_executor.submit(
            reencode,
            src_path=src_fpath,
            dst_path=dest_fpath,
            ffmpeg_args=preset.to_ffmpeg_args(),
            delete_src=True,
            with_process=False,
            failsafe=False,
        )
        self.videos_futures[future] = (src_fpath.name, dest_fpath, path)

        s3_key = self.s3_key_for(file_id, preset)
        upload_meta = {
            "checksum": src_checksum,
            "encoder_version": str(preset.VERSION),
        }
        self.pending_upload[path] = (dest_fpath, s3_key, upload_meta)

        # registered last: both records above exist when the callback fires
        future.add_done_callback(self.add_video_upon_completion)

    def request_s3_upload_and_removal(self, item):
        """ add file from item to uploads list """
        path = item.path
        del item
        dest_fpath, key, meta = self.pending_upload.get(path)
        # TODO: submit to a thread executor (to create) instead
        # this is currently called on main-tread.
        self.upload_to_s3(key, dest_fpath, **meta)

    def add_audio_node(self, node_id):
        """Add content from this `audio` node to zim

        audio node are composed of a single mp3 file"""
        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return
        self.funnel_file(file["id"], file["ext"])

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("audio.html").render(
            node_id=node_id,
            filename=filename_for(file),
            ext=file["ext"],
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_exercise_node(self, node_id):
        """Placeholder for `exercise` nodes: nothing is added, a warning is logged

        exercise node is composed of a single perseus file: a ZIP containing
        an exercise.json entrypoint and other files

        supporting it would mean adding the perseus file in the ZIM along with
        the perseus reader from https://github.com/Khan/perseus"""
        message = f"[NOT SUPPORTED] not adding exercice node {node_id}"
        logger.warning(message)

    def add_document_node(self, node_id):
        """Add content from this `document` node to zim

        document node is composed of one main (`priority` 1) file and
        an optionnal (`priority` 2) file
        Format for each is either `pdf` (`document` preset) or `epub` (`epub` preset)


        - add the actual PDF/epub files to zim at /{node_id}.{ext} (files' IDs)
        - add an HTML page linking to files for download
        - includes an iframe with the appropriate viewer
         - using pdf.js for PDF
         - using epub.js for EPUB
        - add an additional page for the alternate document with its viewer
        """
        def target_for(file):
            filename = filename_for(file)
            if file["ext"] == "pdf":
                return f"./assets/pdfjs/web/viewer.html?file=../../../{filename}"
            if file["ext"] == "epub":
                return f"./assets/epub_embed.html?url=../{filename}"

        # record the actual document
        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(filter(lambda f: f["supp"] == 0, files),
                       key=lambda f: f["prio"])
        it = iter(files)

        try:
            main_document = next(it)
        except StopIteration:
            return

        try:
            alt_document = next(it)
        except StopIteration:
            alt_document = None

        for file in files:
            self.funnel_file(file["id"], file["ext"])
            file["target"] = target_for(file)

        node = self.db.get_node(node_id, with_parents=True)
        # convert generator to list as we might read it twice
        node["parents"] = list(node["parents"])

        # generate page once for each document, changing only `is_alt`
        if alt_document:
            options = [False, True]
        else:
            options = [False]  # main_document only

        for is_alt in options:
            html = self.jinja2_env.get_template("document.html").render(
                node_id=node_id,
                main_document=filename_for(main_document),
                main_document_ext=main_document["ext"],
                alt_document=filename_for(alt_document)
                if alt_document else None,
                alt_document_ext=alt_document["ext"] if alt_document else None,
                target=target_for(alt_document if is_alt else main_document),
                is_alt=is_alt,
                **node,
            )
            with self.creator_lock:
                path = node_id
                if is_alt:
                    path += "_alt"
                self.creator.add_item_for(
                    path=path,
                    title=node["title"],
                    content=html,
                    mimetype="text/html",
                )

    def add_html5_node(self, node_id):
        """Add content from this `html5` node to zim

        html5 node is single ZIP file containing a standalone HTML app
        which entrypoint is a file named index.html

        we extract and add each file from the ZIP to /{node_id}/

        Note: Studio doesn't enforce the mandatory index.html, thus allowing invalid
        html5 app (unreachable)"""

        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return

        # download ZIP file to memory
        ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
        ark_data = io.BytesIO()
        stream_file(url=ark_url, byte_stream=ark_data)

        # loop over zip members and create an entry for each
        zip_ark = zipfile.ZipFile(ark_data)
        for ark_member in zip_ark.namelist():
            with self.creator_lock:
                self.creator.add_item_for(
                    path=f"{node_id}/{ark_member}",
                    content=zip_ark.open(ark_member).read(),
                )

    def run(self):
        """Run the whole scraper: download DB, create the ZIM, process all nodes

        returns 0 if all scheduled futures completed, 1 otherwise

        Raises ValueError if an S3 cache URL was given but its credentials
        check failed (or if the requested filename is invalid, see
        sanitize_inputs())"""
        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        s3_msg = (f"  using cache: {self.s3_storage.url.netloc} "
                  f"with bucket: {self.s3_storage.bucket_name}"
                  if self.s3_storage else "")
        logger.info(f"Starting scraper with:\n"
                    f"  channel_id: {self.channel_id}\n"
                    f"  build_dir: {self.build_dir}\n"
                    f"  output_dir: {self.output_dir}\n"
                    f"  using webm : {self.use_webm}\n"
                    f"  low_quality : {self.low_quality}\n"
                    f"{s3_msg}")

        logger.info("Download database")
        self.download_db()

        self.sanitize_inputs()
        # display basic stats
        logger.info(f"  Starting ZIM creation with:\n"
                    f"  filename: {self.fname}\n"
                    f"  title: {self.title}\n"
                    f"  description: {self.description}\n"
                    f"  creator: {self.author}\n"
                    f"  publisher: {self.publisher}\n"
                    f"  tags: {';'.join(self.tags)}")

        logger.info("Setup Zim Creator")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # lock taken by every method adding to the creator from worker threads
        self.creator_lock = threading.Lock()
        self.creator = Creator(
            filename=self.output_dir.joinpath(self.fname),
            main_path=self.root_id,
            favicon_path="favicon.png",
            language="eng",
            title=self.title,
            description=self.description,
            creator=self.author,
            publisher=self.publisher,
            name=self.name,
            tags=";".join(self.tags),
        ).start()

        succeeded = False
        try:
            self.add_favicon()
            self.add_custom_about_and_css()

            # add static files
            logger.info("Adding local files (assets)")
            self.add_local_files("assets",
                                 self.templates_dir.joinpath("assets"))

            # setup queue for nodes processing
            self.nodes_futures = {}  # future: node_id
            self.nodes_executor = cf.ThreadPoolExecutor(
                max_workers=self.nb_threads)

            # setup a dedicated queue for videos to convert
            self.videos_futures = {}  # future: src_fname, dst_fpath, path
            self.pending_upload = {}  # path: filepath, key, checksum
            self.videos_executor = cf.ProcessPoolExecutor(
                max_workers=self.nb_processes)

            logger.info("Starting nodes processing")
            self.populate_nodes_executor()

            # await completion of all futures (nodes and videos)
            # NOTE(review): the set of futures is built when wait() is called;
            # video futures registered afterwards by node workers are not part
            # of it and are only awaited by the shutdown() below. Also, a future
            # that raised counts as done, not as not_done — confirm intended.
            result = cf.wait(
                self.videos_futures.keys() | self.nodes_futures.keys(),
                return_when=cf.FIRST_EXCEPTION,
            )
            self.nodes_executor.shutdown()
            # properly shutting down the executor should allow processing
            # futures's callbacks (zim addition) as the wait() function
            # only awaits future completion and doesn't include callbacks
            self.videos_executor.shutdown()

            succeeded = not result.not_done
        except KeyboardInterrupt:
            self.creator.can_finish = False
            logger.error("KeyboardInterrupt, exiting.")
        except Exception as exc:
            # request Creator not to create a ZIM file on finish
            self.creator.can_finish = False
            # f-string (was a plain string) so the error is actually displayed
            logger.error(f"Interrupting process due to error: {exc}")
            logger.exception(exc)
        finally:
            if succeeded:
                logger.info("Finishing ZIM file…")
            # we need to release libzim's resources.
            # currently does nothing but crash if can_finish=False but that's awaiting
            # impl. at libkiwix level
            with self.creator_lock:
                self.creator.finish()

        if not self.keep_build_dir:
            logger.info("Removing build folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        return 0 if succeeded else 1

    def s3_credentials_ok(self):
        """Set up `s3_storage` from the cache URL and check its credentials

        returns True if the credentials check passed, False (after logging
        connection details) otherwise"""
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        granted = self.s3_storage.check_credentials(
            list_buckets=True,
            bucket=True,
            write=True,
            read=True,
            failsafe=True,
        )
        if granted:
            return True

        logger.error("S3 cache connection error testing permissions.")
        logger.error(f"  Server: {self.s3_storage.url.netloc}")
        logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
        logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
        logger.error(f"  Public IP: {get_public_ip()}")
        return False

    def download_db(self):
        """Download the channel's database from Studio into build-dir and open it

        Also replaces root_id with the value exposed by the database object"""
        db_path = self.build_dir / "db.sqlite3"
        logger.debug(f"Downloading database into {db_path.name}…")
        db_url = f"{STUDIO_URL}/content/databases/{self.channel_id}.sqlite3"
        stream_file(db_url, db_path)

        self.db = KolibriDB(db_path, self.root_id)
        self.root_id = self.db.root_id

    def sanitize_inputs(self):
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        # input  & metadata sanitation
        period = datetime.datetime.now().strftime("%Y-%m")
        if self.fname:
            # make sure we were given a filename and not a path
            self.fname = Path(self.fname.format(period=period))
            if Path(self.fname.name) != self.fname:
                raise ValueError(f"filename is not a filename: {self.fname}")
        else:
            self.fname = f"{self.name}_{period}.zim"

        if not self.title:
            self.title = channel_meta["name"]
        self.title = self.title.strip()

        if not self.description:
            self.description = channel_meta["description"]
        self.description = self.description.strip()

        if not self.author:
            self.author = channel_meta["author"] or "Kolibri"
        self.author = self.author.strip()

        if not self.publisher:
            self.publisher = "Openzim"
        self.publisher = self.publisher.strip()

        self.tags = list(
            set(self.tags + ["_category:other", "kolibri", "_videos:yes"]))

    def add_favicon(self):
        """Build favicon.png and favicon.ico and add both to the zim

        Source image is, in order: the user-provided favicon, the channel's
        thumbnail from the database or the default kolibri logo"""
        source = self.build_dir / "favicon"

        if self.favicon:
            # user provided a custom favicon: retrieve that
            handle_user_provided_file(source=self.favicon, dest=source)
        else:
            # channel thumbnail is stored base64-encoded after a `;base64,` marker
            try:
                thumbnail = self.db.get_channel_metadata(
                    self.channel_id)["thumbnail"]
                _, encoded = thumbnail.split(";base64,", 1)
                decoded = base64.standard_b64decode(encoded)
                with open(source, "wb") as fh:
                    fh.write(decoded)
            except Exception as exc:
                logger.warning(
                    "Unable to extract favicon from DB; using default")
                logger.exception(exc)

                # fall back to the logo shipped with the templates
                handle_user_provided_file(
                    source=self.templates_dir.joinpath("kolibri-logo.png"),
                    dest=source,
                )

        # convert to PNG (source might already be one)
        png_path = source.with_suffix(".png")
        convert_image(source, png_path)

        # resize the PNG in place
        resize_image(png_path, width=96, height=96, method="thumbnail")

        # derive the .ico from the resized PNG
        ico_path = png_path.with_suffix(".ico")
        create_favicon(src=png_path, dst=ico_path)

        self.creator.add_item_for("favicon.png", fpath=png_path)
        self.creator.add_item_for("favicon.ico", fpath=ico_path)

    def add_custom_about_and_css(self):
        """Add the `about` page and the `custom.css` file to the zim

        about page uses the title and first `body > .container` element of the
        user-provided HTML file if there is one; otherwise the channel's name
        and no custom content.
        custom.css is the user-provided CSS file, or an empty one"""
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        if not self.about:
            title = channel_meta["name"]
            content = None
        else:
            about_path = handle_user_provided_file(
                source=self.about, in_dir=self.build_dir, nocopy=True
            )
            with open(about_path, "r") as fh:
                soup = BeautifulSoup(fh.read(), "lxml")
                title = soup.find("title").text
                content = soup.select("body > .container")
                # only the first matching element is of interest
                if isinstance(content, list):
                    content = content[0]

        template = self.jinja2_env.get_template("about.html")
        page = template.render(title=title, content=content, **channel_meta)
        with self.creator_lock:
            self.creator.add_item_for(
                path="about",
                title=title,
                content=page,
                mimetype="text/html",
            )
        del page

        # blank stylesheet unless the user provided one
        css = ""
        if self.css:
            css_path = handle_user_provided_file(
                source=self.css, in_dir=self.build_dir, nocopy=True
            )
            with open(css_path, "r") as fh:
                css = fh.read()

        self.creator.add_item_for("custom.css", content=css, mimetype="text/css")
        logger.debug("Added about page and custom CSS")
Beispiel #6
0
    def run(self):
        """Run the whole scraper: fetch the database, then build the ZIM.

        Returns:
            0 when every node and video future completed, 1 otherwise
            (error, interruption, or futures left undone).

        Raises:
            ValueError: an optimization cache URL was given but its
                credentials check failed.
        """
        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        s3_msg = (f"  using cache: {self.s3_storage.url.netloc} "
                  f"with bucket: {self.s3_storage.bucket_name}"
                  if self.s3_storage else "")
        logger.info(f"Starting scraper with:\n"
                    f"  channel_id: {self.channel_id}\n"
                    f"  build_dir: {self.build_dir}\n"
                    f"  output_dir: {self.output_dir}\n"
                    f"  using webm : {self.use_webm}\n"
                    f"  low_quality : {self.low_quality}\n"
                    f"{s3_msg}")

        logger.info("Download database")
        self.download_db()

        self.sanitize_inputs()
        # display basic stats
        logger.info(f"  Starting ZIM creation with:\n"
                    f"  filename: {self.fname}\n"
                    f"  title: {self.title}\n"
                    f"  description: {self.description}\n"
                    f"  creator: {self.author}\n"
                    f"  publisher: {self.publisher}\n"
                    f"  tags: {';'.join(self.tags)}")

        logger.info("Setup Zim Creator")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # guards access to the creator around add_item_for()/finish() calls
        self.creator_lock = threading.Lock()
        self.creator = Creator(
            filename=self.output_dir.joinpath(self.fname),
            main_path=self.root_id,
            favicon_path="favicon.png",
            language="eng",
            title=self.title,
            description=self.description,
            creator=self.author,
            publisher=self.publisher,
            name=self.name,
            tags=";".join(self.tags),
        ).start()

        succeeded = False
        try:
            self.add_favicon()
            self.add_custom_about_and_css()

            # add static files
            logger.info("Adding local files (assets)")
            self.add_local_files("assets",
                                 self.templates_dir.joinpath("assets"))

            # setup queue for nodes processing
            self.nodes_futures = {}  # future: node_id
            self.nodes_executor = cf.ThreadPoolExecutor(
                max_workers=self.nb_threads)

            # setup a dedicated queue for videos to convert
            self.videos_futures = {}  # future: src_fname, dst_fpath, path
            self.pending_upload = {}  # path: filepath, key, checksum
            self.videos_executor = cf.ProcessPoolExecutor(
                max_workers=self.nb_processes)

            logger.info("Starting nodes processing")
            self.populate_nodes_executor()

            # await completion of all futures (nodes and videos)
            result = cf.wait(
                self.videos_futures.keys() | self.nodes_futures.keys(),
                return_when=cf.FIRST_EXCEPTION,
            )
            self.nodes_executor.shutdown()
            # properly shutting down the executor should allow processing
            # futures's callbacks (zim addition) as the wait() function
            # only awaits future completion and doesn't include callbacks
            self.videos_executor.shutdown()

            # wait() returned early (FIRST_EXCEPTION) if anything is not done
            succeeded = not result.not_done
        except KeyboardInterrupt:
            self.creator.can_finish = False
            logger.error("KeyboardInterrupt, exiting.")
        except Exception as exc:
            # request Creator not to create a ZIM file on finish
            self.creator.can_finish = False
            # f-string so the actual error is interpolated in the message
            logger.error(f"Interrupting process due to error: {exc}")
            logger.exception(exc)
        finally:
            if succeeded:
                logger.info("Finishing ZIM file…")
            # we need to release libzim's resources.
            # currently does nothing but crash if can_finish=False but that's awaiting
            # impl. at libkiwix level
            with self.creator_lock:
                self.creator.finish()

        if not self.keep_build_dir:
            logger.info("Removing build folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        return 0 if succeeded else 1
Beispiel #7
0
    def run(self):
        """Convert the configured WARC inputs into a ZIM file.

        Returns:
            100 when there are no inputs to process; None once the ZIM has
            been finished.

        Raises:
            KeyError: propagated from find_main_page_metadata() when no
                record matches the main page.
        """
        if not self.inputs:
            logger.info(
                "Arguments valid, no inputs to process. Exiting with error code 100"
            )
            return 100

        self.find_main_page_metadata()

        # make sure Language metadata is ISO-639-3 and setup translations
        language_resolved = False
        try:
            lang_data = get_language_details(self.language)
            self.language = lang_data["iso-639-3"]
            language_resolved = True
            setlocale(
                pathlib.Path(__file__).parent, lang_data.get("iso-639-1"))
        except Exception:
            logger.error(
                f"Invalid language setting `{self.language}`. Using `eng`.")
            # apply the announced fallback, but keep a language that was
            # resolved successfully when only the locale setup failed
            if not language_resolved:
                self.language = "eng"

        self.env = self.init_env()

        # init head insert
        template = self.env.get_template(HEAD_INSERT_FILE)
        self.head_insert = ("<head>" + template.render()).encode("utf-8")
        if self.custom_css:
            self.css_insert = (
                f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
                'rel="Stylesheet" />\n</head>').encode("utf-8")
        else:
            self.css_insert = None

        self.creator = Creator(
            self.full_filename,
            main_path="A/index.html",
            language=self.language or "eng",
            title=self.title,
            date=datetime.date.today(),
            **self.metadata,
        ).start()

        self.add_replayer()

        # add every bundled template, except those handled separately
        for filename in pkg_resources.resource_listdir("warc2zim",
                                                       "templates"):
            if filename == HEAD_INSERT_FILE or filename == SW_JS:
                continue

            self.creator.add_item(
                StaticArticle(self.env, filename, self.main_url))

        for record in self.iter_all_warc_records():
            self.add_items_for_warc_record(record)

        # process revisits, headers only
        for url, record in self.revisits.items():
            if url not in self.indexed_urls:
                logger.debug("Adding revisit {0} -> {1}".format(
                    url, record.rec_headers["WARC-Refers-To-Target-URI"]))
                self.creator.add_item(WARCHeadersItem(record))
                self.indexed_urls.add(url)

        if self.favicon_url:
            self.add_illustration()

        logger.debug(f"Found {self.total_records} records in WARCs")

        self.creator.finish()
Beispiel #8
0
class WARC2Zim:
    def __init__(self, args):
        """Configure the converter from parsed command-line arguments.

        Reads, from `args`: verbose, output, zim_file, name, inputs,
        replay_viewer_source, custom_css, url, include_domains, favicon,
        lang, title, tags, description, creator, publisher, source and
        progress_file.

        Creating (and deleting) a temporary file inside `args.output` checks
        that the output folder is writable; a failure there propagates.
        """
        logging.basicConfig(format="[%(levelname)s] %(message)s")
        logger.setLevel(logging.DEBUG if args.verbose else logging.INFO)

        # URLs already added to the ZIM
        self.indexed_urls = set()

        self.output = args.output
        self.zim_file = args.zim_file

        if not self.zim_file:
            self.zim_file = "{name}_{period}.zim".format(
                name=args.name, period=time.strftime("%Y-%m"))

        self.full_filename = os.path.join(self.output, self.zim_file)

        # ensure output file is writable
        with tempfile.NamedTemporaryFile(dir=self.output, delete=True) as fh:
            logger.debug(f"Confirming output is writable using {fh.name}")

        self.inputs = args.inputs
        self.replay_viewer_source = args.replay_viewer_source
        self.custom_css = args.custom_css

        self.main_url = args.url
        # ensure trailing slash is added if missing; a missing/empty URL is
        # left untouched so the main page can still be auto-detected later
        if self.main_url:
            parts = urlsplit(self.main_url)
            if parts.path == "":
                parts = list(parts)
                # set path
                parts[2] = "/"
                self.main_url = urlunsplit(parts)

        self.include_domains = args.include_domains

        self.favicon_url = args.favicon
        self.language = args.lang
        self.title = args.title

        tags = DEFAULT_TAGS + (args.tags or [])

        # extra keyword arguments forwarded to the ZIM Creator
        self.metadata = {
            "name": args.name,
            "description": args.description,
            "creator": args.creator,
            "publisher": args.publisher,
            "tags": ";".join(tags),
            # optional
            "source": args.source,
            "scraper": "warc2zim " + get_version(),
        }

        # revisit records kept for later processing, keyed by URL
        self.revisits = {}

        # progress file handling
        self.stats_filename = (pathlib.Path(args.progress_file)
                               if args.progress_file else None)
        # a relative progress file lives inside the output folder
        if self.stats_filename and not self.stats_filename.is_absolute():
            self.stats_filename = self.output / self.stats_filename
        self.written_records = self.total_records = 0

    def add_replayer(self):
        """Add the replay service worker script (SW_JS) to the ZIM.

        The script comes from `replay_viewer_source` (a remote http(s) prefix
        or a local path prefix) when set, otherwise from the bundled
        templates.
        """
        source = self.replay_viewer_source
        js_mimetype = "application/javascript"

        if not source:
            # no custom source: use the bundled template
            self.creator.add_item(
                StaticArticle(self.env,
                              SW_JS,
                              self.main_url,
                              mimetype=js_mimetype))
            return

        if re.match(r"^https?\:", source):
            # remote source
            self.creator.add_item(
                URLItem(
                    url=source + SW_JS,
                    path="A/" + SW_JS,
                    mimetype=js_mimetype,
                ))
        else:
            # local source
            self.creator.add_item_for(
                fpath=source + SW_JS,
                path="A/" + SW_JS,
                mimetype=js_mimetype,
            )

    def init_env(self):
        """Build and return the Jinja2 environment used to render templates.

        Translations are installed from `Locale.translation`; when that
        raises OSError, null translations are installed instead.
        """
        # autoescape=False to allow injecting html entities from translated text
        jinja_env = Environment(
            loader=PackageLoader("warc2zim", "templates"),
            extensions=["jinja2.ext.i18n"],
            autoescape=False,
        )

        try:
            jinja_env.install_gettext_translations(Locale.translation)
        except OSError:
            message = "No translations table found for language: {0}".format(
                self.language)
            logger.warning(message)
            jinja_env.install_null_translations()

        return jinja_env

    def update_stats(self):
        """write progress as JSON to self.stats_filename if requested"""
        if not self.stats_filename:
            return
        self.written_records += 1
        with open(self.stats_filename, "w") as fh:
            json.dump(
                {
                    "written": self.written_records,
                    "total": self.total_records
                }, fh)

    def get_custom_css_record(self):
        """Return a WARC `response` record holding the custom CSS.

        The CSS is downloaded when `custom_css` is an http(s) URL, otherwise
        read from the local file it points to. The record is built for
        CUSTOM_CSS_URL with a `200 OK` status and a CSS content type.
        """
        css_source = self.custom_css

        if not re.match(r"^https?\://", css_source):
            # local file
            local_path = pathlib.Path(css_source).expanduser().resolve()
            with open(local_path, "rb") as css_file:
                payload = css_file.read()
        else:
            # remote file
            response = requests.get(css_source, timeout=10)
            response.raise_for_status()
            payload = response.content

        css_headers = StatusAndHeaders(
            "200 OK",
            [("Content-Type", 'text/css; charset="UTF-8"')],
            protocol="HTTP/1.0",
        )

        builder = RecordBuilder()
        return builder.create_warc_record(
            CUSTOM_CSS_URL,
            "response",
            payload=io.BytesIO(payload),
            length=len(payload),
            http_headers=css_headers,
        )

    def run(self):
        """Convert the configured WARC inputs into a ZIM file.

        Returns:
            100 when there are no inputs to process; None once the ZIM has
            been finished.

        Raises:
            KeyError: propagated from find_main_page_metadata() when no
                record matches the main page.
        """
        if not self.inputs:
            logger.info(
                "Arguments valid, no inputs to process. Exiting with error code 100"
            )
            return 100

        self.find_main_page_metadata()

        # make sure Language metadata is ISO-639-3 and setup translations
        language_resolved = False
        try:
            lang_data = get_language_details(self.language)
            self.language = lang_data["iso-639-3"]
            language_resolved = True
            setlocale(
                pathlib.Path(__file__).parent, lang_data.get("iso-639-1"))
        except Exception:
            logger.error(
                f"Invalid language setting `{self.language}`. Using `eng`.")
            # apply the announced fallback, but keep a language that was
            # resolved successfully when only the locale setup failed
            if not language_resolved:
                self.language = "eng"

        self.env = self.init_env()

        # init head insert
        template = self.env.get_template(HEAD_INSERT_FILE)
        self.head_insert = ("<head>" + template.render()).encode("utf-8")
        if self.custom_css:
            self.css_insert = (
                f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
                'rel="Stylesheet" />\n</head>').encode("utf-8")
        else:
            self.css_insert = None

        self.creator = Creator(
            self.full_filename,
            main_path="A/index.html",
            language=self.language or "eng",
            title=self.title,
            date=datetime.date.today(),
            **self.metadata,
        ).start()

        self.add_replayer()

        # add every bundled template, except those handled separately
        for filename in pkg_resources.resource_listdir("warc2zim",
                                                       "templates"):
            if filename == HEAD_INSERT_FILE or filename == SW_JS:
                continue

            self.creator.add_item(
                StaticArticle(self.env, filename, self.main_url))

        for record in self.iter_all_warc_records():
            self.add_items_for_warc_record(record)

        # process revisits, headers only
        for url, record in self.revisits.items():
            if url not in self.indexed_urls:
                logger.debug("Adding revisit {0} -> {1}".format(
                    url, record.rec_headers["WARC-Refers-To-Target-URI"]))
                self.creator.add_item(WARCHeadersItem(record))
                self.indexed_urls.add(url)

        if self.favicon_url:
            self.add_illustration()

        logger.debug(f"Found {self.total_records} records in WARCs")

        self.creator.finish()

    def iter_all_warc_records(self):
        # add custom css records
        if self.custom_css:
            yield self.get_custom_css_record()

        yield from iter_warc_records(self.inputs)

    def find_main_page_metadata(self):
        """Locate the main page record and extract metadata from it.

        When no main URL is set, the first non-empty `text/html` record
        (with no HTTP headers or a 200 status) becomes the main page. From an
        HTML main page, the title (unless already set), favicon and language
        are extracted.

        Raises:
            KeyError: no record matches the main page URL.
        """
        for record in self.iter_all_warc_records():
            if record.rec_type == "revisit":
                continue

            mime = get_record_mime_type(record)
            target_uri = record.rec_headers["WARC-Target-URI"]

            # if no main_url, use first 'text/html' record as the main page by
            # default; not guaranteed to always work
            if (not self.main_url and mime == "text/html"
                    and record.payload_length != 0):
                headers = record.http_headers
                if not headers or headers.get_statuscode() == "200":
                    self.main_url = target_uri

            if urldefrag(self.main_url).url == target_uri:
                # found the record for the main page
                if mime in HTML_TYPES:
                    # content = record.content_stream().read()
                    record.buffered_stream.seek(0)
                    content = record.buffered_stream.read()

                    if not self.title:
                        self.title = parse_title(content)

                    self.find_icon_and_language(content)

                    logger.debug("Title: {0}".format(self.title))
                    logger.debug("Language: {0}".format(self.language))
                    logger.debug("Favicon: {0}".format(self.favicon_url))
                else:
                    # a non-html main page (eg. text, img) is still allowed,
                    # but there's nothing to extract from it
                    logger.warning(
                        "Main page is not an HTML Page, mime type is: {0} "
                        "- Skipping Favicon and Language detection".format(
                            mime))
                return

        msg = "Unable to find WARC record for main page: {0}, ZIM not created".format(
            self.main_url)
        logger.error(msg)
        raise KeyError(msg)

    def find_icon_and_language(self, content):
        """Fill in `favicon_url` and `language` from the main page HTML.

        Values that are already set are left untouched.

        The favicon is taken from the `shortcut icon` link, then the `icon`
        link, falling back to `/favicon.ico` on the main URL (also when the
        link has no usable href). The language is taken from the first of:
        `<html lang>`, `<meta http-equiv="content-language">`,
        `<meta name="language">`.

        Args:
            content: HTML markup of the main page.
        """
        soup = BeautifulSoup(content, "html.parser")

        if not self.favicon_url:
            # find icon
            icon = soup.find("link", rel="shortcut icon")
            if not icon:
                icon = soup.find("link", rel="icon")

            # page content is not under our control: a <link> may lack an
            # href or carry an empty one, in which case use the default icon
            href = icon.attrs.get("href") if icon else None
            self.favicon_url = urljoin(self.main_url, href or "/favicon.ico")

        if not self.language:
            # (tag name, attributes filter, attribute holding the language)
            language_lookups = (
                # HTML5 Standard
                ("html", {"lang": True}, "lang"),
                # W3C recommendation
                ("meta", {"http-equiv": "content-language",
                          "content": True}, "content"),
                # SEO Recommendations
                ("meta", {"name": "language", "content": True}, "content"),
            )
            for tag_name, attrs, lang_attr in language_lookups:
                lang_elem = soup.find(tag_name, attrs=attrs)
                if lang_elem:
                    self.language = lang_elem.attrs[lang_attr]
                    return

    def add_illustration(self, record=None):
        """Add 96x96 and 48x48 illustrations to the ZIM from the favicon.

        Does nothing when the favicon URL was already indexed. If the favicon
        cannot be retrieved online, a warning is logged and the ZIM gets no
        illustration.

        Args:
            record: WARC record holding the favicon payload; when omitted,
                the favicon is retrieved from `favicon_url`.
        """
        # local import: keeps this method self-contained
        import tempfile

        if self.favicon_url in self.indexed_urls:
            return

        # add illustration from favicon option or in-warc favicon
        logger.info("Adding illustration from " +
                    (self.favicon_url if record is None else "WARC"))
        # URL path may end with a slash, leaving no file name to reuse
        favicon_fname = (pathlib.Path(urlparse(self.favicon_url).path).name
                         or "favicon")

        # work in a private folder: never overwrites or deletes a same-named
        # file of the working directory, and leaves no converted file behind
        with tempfile.TemporaryDirectory() as tmp_dir:
            src_illus_fpath = pathlib.Path(tmp_dir).joinpath(favicon_fname)

            # reusing payload from WARC record
            if record:
                with open(src_illus_fpath, "wb") as fh:
                    if hasattr(record, "buffered_stream"):
                        record.buffered_stream.seek(0)
                        fh.write(record.buffered_stream.read())
                    else:
                        fh.write(record.content_stream().read())
            # fetching online
            else:
                try:
                    handle_user_provided_file(source=self.favicon_url,
                                              dest=src_illus_fpath)
                except Exception as exc:
                    logger.warning(
                        "Unable to retrieve favicon. "
                        "ZIM won't have an illustration: {exc}".format(
                            exc=exc))
                    return

            # convert to PNG (might already be PNG but it's OK)
            illus_fpath = src_illus_fpath.with_suffix(".png")
            convert_image(src_illus_fpath, illus_fpath)

            # resize to appropriate size (ZIM uses 48x48 so we double for
            # retina); resized in place, largest first
            for size in (96, 48):
                resize_image(illus_fpath,
                             width=size,
                             height=size,
                             method="thumbnail")
                with open(illus_fpath, "rb") as fh:
                    self.creator.add_illustration(size, fh.read())

    def is_self_redirect(self, record, url):
        if record.rec_type != "response":
            return False

        if not record.http_headers.get_statuscode().startswith("3"):
            return False

        location = record.http_headers["Location"]
        return canonicalize(url) == canonicalize(location)

    def add_items_for_warc_record(self, record):
        """Add the ZIM items matching a single WARC record.

        Records with no URL, an already-indexed URL, a URL outside the
        included domains, or a self-redirect are skipped. Other non-revisit
        records get a headers item plus, when non-empty, a payload item.
        Revisit records are stored for later processing. A fuzzy-match
        redirect is then registered for the URL.
        """
        url = get_record_url(record)
        if not url:
            logger.debug(
                f"Skipping record with empty WARC-Target-URI {record}")
            return

        if url in self.indexed_urls:
            logger.debug(
                "Skipping duplicate {0}, already added to ZIM".format(url))
            return

        # if include_domains is set, only include urls from those domains
        if self.include_domains:
            netloc = urlsplit(url).netloc
            for domain in self.include_domains:
                if netloc.endswith(domain):
                    break
            else:
                logger.debug(
                    "Skipping url {0}, outside included domains".format(url))
                return

        if record.rec_type == "revisit":
            # keep the first revisit that refers to a different URL
            refers_to = record.rec_headers["WARC-Refers-To-Target-URI"]
            if refers_to != url and url not in self.revisits:
                self.revisits[url] = record
        else:
            if self.is_self_redirect(record, url):
                logger.debug("Skipping self-redirect: " + url)
                return

            # headers item first, then the payload (when there is one)
            self.creator.add_item(WARCHeadersItem(record))
            payload_item = WARCPayloadItem(record, self.head_insert,
                                           self.css_insert)

            if len(payload_item.content) != 0:
                self.creator.add_item(payload_item)
                self.total_records += 1
                self.update_stats()

            if url == self.favicon_url:
                self.add_illustration(record=record)

            self.indexed_urls.add(url)

        self.add_fuzzy_match_record(url)

    def add_fuzzy_match_record(self, url):
        """Register a revisit record redirecting the fuzzy form of `url` to it.

        The first rule of FUZZY_RULES that changes the URL gives the fuzzy
        URL; nothing is registered when no rule changes it.
        """
        fuzzy_url = url
        for rule in FUZZY_RULES:
            candidate = rule["match"].sub(rule["replace"], url)
            if candidate != url:
                fuzzy_url = candidate
                break

        if fuzzy_url != url:
            redirect_headers = StatusAndHeaders("302 Redirect",
                                                {"Location": url})
            date = datetime.datetime.utcnow().isoformat()
            record = RecordBuilder().create_revisit_record(
                fuzzy_url,
                "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ",
                url,
                date,
                redirect_headers,
            )

            self.revisits[fuzzy_url] = record
            logger.debug("Adding fuzzy redirect {0} -> {1}".format(
                fuzzy_url, url))
Beispiel #9
0
def test_zim_creator(tmp_path, png_image, css_file, html_file, css_str,
                     html_str):
    """Build a ZIM with CSS, HTML, binary and redirect entries, then read it
    back to check metadata, redirects, indexes and link rewriting."""
    fpath = tmp_path / "test.zim"
    main_page, language, title = "welcome", "fra", "My Title"
    tags = ";".join(["toto", "tata"])
    redir_url = "A/ola"

    with Creator(fpath, main_page, language, title=title,
                 tags=tags) as creator:
        # rewritten CSS from string
        creator.add_css("test.css", content=css_str, rewrite_links=True)
        # verbatim CSS from string
        creator.add_css("test2.css", content=css_str, rewrite_links=False)
        # rewritten CSS from file
        creator.add_css("test3.css", fpath=css_file, rewrite_links=True)
        # verbatim CSS from file
        creator.add_css("test4.css", fpath=css_file, rewrite_links=False)
        # rewritten HTML from string
        creator.add_article("welcome",
                            "Welcome",
                            content=html_str,
                            rewrite_links=True)
        # verbatim HTML from string
        creator.add_article("welcome2",
                            "wel2",
                            content=html_str,
                            rewrite_links=False)
        # rewritten HTML from file
        creator.add_article("welcome3",
                            "Wel3",
                            fpath=html_file,
                            rewrite_links=True)
        # verbatim HTML from file
        creator.add_article("welcome4",
                            "wel4",
                            fpath=html_file,
                            rewrite_links=False)
        # single binary image
        creator.add_binary("images/yahoo.png", fpath=png_image)
        # redirect to our main page (no title)
        creator.add_redirect("A/home", "A/welcome")
        # redirect to our main page (with a custom title)
        creator.add_redirect("A/home2", "A/welcome", "Home !!")

        # redirect using StaticArticle
        creator.add_zim_article(
            StaticArticle(url=redir_url, redirect_url="A/welcome"))

        # ensure argument requirements are checked
        with pytest.raises(ValueError,
                           match="One of fpath or content is required"):
            creator.add_binary("images/yahoo.png")
        with pytest.raises(ValueError,
                           match="One of fpath or content is required"):
            # private method
            creator._add_rewriten("-", "test3.css", "", "text/css", False,
                                  False, False, None)

    # leaving the context manager must have produced the ZIM file
    assert fpath.exists()

    with libzim.reader.File(fpath) as reader:
        # metadata is stored as passed to the Creator
        assert reader.get_metadata("Title").decode(UTF8) == title
        assert reader.get_metadata("Language").decode(UTF8) == language
        assert reader.get_metadata("Tags").decode(UTF8) == tags
        assert reader.main_page_url == f"A/{main_page}"
        # make sure we have our image
        assert reader.get_article("I/images/yahoo.png")
        # make sure we have our redirects
        assert reader.get_article(redir_url).is_redirect
        assert (reader.get_article(redir_url).get_redirect_article().longurl ==
                f"A/{main_page}")
        # make sure we have full text and title indexes
        assert reader.get_article("X/title/xapian")
        assert reader.get_article("X/fulltext/xapian")
        # make sure titles were indexed
        assert "A/home2" in list(reader.suggest("Home !!"))
        # make sure full text was indexed
        assert reader.get_search_results_count("PDF doc") >= 1

        # ensure CSS rewriting is OK
        assert count_links(reader.get_article("-/test.css"), r"../I") == 24
        assert count_links(reader.get_article("-/test3.css"), r"../I") == 24

        # ensure non-rewritten CSS entries have not been rewritten
        assert count_links(reader.get_article("-/test2.css"), r"../I") == 0
        assert count_links(reader.get_article("-/test4.css"), r"../I") == 0
        assert bytes(
            reader.get_article("-/test2.css").content).decode(UTF8) == css_str
        assert bytes(
            reader.get_article("-/test4.css").content).decode(UTF8) == css_str

        # ensure HTML rewriting is OK
        assert count_links(reader.get_article("A/welcome"), r"../A") == 2
        assert count_links(reader.get_article("A/welcome"), r"../-") == 2
        assert count_links(reader.get_article("A/welcome"), r"dest.html") == 1
        assert count_links(reader.get_article("A/welcome3"), r"../I") == 2
        assert count_links(reader.get_article("A/welcome3"), r"../-") == 2
        assert count_links(reader.get_article("A/welcome3"), r"../A") == 2
        assert count_links(reader.get_article("A/welcome3"), r"dest.html") == 1

        # ensure non-rewritten articles have not been rewritten
        assert count_links(reader.get_article("A/welcome2"), r"../I") == 0
        assert count_links(reader.get_article("A/welcome4"), r"../I") == 0
        assert bytes(
            reader.get_article("A/welcome2").content).decode(UTF8) == html_str
        assert bytes(
            reader.get_article("A/welcome4").content).decode(UTF8) == html_str