class Kolibri2Zim:
    """Scraper turning a Kolibri Studio channel into a ZIM file.

    Downloads the channel's sqlite DB, walks the content tree and adds one
    ZIM entry (HTML page + media files) per content node. Videos may be
    re-encoded in a process pool and cached into an S3-compatible bucket.
    """

    def __init__(self, **kwargs):
        # fail early if any expected option is missing
        # (`options` is the module-level list of required option names)
        for option in options:
            if option not in kwargs:
                raise ValueError(f"Missing parameter `{option}`")

        def go(option):
            return kwargs.get(option)

        self.channel_id = go("channel_id")
        self.root_id = go("root_id")

        # video-encoding info
        self.use_webm = go("use_webm")
        self.low_quality = go("low_quality")
        self.autoplay = go("autoplay")

        # zim params
        self.fname = go("fname")
        self.tags = ([] if go("tags") is None else
                     [t.strip() for t in go("tags").split(",")])
        self.title = go("title")
        self.description = go("description")
        self.author = go("creator")
        self.publisher = go("publisher")
        self.name = go("name")

        # customization
        self.favicon = go("favicon")
        self.about = go("about")
        self.css = go("css")

        # directory setup
        self.output_dir = Path(go("output_dir")).expanduser().resolve()
        if go("tmp_dir"):
            Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))

        # performances options
        self.nb_threads = go("threads")
        self.nb_processes = go("processes")
        self.s3_url_with_credentials = go("s3_url_with_credentials")
        self.s3_storage = None

        # debug/developer options
        self.keep_build_dir = go("keep_build_dir")
        self.debug = go("debug")
        self.only_topics = go("only_topics")

        # jinja2 environment setup
        self.jinja2_env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)),
            autoescape=True)

    @property
    def templates_dir(self):
        return ROOT_DIR.joinpath("templates")

    def add_local_files(self, root_path, folder):
        """recursively add local files from {folder} starting at {root_path}"""
        non_front = ("viewer.html", "epub_embed.html")
        for fpath in folder.iterdir():
            path = "/".join([root_path, fpath.name])
            if fpath.is_file():
                # viewer helpers are stored raw so they are not considered
                # front articles
                mimetype = ("text/html;raw=true"
                            if fpath.name in non_front else None)
                self.creator.add_item_for(path=path,
                                          title="",
                                          fpath=fpath,
                                          mimetype=mimetype)
                logger.debug(f"Adding {path}")
            else:
                self.add_local_files(path, fpath)

    def populate_nodes_executor(self):
        """Loop on content nodes to create zim entries from kolibri DB"""

        def schedule_node(item):
            future = self.nodes_executor.submit(self.add_node, item=item)
            self.nodes_futures.update({future: item[0]})

        # schedule root-id
        schedule_node((self.db.root["id"], self.db.root["kind"]))

        # fill queue with (node_id, kind) tuples for all root node's descendants
        for node in self.db.get_node_descendants(self.root_id):
            schedule_node((node["id"], node["kind"]))

    def add_node(self, item):
        """process a content node from the (node_id, kind) tuple in queue"""
        node_id, kind = item
        # check if we have a handler for this {kind} of node
        handler = getattr(self, f"add_{kind}_node", None)

        # debug escape: in topics-only mode, skip everything but topics
        if self.only_topics and kind != "topic":
            return

        if handler:
            # add thumbnail to zim if there's one for this node
            thumbnail = self.db.get_node_thumbnail(node_id)
            if thumbnail:
                self.funnel_file(thumbnail["id"], thumbnail["ext"])
            # fire the add_{kind}_node() method which will actually process it
            handler(node_id)

    def funnel_file(self, fid, fext):
        """directly add a Kolibri file to the ZIM using same name"""
        url, fname = get_kolibri_url_for(fid, fext)
        with self.creator_lock:
            self.creator.add_item(URLItem(url=url, path=fname))
        logger.debug(f"Added {fname} from Studio")

    def download_to_disk(self, file_id, ext):
        """download a Kolibri file to the build-dir using its filename"""
        url, fname = get_kolibri_url_for(file_id, ext)
        fpath = self.build_dir / fname
        stream_file(url, fpath)
        return fpath

    def funnel_from_s3(self, file_id, path, checksum, preset):
        """whether it could fetch and add the file from S3 cache

        - checks if a cache is configured
        - checks if file is present
        - checks if file is valid (corresponds to same original file)
        - downloads and add to zim

        returns True is all this succeeded, False otherwise"""
        if not self.s3_storage:
            return False

        key = self.s3_key_for(file_id, preset)

        # exit early if we don't have this object in bucket
        if not self.s3_storage.has_object_matching(
                key,
                meta={
                    "checksum": checksum,
                    "encoder_version": str(preset.VERSION)
                }):
            return False

        # download file into memory
        fileobj = io.BytesIO()
        try:
            self.s3_storage.download_fileobj(key, fileobj)
        except Exception as exc:
            logger.error(f"failed to download {key} from cache: {exc}")
            logger.exception(exc)
            # make sure we fallback to re-encode
            return False

        # add to zim
        with self.creator_lock:
            self.creator.add_item(
                StaticItem(
                    path=path,
                    fileobj=fileobj,
                    mimetype=preset.mimetype,
                ))
        logger.debug(f"Added {path} from S3::{key}")
        return True

    def s3_key_for(self, file_id, preset):
        """compute in-bucket key for file"""
        return f"{file_id[0]}/{file_id[1]}/{file_id}/{type(preset).__name__.lower()}"

    def upload_to_s3(self, key, fpath, **meta):
        """whether it successfully uploaded to cache (None if no cache)"""
        if not self.s3_storage:
            return

        logger.debug(f"Uploading {fpath.name} to S3::{key} with {meta}")
        try:
            self.s3_storage.upload_file(fpath, key, meta=meta)
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        return True

    def add_topic_node(self, node_id):
        """Build and add the HTML page for a single topic node

        Topic nodes are used only for hierarchy and solely contains metadata"""

        # fetch details including parents for breadcrumb and children to link to
        node = self.db.get_node(node_id, with_parents=True, with_children=True)

        html = self.jinja2_env.get_template("topic.html").render(
            node_id=node_id, **node)
        with self.creator_lock:
            self.creator.add_item_for(path=node_id,
                                      title=node["title"],
                                      content=html,
                                      mimetype="text/html")
        logger.debug(f"Added topic #{node_id}")

    def add_video_node(self, node_id):
        """Add content from this `video` node to zim

        video node is composed of (1) or (2) videos files and optional subtitle files
        video files are at most one of each `high_res_video` or `low_res_video`
        subtitle files (`video_subtitle`) are VTT files and are only limited by the
        number of language to select from in kolibri studio"""
        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(files, key=lambda f: f["prio"])
        it = filter(lambda f: f["supp"] == 0, files)
        try:
            # find main video file
            video_file = next(it)
        except StopIteration:
            # we have no video file
            return

        try:
            alt_video_file = next(it)
        except StopIteration:
            # we have no supplementary video file (which is OK)
            alt_video_file = None

        # now decide which file to keep and what to do with it
        #
        # content_file has a 1:1 rel with content_localfile which is the
        # *implementation* of the file. We use that local file ID (its checksum)
        # everywhere BUT as S3 cache ID as we want to overwrite the same key
        # should a new version of the localfile for the same file arrives.
        vid = video_file["id"]  # the local file ID (current version)
        vfid = video_file["fid"]  # the file ID in DB (version agnostic)
        vchk = video_file["checksum"]

        # we'll reencode, using the best file with appropriate preset
        if self.use_webm:
            preset = VideoWebmLow() if self.low_quality else VideoWebmHigh()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):
                # download original video
                src = self.download_to_disk(vid, video_file["ext"])
                dst = src.with_suffix(".webm")
                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want low-q but no webm yet don't have low_res file, let's reencode
        elif self.low_quality and alt_video_file is None:
            preset = VideoMp4Low()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):
                # download original video
                src = self.download_to_disk(vid, video_file["ext"])

                # move source file to a new name and swap variables so our
                # target will be the previously source one
                src_ = src.with_suffix(f"{src.suffix}.orig")
                shutil.move(src, src_)
                dst = src
                src = src_

                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want mp4, either in high-q or we have a low_res file to use
        else:
            video_file = alt_video_file if self.low_quality else video_file
            self.funnel_file(video_file["id"], video_file["ext"])
            video_filename = filename_for(video_file)
            video_filename_ext = video_file["ext"]

        # prepare list of subtitles for template
        subtitles = []
        for file in filter(lambda f: f["preset"] == "video_subtitle", files):
            self.funnel_file(file["id"], file["ext"])
            try:
                # we only need the English name of the language
                _, english = find_language_names(file["lang"])
            except Exception:
                english = file["lang"]
            finally:
                subtitles.append({
                    "code": file["lang"],
                    "name": english,
                    "filename": filename_for(file),
                })

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("video.html").render(
            node_id=node_id,
            video_filename=video_filename,
            video_filename_ext=video_filename_ext,
            subtitles=sorted(subtitles, key=lambda i: i["code"]),
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_video_upon_completion(self, future):
        """adds the converted video inside this future to the zim

        logs error in case of failure"""
        if future.cancelled():
            return
        src_fname, dst_fpath, path = self.videos_futures.get(future)

        try:
            future.result()
        except Exception as exc:
            logger.error(f"Error re-encoding {src_fname}: {exc}")
            logger.exception(exc)
            return

        logger.debug(f"Re-encoded {src_fname} successfully")

        kwargs = {
            "path": path,
            "filepath": dst_fpath,
            "mimetype": get_file_mimetype(dst_fpath),
        }
        # we shall request s3 upload on the threads pool, only once item has been
        # added to ZIM so it can be removed altogether
        if self.s3_storage:
            kwargs.update({"callback": self.request_s3_upload_and_removal})
        # simply add the item, autodeleting the file
        else:
            kwargs.update({"remove": True})

        with self.creator_lock:
            self.creator.add_item(StaticItem(**kwargs))
        logger.debug(f"Added {path} from re-encoded file")

    def convert_and_add_video_aside(self, file_id, src_fpath, src_checksum,
                                    dest_fpath, path, preset):
        """add video to the process-based convertion queue"""
        future = self.videos_executor.submit(
            reencode,
            src_path=src_fpath,
            dst_path=dest_fpath,
            ffmpeg_args=preset.to_ffmpeg_args(),
            delete_src=True,
            with_process=False,
            failsafe=False,
        )
        self.videos_futures.update(
            {future: (src_fpath.name, dest_fpath, path)})
        self.pending_upload.update({
            path: (
                dest_fpath,
                self.s3_key_for(file_id, preset),
                {
                    "checksum": src_checksum,
                    "encoder_version": str(preset.VERSION)
                },
            )
        })
        future.add_done_callback(self.add_video_upon_completion)

    def request_s3_upload_and_removal(self, item):
        """add file from item to uploads list"""
        path = item.path
        del item
        dest_fpath, key, meta = self.pending_upload.get(path)
        # TODO: submit to a thread executor (to create) instead
        # this is currently called on main-thread.
        self.upload_to_s3(key, dest_fpath, **meta)

    def add_audio_node(self, node_id):
        """Add content from this `audio` node to zim

        audio node are composed of a single mp3 file"""
        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return
        self.funnel_file(file["id"], file["ext"])

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("audio.html").render(
            node_id=node_id,
            filename=filename_for(file),
            ext=file["ext"],
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_exercise_node(self, node_id):
        """Add content from this `exercise` node to zim

        exercise node is composed of a single perseus file.
        a perseus file is a ZIP containing an exercise.json entrypoint and other files
        we'd solely add the perseus file in the ZIM along with the perseus reader from
        https://github.com/Khan/perseus"""
        logger.warning(f"[NOT SUPPORTED] not adding exercise node {node_id}")

    def add_document_node(self, node_id):
        """Add content from this `document` node to zim

        document node is composed of one main (`priority` 1) file and an optionnal
        (`priority` 2) file
        Format for each is either `pdf` (`document` preset) or `epub` (`epub` preset)

        - add the actual PDF/epub files to zim at /{node_id}.{ext} (files' IDs)
        - add an HTML page linking to files for download
        - includes an iframe with the appropriate viewer
         - using pdf.js for PDF
         - using epub.js for EPUB
        - add an additional page for the alternate document with its viewer
        """

        def target_for(file):
            """viewer URL for a document file, depending on its format"""
            filename = filename_for(file)
            if file["ext"] == "pdf":
                return f"./assets/pdfjs/web/viewer.html?file=../../../{filename}"
            if file["ext"] == "epub":
                return f"./assets/epub_embed.html?url=../{filename}"

        # record the actual document
        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(filter(lambda f: f["supp"] == 0, files),
                       key=lambda f: f["prio"])

        it = iter(files)
        try:
            main_document = next(it)
        except StopIteration:
            return
        try:
            alt_document = next(it)
        except StopIteration:
            alt_document = None

        for file in files:
            self.funnel_file(file["id"], file["ext"])
            file["target"] = target_for(file)

        node = self.db.get_node(node_id, with_parents=True)
        # convert generator to list as we might read it twice
        node["parents"] = list(node["parents"])

        # generate page once for each document, changing only `is_alt`
        if alt_document:
            options = [False, True]
        else:
            options = [False]  # main_document only

        for is_alt in options:
            html = self.jinja2_env.get_template("document.html").render(
                node_id=node_id,
                main_document=filename_for(main_document),
                main_document_ext=main_document["ext"],
                alt_document=filename_for(alt_document)
                if alt_document else None,
                alt_document_ext=alt_document["ext"]
                if alt_document else None,
                target=target_for(alt_document if is_alt else main_document),
                is_alt=is_alt,
                **node,
            )
            with self.creator_lock:
                path = node_id
                if is_alt:
                    path += "_alt"
                self.creator.add_item_for(
                    path=path,
                    title=node["title"],
                    content=html,
                    mimetype="text/html",
                )

    def add_html5_node(self, node_id):
        """Add content from this `html5` node to zim

        html5 node is single ZIP file containing a standalone HTML app
        which entrypoint is a file named index.html

        we extract and add each file from the ZIP to /{node_id}/

        Note: Studio doesn't enforce the mandatory index.html, thus allowing
        invalid html5 app (unreachable)"""
        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return

        # download ZIP file to memory
        ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
        ark_data = io.BytesIO()
        stream_file(url=ark_url, byte_stream=ark_data)

        # loop over zip members and create an entry for each
        with zipfile.ZipFile(ark_data) as zip_ark:
            for ark_member in zip_ark.namelist():
                with self.creator_lock:
                    self.creator.add_item_for(
                        path=f"{node_id}/{ark_member}",
                        content=zip_ark.open(ark_member).read(),
                    )

    def run(self):
        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        s3_msg = (f" using cache: {self.s3_storage.url.netloc} "
                  f"with bucket: {self.s3_storage.bucket_name}"
                  if self.s3_storage else "")
        logger.info(f"Starting scraper with:\n"
                    f" channel_id: {self.channel_id}\n"
                    f" build_dir: {self.build_dir}\n"
                    f" output_dir: {self.output_dir}\n"
                    f" using webm : {self.use_webm}\n"
                    f" low_quality : {self.low_quality}\n"
                    f"{s3_msg}")

        logger.info("Download database")
        self.download_db()

        self.sanitize_inputs()
        # display basic stats
        logger.info(f" Starting ZIM creation with:\n"
                    f" filename: {self.fname}\n"
                    f" title: {self.title}\n"
                    f" description: {self.description}\n"
                    f" creator: {self.author}\n"
                    f" publisher: {self.publisher}\n"
                    f" tags: {';'.join(self.tags)}")

        logger.info("Setup Zim Creator")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.creator_lock = threading.Lock()
        self.creator = Creator(
            filename=self.output_dir.joinpath(self.fname),
            main_path=self.root_id,
            favicon_path="favicon.png",
            language="eng",
            title=self.title,
            description=self.description,
            creator=self.author,
            publisher=self.publisher,
            name=self.name,
            tags=";".join(self.tags),
        ).start()

        succeeded = False
        try:
            self.add_favicon()
            self.add_custom_about_and_css()

            # add static files
            logger.info("Adding local files (assets)")
            self.add_local_files("assets",
                                 self.templates_dir.joinpath("assets"))

            # setup queue for nodes processing
            self.nodes_futures = {}  # future: node_id
            self.nodes_executor = cf.ThreadPoolExecutor(
                max_workers=self.nb_threads)

            # setup a dedicated queue for videos to convert
            self.videos_futures = {}  # future: src_fname, dst_fpath, path
            self.pending_upload = {}  # path: filepath, key, checksum
            self.videos_executor = cf.ProcessPoolExecutor(
                max_workers=self.nb_processes)

            logger.info("Starting nodes processing")
            self.populate_nodes_executor()

            # await completion of all futures (nodes and videos)
            result = cf.wait(
                self.videos_futures.keys() | self.nodes_futures.keys(),
                return_when=cf.FIRST_EXCEPTION,
            )
            self.nodes_executor.shutdown()
            # properly shutting down the executor should allow processing
            # futures's callbacks (zim addition) as the wait() function
            # only awaits future completion and doesn't include callbacks
            self.videos_executor.shutdown()

            succeeded = not result.not_done
        except KeyboardInterrupt:
            self.creator.can_finish = False
            logger.error("KeyboardInterrupt, exiting.")
        except Exception as exc:
            # request Creator not to create a ZIM file on finish
            self.creator.can_finish = False
            logger.error(f"Interrupting process due to error: {exc}")
            logger.exception(exc)
        finally:
            if succeeded:
                logger.info("Finishing ZIM file…")
                # we need to release libzim's resources.
                # currently does nothing but crash if can_finish=False but that's awaiting
                # impl. at libkiwix level
                with self.creator_lock:
                    self.creator.finish()

            if not self.keep_build_dir:
                logger.info("Removing build folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        return 0 if succeeded else 1

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(list_buckets=True,
                                                 bucket=True,
                                                 write=True,
                                                 read=True,
                                                 failsafe=True):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def download_db(self):
        """download channel DB from kolibri and initialize DB

        Also sets the root_id with DB-computed value"""
        # download database
        fpath = self.build_dir.joinpath("db.sqlite3")
        logger.debug(f"Downloading database into {fpath.name}…")
        stream_file(
            f"{STUDIO_URL}/content/databases/{self.channel_id}.sqlite3",
            fpath,
        )
        self.db = KolibriDB(fpath, self.root_id)
        self.root_id = self.db.root_id

    def sanitize_inputs(self):
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        # input & metadata sanitation
        period = datetime.datetime.now().strftime("%Y-%m")
        if self.fname:
            # make sure we were given a filename and not a path
            self.fname = Path(self.fname.format(period=period))
            if Path(self.fname.name) != self.fname:
                raise ValueError(f"filename is not a filename: {self.fname}")
        else:
            self.fname = f"{self.name}_{period}.zim"

        if not self.title:
            self.title = channel_meta["name"]
        self.title = self.title.strip()

        if not self.description:
            self.description = channel_meta["description"]
        self.description = self.description.strip()

        if not self.author:
            self.author = channel_meta["author"] or "Kolibri"
        self.author = self.author.strip()

        if not self.publisher:
            self.publisher = "Openzim"
        self.publisher = self.publisher.strip()

        self.tags = list(
            set(self.tags + ["_category:other", "kolibri", "_videos:yes"]))

    def add_favicon(self):
        favicon_orig = self.build_dir / "favicon"
        # if user provided a custom favicon, retrieve that
        if self.favicon:
            handle_user_provided_file(source=self.favicon, dest=favicon_orig)
        # otherwise, get thumbnail from database
        else:
            # add channel thumbnail as favicon
            try:
                favicon_prefix, favicon_data = self.db.get_channel_metadata(
                    self.channel_id)["thumbnail"].split(";base64,", 1)
                favicon_data = base64.standard_b64decode(favicon_data)
                # favicon_mime = favicon_prefix.replace("data:", "")
                with open(favicon_orig, "wb") as fh:
                    fh.write(favicon_data)
                del favicon_data
            except Exception as exc:
                logger.warning(
                    "Unable to extract favicon from DB; using default")
                logger.exception(exc)

                # use a default favicon
                handle_user_provided_file(source=self.templates_dir /
                                          "kolibri-logo.png",
                                          dest=favicon_orig)

        # convert to PNG (might already be PNG but it's OK)
        favicon_fpath = favicon_orig.with_suffix(".png")
        convert_image(favicon_orig, favicon_fpath)

        # resize to 96x96 (ZIM standard is 48x48; doubled for hi-dpi)
        resize_image(favicon_fpath, width=96, height=96, method="thumbnail")

        # generate favicon
        favicon_ico_path = favicon_fpath.with_suffix(".ico")
        create_favicon(src=favicon_fpath, dst=favicon_ico_path)

        self.creator.add_item_for("favicon.png", fpath=favicon_fpath)
        self.creator.add_item_for("favicon.ico", fpath=favicon_ico_path)

    def add_custom_about_and_css(self):
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        if self.about:
            # if user provided a custom about page, use it
            with open(
                    handle_user_provided_file(source=self.about,
                                              in_dir=self.build_dir,
                                              nocopy=True),
                    "r",
            ) as fh:
                soup = BeautifulSoup(fh.read(), "lxml")
                title = soup.find("title").text
                content = soup.select("body > .container")
                # we're only interested in the first one
                # (select() returns a list; may be empty if no match)
                if isinstance(content, list):
                    content = content[0] if content else None
        else:
            title = channel_meta["name"]
            content = None

        html = self.jinja2_env.get_template("about.html").render(
            title=title, content=content, **channel_meta)
        with self.creator_lock:
            self.creator.add_item_for(
                path="about",
                title=title,
                content=html,
                mimetype="text/html",
            )
        del html

        # if user provided a custom CSS file, use it
        if self.css:
            with open(
                    handle_user_provided_file(source=self.css,
                                              in_dir=self.build_dir,
                                              nocopy=True),
                    "r",
            ) as fh:
                content = fh.read()
        # otherwise, create a blank one
        else:
            content = ""

        self.creator.add_item_for("custom.css",
                                  content=content,
                                  mimetype="text/css")
        logger.debug("Added about page and custom CSS")
class WARC2Zim:
    """Converts one or more WARC files into a single ZIM file.

    Orchestrates: main-page/metadata discovery from the WARC records,
    ZIM Creator setup, replay-viewer injection, per-record item creation
    (headers + payload), revisit/fuzzy-redirect handling and illustration
    (favicon) extraction.
    """

    def __init__(self, args):
        """Initialize from parsed CLI arguments (argparse Namespace)."""
        logging.basicConfig(format="[%(levelname)s] %(message)s")
        if args.verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

        # URLs already written to the ZIM (avoid duplicates)
        self.indexed_urls = set({})
        self.output = args.output
        self.zim_file = args.zim_file

        # default ZIM filename: {name}_{YYYY-MM}.zim
        if not self.zim_file:
            self.zim_file = "{name}_{period}.zim".format(
                name=args.name, period=time.strftime("%Y-%m"))

        self.full_filename = os.path.join(self.output, self.zim_file)

        # ensure output file is writable
        with tempfile.NamedTemporaryFile(dir=self.output, delete=True) as fh:
            logger.debug(f"Confirming output is writable using {fh.name}")

        self.inputs = args.inputs
        self.replay_viewer_source = args.replay_viewer_source
        self.custom_css = args.custom_css

        self.main_url = args.url
        # ensure trailing slash is added if missing
        parts = urlsplit(self.main_url)
        if parts.path == "":
            parts = list(parts)
            # set path
            parts[2] = "/"
            self.main_url = urlunsplit(parts)

        self.include_domains = args.include_domains

        self.favicon_url = args.favicon
        self.language = args.lang
        self.title = args.title

        tags = DEFAULT_TAGS + (args.tags or [])

        # metadata forwarded to the ZIM Creator
        self.metadata = {
            "name": args.name,
            "description": args.description,
            "creator": args.creator,
            "publisher": args.publisher,
            "tags": ";".join(tags),
            # optional
            "source": args.source,
            "scraper": "warc2zim " + get_version(),
        }

        # revisit records awaiting a decision (url -> record)
        self.revisits = {}

        # progress file handling
        self.stats_filename = (pathlib.Path(args.progress_file)
                               if args.progress_file else None)
        if self.stats_filename and not self.stats_filename.is_absolute():
            self.stats_filename = self.output / self.stats_filename

        self.written_records = self.total_records = 0

    def add_replayer(self):
        """Add the replay service-worker (SW_JS) to the ZIM.

        Source priority: remote URL, then local path, then the bundled
        static copy.
        """
        if self.replay_viewer_source and re.match(r"^https?\:",
                                                  self.replay_viewer_source):
            self.creator.add_item(
                URLItem(
                    url=self.replay_viewer_source + SW_JS,
                    path="A/" + SW_JS,
                    mimetype="application/javascript",
                ))
        elif self.replay_viewer_source:
            self.creator.add_item_for(
                fpath=self.replay_viewer_source + SW_JS,
                path="A/" + SW_JS,
                mimetype="application/javascript",
            )
        else:
            self.creator.add_item(
                StaticArticle(self.env,
                              SW_JS,
                              self.main_url,
                              mimetype="application/javascript"))

    def init_env(self):
        """Build the jinja2 Environment with i18n support.

        Returns the configured Environment; falls back to null translations
        when no catalog exists for the selected language.
        """
        # autoescape=False to allow injecting html entities from translated text
        env = Environment(
            loader=PackageLoader("warc2zim", "templates"),
            extensions=["jinja2.ext.i18n"],
            autoescape=False,
        )
        try:
            env.install_gettext_translations(Locale.translation)
        except OSError:
            logger.warning(
                "No translations table found for language: {0}".format(
                    self.language))
            env.install_null_translations()
        return env

    def update_stats(self):
        """write progress as JSON to self.stats_filename if requested"""
        if not self.stats_filename:
            return
        self.written_records += 1
        # rewritten wholesale on each record so watchers always see valid JSON
        with open(self.stats_filename, "w") as fh:
            json.dump(
                {
                    "written": self.written_records,
                    "total": self.total_records
                }, fh)

    def get_custom_css_record(self):
        """Build a synthetic WARC response record holding the custom CSS.

        CSS is fetched from a URL or read from a local path, then wrapped
        as a 200 response at CUSTOM_CSS_URL.
        """
        if re.match(r"^https?\://", self.custom_css):
            resp = requests.get(self.custom_css, timeout=10)
            resp.raise_for_status()
            payload = resp.content
        else:
            css_path = pathlib.Path(self.custom_css).expanduser().resolve()
            with open(css_path, "rb") as fh:
                payload = fh.read()

        http_headers = StatusAndHeaders(
            "200 OK",
            [("Content-Type", 'text/css; charset="UTF-8"')],
            protocol="HTTP/1.0",
        )

        return RecordBuilder().create_warc_record(
            CUSTOM_CSS_URL,
            "response",
            payload=io.BytesIO(payload),
            length=len(payload),
            http_headers=http_headers,
        )

    def run(self):
        """Main entry point: returns 100 when there is nothing to process,
        otherwise builds the ZIM from all input WARCs."""
        if not self.inputs:
            logger.info(
                "Arguments valid, no inputs to process. Exiting with error code 100"
            )
            return 100

        self.find_main_page_metadata()

        # make sure Language metadata is ISO-639-3 and setup translations
        try:
            lang_data = get_language_details(self.language)
            self.language = lang_data["iso-639-3"]
            setlocale(
                pathlib.Path(__file__).parent, lang_data.get("iso-639-1"))
        except Exception:
            # NOTE(review): message says "Using `eng`" but self.language is
            # not reset here; the Creator below only falls back to "eng"
            # when self.language is empty — confirm intended behavior
            logger.error(
                f"Invalid language setting `{self.language}`. Using `eng`.")
        self.env = self.init_env()

        # init head insert
        template = self.env.get_template(HEAD_INSERT_FILE)
        self.head_insert = ("<head>" + template.render()).encode("utf-8")
        if self.custom_css:
            self.css_insert = (
                f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
                'rel="Stylesheet" />\n</head>').encode("utf-8")
        else:
            self.css_insert = None

        self.creator = Creator(
            self.full_filename,
            main_path="A/index.html",
            language=self.language or "eng",
            title=self.title,
            date=datetime.date.today(),
            **self.metadata,
        ).start()

        self.add_replayer()

        # bundled static templates (except head-insert and SW, handled above)
        for filename in pkg_resources.resource_listdir("warc2zim",
                                                       "templates"):
            if filename == HEAD_INSERT_FILE or filename == SW_JS:
                continue
            self.creator.add_item(
                StaticArticle(self.env, filename, self.main_url))

        for record in self.iter_all_warc_records():
            self.add_items_for_warc_record(record)

        # process revisits, headers only
        for url, record in self.revisits.items():
            if url not in self.indexed_urls:
                logger.debug("Adding revisit {0} -> {1}".format(
                    url, record.rec_headers["WARC-Refers-To-Target-URI"]))
                self.creator.add_item(WARCHeadersItem(record))
                self.indexed_urls.add(url)

        if self.favicon_url:
            self.add_illustration()

        logger.debug(f"Found {self.total_records} records in WARCs")

        self.creator.finish()

    def iter_all_warc_records(self):
        """Yield every record to process: synthetic custom-CSS record first,
        then all records from the input WARC files."""
        # add custom css records
        if self.custom_css:
            yield self.get_custom_css_record()

        yield from iter_warc_records(self.inputs)

    def find_main_page_metadata(self):
        """Locate the main-page record and derive title/language/favicon.

        Raises KeyError when no record matches the main page URL.
        """
        for record in self.iter_all_warc_records():
            if record.rec_type == "revisit":
                continue

            # if no main_url, use first 'text/html' record as the main page by default
            # not guaranteed to always work
            mime = get_record_mime_type(record)

            url = record.rec_headers["WARC-Target-URI"]

            if (not self.main_url and mime == "text/html"
                    and record.payload_length != 0
                    and (not record.http_headers
                         or record.http_headers.get_statuscode() == "200")):
                self.main_url = url

            # compare ignoring any fragment on the main URL
            if urldefrag(self.main_url).url != url:
                continue

            # if we get here, found record for the main page

            # if main page is not html, still allow (eg. could be text, img),
            # but print warning
            if mime not in HTML_TYPES:
                logger.warning(
                    "Main page is not an HTML Page, mime type is: {0} "
                    "- Skipping Favicon and Language detection".format(mime))
                return

            # content = record.content_stream().read()
            record.buffered_stream.seek(0)
            content = record.buffered_stream.read()

            if not self.title:
                self.title = parse_title(content)
            self.find_icon_and_language(content)

            logger.debug("Title: {0}".format(self.title))
            logger.debug("Language: {0}".format(self.language))
            logger.debug("Favicon: {0}".format(self.favicon_url))
            return

        msg = "Unable to find WARC record for main page: {0}, ZIM not created".format(
            self.main_url)
        logger.error(msg)
        raise KeyError(msg)

    def find_icon_and_language(self, content):
        """Extract favicon URL and page language from main-page HTML bytes,
        filling self.favicon_url / self.language only when unset."""
        soup = BeautifulSoup(content, "html.parser")

        if not self.favicon_url:
            # find icon
            icon = soup.find("link", rel="shortcut icon")
            if not icon:
                icon = soup.find("link", rel="icon")

            if icon:
                self.favicon_url = urljoin(self.main_url, icon.attrs["href"])
            else:
                self.favicon_url = urljoin(self.main_url, "/favicon.ico")

        if not self.language:
            # HTML5 Standard
            lang_elem = soup.find("html", attrs={"lang": True})
            if lang_elem:
                self.language = lang_elem.attrs["lang"]
                return

            # W3C recommendation
            lang_elem = soup.find("meta", {
                "http-equiv": "content-language",
                "content": True
            })
            if lang_elem:
                self.language = lang_elem.attrs["content"]
                return

            # SEO Recommendations
            lang_elem = soup.find("meta", {
                "name": "language",
                "content": True
            })
            if lang_elem:
                self.language = lang_elem.attrs["content"]
                return

    def add_illustration(self, record=None):
        """Add the ZIM illustration, from a WARC record payload when given,
        otherwise by fetching self.favicon_url."""
        if self.favicon_url in self.indexed_urls:
            return

        # add illustration from favicon option or in-warc favicon
        logger.info("Adding illustration from " +
                    (self.favicon_url if record is None else "WARC"))
        favicon_fname = pathlib.Path(urlparse(self.favicon_url).path).name
        # NOTE(review): temp file is created in the process's CWD
        src_illus_fpath = pathlib.Path(".").joinpath(favicon_fname)

        # reusing payload from WARC record
        if record:
            with open(src_illus_fpath, "wb") as fh:
                if hasattr(record, "buffered_stream"):
                    record.buffered_stream.seek(0)
                    fh.write(record.buffered_stream.read())
                else:
                    fh.write(record.content_stream().read())
        # fetching online
        else:
            try:
                handle_user_provided_file(source=self.favicon_url,
                                          dest=src_illus_fpath)
            except Exception as exc:
                logger.warning(
                    "Unable to retrieve favicon. "
                    "ZIM won't have an illustration: {exc}".format(exc=exc))
                return

        # convert to PNG (might already be PNG but it's OK)
        illus_fpath = src_illus_fpath.with_suffix(".png")
        convert_image(src_illus_fpath, illus_fpath)

        # resize to appropriate size (ZIM uses 48x48 so we double for retina)
        for size in (96, 48):
            resize_image(illus_fpath,
                         width=size,
                         height=size,
                         method="thumbnail")
            with open(illus_fpath, "rb") as fh:
                self.creator.add_illustration(size, fh.read())

        src_illus_fpath.unlink()

    def is_self_redirect(self, record, url):
        """True when a response record is a 3xx redirect to its own URL
        (after canonicalization)."""
        if record.rec_type != "response":
            return False
        if not record.http_headers.get_statuscode().startswith("3"):
            return False
        location = record.http_headers["Location"]
        return canonicalize(url) == canonicalize(location)

    def add_items_for_warc_record(self, record):
        """Create ZIM items for one WARC record.

        Responses/resources get a headers item plus a payload item (when
        non-empty); revisit records are stashed for later processing in run().
        """
        url = get_record_url(record)
        if not url:
            logger.debug(
                f"Skipping record with empty WARC-Target-URI {record}")
            return

        if url in self.indexed_urls:
            logger.debug(
                "Skipping duplicate {0}, already added to ZIM".format(url))
            return

        # if include_domains is set, only include urls from those domains
        if self.include_domains:
            parts = urlsplit(url)
            if not any(
                    parts.netloc.endswith(domain)
                    for domain in self.include_domains):
                logger.debug(
                    "Skipping url {0}, outside included domains".format(url))
                return

        if record.rec_type != "revisit":
            if self.is_self_redirect(record, url):
                logger.debug("Skipping self-redirect: " + url)
                return

            self.creator.add_item(WARCHeadersItem(record))

            payload_item = WARCPayloadItem(record, self.head_insert,
                                           self.css_insert)

            if len(payload_item.content) != 0:
                self.creator.add_item(payload_item)
                self.total_records += 1
                self.update_stats()

            if url == self.favicon_url:
                self.add_illustration(record=record)

            self.indexed_urls.add(url)

        elif (record.rec_headers["WARC-Refers-To-Target-URI"] != url
              and url not in self.revisits):
            self.revisits[url] = record

        self.add_fuzzy_match_record(url)

    def add_fuzzy_match_record(self, url):
        """Register a synthetic revisit record redirecting the first matching
        fuzzy form of `url` back to `url` (no-op when no rule matches)."""
        fuzzy_url = url
        # first rule that changes the url wins
        for rule in FUZZY_RULES:
            fuzzy_url = rule["match"].sub(rule["replace"], url)
            if fuzzy_url != url:
                break

        if fuzzy_url == url:
            return

        http_headers = StatusAndHeaders("302 Redirect", {"Location": url})

        date = datetime.datetime.utcnow().isoformat()
        builder = RecordBuilder()
        # digest is the well-known SHA1 of empty payload (revisit, no body)
        record = builder.create_revisit_record(
            fuzzy_url, "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", url, date,
            http_headers)

        self.revisits[fuzzy_url] = record
        logger.debug("Adding fuzzy redirect {0} -> {1}".format(
            fuzzy_url, url))