def _update_md5(self, name):
    """Recompute the shallow and deep MD5 checksums for the entry *name*.

    Both checksums are obtained from ``Filesystem.path_md5`` for the path
    ``self.dir_path/name``. When an entry for *name* is already cached, its
    previous checksum is passed along as ``expect``. The result replaces
    ``self._md5[name]`` while ``self._md5_lock`` is held.

    Raises:
        AssertionError: if there is no input directory, or if *name*
            contains a ``/``.
    """
    assert self.dir_path is not None, "Cannot get MD5 checksum for {} when there is no input directory".format(
        name)
    path = os.path.join(self.dir_path, name)
    assert "/" not in name

    with self._md5_lock:
        # Look up the cached entry once; None means "never checked before".
        cached = self._md5[name] if name in self._md5 else None

        shallow_md5, _ = Filesystem.path_md5(
            path=path,
            shallow=True,
            expect=None if cached is None else cached["shallow"])
        deep_md5, modified = Filesystem.path_md5(
            path=path,
            shallow=False,
            expect=None if cached is None else cached["deep"])

        # Never let the stored "modified" value go backwards.
        previously_modified = 0 if cached is None else cached["modified"]
        modified = max(modified or 0, previously_modified)

        self._md5[name] = {
            "shallow": shallow_md5,
            "shallow_checked": int(time.time()),
            "deep": deep_md5,
            "deep_checked": int(time.time()),
            "modified": modified,
        }
def get_results(self):
    """Download the job's result archive and unpack it into ``self.dir_output``.

    The archive is streamed from ``/jobs/<job_id>/result`` on the engine into
    a temporary ``.zip`` file. If the download produced a non-empty file it
    is extracted with ``Filesystem.unzip``. The temporary file is removed
    when the method returns, also when an exception is raised.
    """
    url = DaisyPipelineJob.encode_url(
        self.engine, "/jobs/{}/result".format(self.job_id), {})

    # Managing the temporary file with ``with`` guarantees that the handle is
    # closed (and the file deleted) instead of waiting for garbage collection.
    with tempfile.NamedTemporaryFile(prefix="daisy-pipeline-results-",
                                     suffix=".zip") as result_obj:
        result = result_obj.name

        with requests.get(url, stream=True) as r:
            # Write through the handle we already own rather than opening
            # the same path a second time.
            shutil.copyfileobj(r.raw, result_obj)
        # Make sure everything is on disk before the size check and unzip.
        result_obj.flush()

        if os.path.isfile(result) and os.path.getsize(result) > 0:
            Filesystem.unzip(self.pipeline.utils.report, result,
                             self.dir_output)
def __init__(self,
             pipeline=None,
             source=None,
             stdout_level="INFO",
             stderr_level="INFO",
             cwd=None):
    """Run Epubcheck on *source* and store the outcome in ``self.success``.

    Args:
        pipeline: object whose ``utils.report`` receives log messages.
        source: path to the EPUB; if it ends with ``.opf`` the surrounding
            EPUB directory is located and zipped first.
        stdout_level: report level passed on for the process' stdout.
        stderr_level: report level passed on for the process' stderr.
        cwd: working directory for the process; defaults to the system
            temporary directory.

    Raises:
        AssertionError: if *pipeline* or *source* is missing, or if no
            EPUB root directory can be found above an ``.opf`` file.
    """
    assert pipeline
    assert source

    cwd = cwd or tempfile.gettempdir()

    self.success = False
    report = pipeline.utils.report

    Epubcheck.init_environment()

    # epubcheck works better when the input is zipped
    if source.lower().endswith(".opf"):
        report.debug("EPUB is not zipped, zipping…")

        # Walk upwards until a directory holding one of the root markers is
        # found; reaching the filesystem root means there is no EPUB root.
        root_markers = {"mimetype", "META-INF"}
        root_path = os.path.dirname(source)
        while True:
            parent_path = os.path.dirname(root_path)
            assert root_path != parent_path, "No mimetype file or META-INF directory found in the EPUB, unable to determine root directory"
            if root_markers.intersection(os.listdir(root_path)):
                break
            root_path = parent_path

        source = Epub(report, root_path).asFile()

    try:
        command = ["java", "-jar", Epubcheck.epubcheck_jar, source]

        report.debug("Running Epubcheck")
        process = Filesystem.run_static(command,
                                        cwd,
                                        report,
                                        stdout_level=stdout_level,
                                        stderr_level=stderr_level)
        self.success = process.returncode == 0

    except subprocess.TimeoutExpired:
        report.error(
            "Epubcheck for {} took too long and were therefore stopped.".format(
                os.path.basename(source)))

    except Exception:
        report.debug(traceback.format_exc(), preformatted=True)
        report.error("An error occured while running Epubcheck (for " +
                     str(source) + ")")
def _update_book_count_thread(self):
    """Background loop that keeps the entries of ``self.book_count`` fresh.

    Once per second, while ``self.should_run`` is true, every registered
    directory whose count is more than 15 seconds old is rescanned with
    ``Filesystem.list_book_dir`` and its ``"count"`` and ``"modified"``
    fields are updated. Errors are logged and the loop continues.
    """
    while self.should_run:
        time.sleep(1)
        try:
            # Iterate over a snapshot of the keys.
            for base_dir in list(self.book_count.keys()):
                entry = self.book_count[base_dir]["parentdirs"]
                if entry:
                    scan_dirs = [os.path.join(base_dir, entry[key])
                                 for key in entry]
                else:
                    scan_dirs = [base_dir]

                is_stale = self.book_count[base_dir]["modified"] + 15 < time.time()
                if is_stale:
                    found = []
                    for scan_dir in scan_dirs:
                        if os.path.isdir(scan_dir):
                            found.extend(Filesystem.list_book_dir(scan_dir))
                    self.book_count[base_dir]["modified"] = time.time()
                    # Count each distinct book once across all scanned dirs.
                    self.book_count[base_dir]["count"] = len(set(found))

                if not self.should_run:
                    break

        except Exception:
            logging.exception("An error occurred while updating book count")
def format_email_report(content, dirs, dir_log, logfile, book_archive):
    """Format the lines of the daily report as an HTML fragment for email.

    Each line of *content* is converted according to its marker:

    * ``(li) `` lines become a one-item bullet list.
    * ``(href) `` lines of the form ``name, path, alt`` become links, one
      for each directory in *dirs* whose UNC path occurs in the line, plus
      one link labelled ``log.txt`` for the first line in a run that
      mentions *logfile*.
    * other non-empty lines containing ``mail:`` become a bold paragraph
      with a link; those containing ``[`` become a bold paragraph.

    Formatting stops after 600 seconds and returns what was built so far.
    *dir_log* and *book_archive* are accepted but not used here.

    Returns:
        str: the HTML fragment (empty if nothing matched).
    """
    # Start of an inline image tag (penguin icon for Linux paths).
    img_string = ("<img src=\""
                  "sTAAALEwEAmpwYAAAAB3RJTUUH4goFCTApeBNtqgAAA2pJREFUOMt1lF1rI2UYhu/JfCST6bRp2kyCjmWzG0wllV1SULTSyoLpcfHU5jyLP6IUQX+"
                  "DLqw/wBbPWiUeaLHBlijpiZbNR9PdDUPKSjL5mszX48Ha6U6HveGBeeGd67nf+3lnGCIi3FKv10e9/hQMw+Du3XuYn4/hjaJbqtVqtL29Tfn8KuXz"
                  "q1QsFqlardKb5AFs26ZyuUzZbJZUVSVFUUgQBBIEgTKZDB0cHJDjOEGAaZrU6XTo6OiICoUCqapKxWKRdnd3aXZ2liRJIkmSaHNzkzqdThDw5Mn3t"
                  "La2Rul0mmKxGOXzq3R4eEiNRoMWFxdJlmWSZZkymQxVKpUAgFtaUvH5w3t43jLx429jXF62sb+/j6urK9i2DZZlAQCu68IwjECG3MbGp7h//wFedp"
                  "9Bc77BTz+Xsbe3BwDeywAgCALC4XAAEGJZFgsLC3j3vQcoPfoSiqKAZdlADYdDnJ2dBQDszs7OzvVCVVXE4/MwXv4NnmMxI8/AcUOwbRuu60LXdWx"
                  "tbYHn+RsHPjuhEBJxEV9/McK3JQsPV+dfnZPjwHEczs/PUS7/4j/C64tut4uZyA9Y+sRG8kMWf/zjwLZthEIhhEIhWJaFx4+/84XpAWzbRvvyL7z/"
                  "cQvMOzKO2wq07r9e9+tqNpuo1WpBQK/XgyQ/gyh8BGADv/+agOu6gTBN00SlUrkZ4/WDruuIzX4ABp9hqA/R6XzlC+t1XVxcYDweIxqN3jgwTRMC/"
                  "xZc+22MR3GY5qvuHMdBEASfi36/j8lk4ncwnU7Bshwsy4JlWV76kiSB4zj0+33Pgeu6cBzHDyAiOI6N6ZQBy7KQJAk8zyORSMAwDIxGIw8giiI4jv"
                  "eH6LouRqMRDGMChmGQTqcRDoeRyWQQDofB87xX8Xgc0ajodyAIAgaDgdelUChA0zTkciuo1+vgOG8rUqkUIpGIHxCPx9FqtbyNc3NzKJVK0DQNROS"
                  "biKIkg2NMJpPQdR2NRhOpVNL7Eh3HgSAIPoBhTEBEYBjmBsCyLJaXlyHLMk5PTyGKIkRRRCQSgaIoGI/HHuD4+Bi5XA4rKytgbv+VNU1Dtfon6vWn"
                  "4Hked+6k0ev1cHJyghcvnnsjlmUZ6+vrQYDjOLAsC5OJAdd1EI1G/78nJtrtCzSaTQz0AVKpJLLZLP4DF17fodMaIVYAAAAASUVORK5CYII")
    # The closing part of the tag is appended per use: "=\" alt=\"DATA\">"

    link_item = "\n<ul>\n<li><a href=\"file:///{}\">{}</a> {}</li>\n</ul>"

    def icon(alt_text):
        # Complete the image tag with the given alt text.
        return img_string + "=\" alt=\"{}\">".format(alt_text)

    parts = []
    first_dir_log = True
    deadline = time.time() + 600

    for line in content:
        if time.time() > deadline:
            break

        if "(li) " in line:
            parts.append("\n<ul>\n<li>" + line.replace("(li) ", "") + "</li>\n</ul>")
            continue

        if "(href) " in line:
            line = line.replace("(href) ", "")
            fields = line.split(", ")

            for directory in dirs:
                dir_unc = Filesystem.networkpath(directory)[2]
                if dir_unc in line and len(fields) == 3:
                    parts.append(link_item.format(fields[1], fields[0], icon(fields[-1])))

            # Only the first log link in a run of (href) lines is emitted.
            if logfile in line and first_dir_log and len(fields) == 3:
                parts.append(link_item.format(fields[1], "log.txt", icon(fields[-1])))
                first_dir_log = False
            continue

        if line == "":
            continue

        # Any other non-empty line starts a new section.
        first_dir_log = True
        if "mail:" in line:
            heading_and_rest = line.split("mail: ")
            mail_fields = heading_and_rest[-1].split(", ")
            parts.append("\n<p><b>{}<a href=\"file:///{}\">Link</a> {}</b></p>".format(
                heading_and_rest[0], mail_fields[0], icon(mail_fields[-1])))
        elif "[" in line:
            parts.append("\n" + "<p><b>" + line + "</b></p>")

    return "".join(parts)
def update():
    """Run ``git pull`` in the project root and report the outcome.

    The project root is taken to be two directories above this file.

    Returns:
        tuple: ``(json, 200)`` with the decoded stdout when the command
        exits with code 0, otherwise ``(json, 500)`` with the decoded stderr.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    project_dir = os.path.normpath(os.path.join(here, "..", ".."))

    process = Filesystem.run_static(["git", "pull"], cwd=project_dir)

    if process.returncode != 0:
        return jsonify(process.stderr.decode("utf-8")), 500
    return jsonify(process.stdout.decode("utf-8")), 200
def setUp(self):
    """Prepare a clean ``target/unittest`` tree and override ``os.unlink``.

    The original ``os.unlink`` is kept in ``self.original_unlink`` so that
    it can be put back after the test.
    """
    print("TEST: setUp (override os.unlink)")

    test_dir = os.path.dirname(os.path.realpath(__file__))
    self.target = os.path.normpath(
        os.path.join(test_dir, '..', 'target', 'unittest'))
    self.dir_in = os.path.join(self.target, "in")
    self.dir_out = os.path.join(self.target, "out")

    self.pipeline = MockPipeline(self.dir_in, self.dir_in, self.dir_out)
    self.filesystem = Filesystem(self.pipeline)

    # Start every test from an empty target directory.
    if os.path.exists(self.target):
        shutil.rmtree(self.target)

    # Replace os.unlink with the test double; keep the real one for later.
    self.original_unlink = os.unlink
    os.unlink = FilesystemTest.unlink
def getDirectories(structure):
    """Return the configured directories as JSON in the requested structure.

    Args:
        structure: ``"ranked"`` returns ``Directory.dirs_ranked``;
            ``"resolved"`` returns a mapping from directory id to its SMB
            network path; anything else returns ``Directory.dirs_flat``.

    For ``"resolved"``, network paths and hosts that are not yet buffered in
    ``Config`` are looked up through ``Filesystem`` and stored there.
    """
    if structure == "ranked":
        return jsonify(Directory.dirs_ranked)

    if structure != "resolved":
        return jsonify(Directory.dirs_flat)

    known_paths = Config.get("buffered_network_paths", {})
    known_hosts = Config.get("buffered_network_hosts", {})

    resolved = {}
    for dir_id in Directory.dirs_flat:
        needs_lookup = isinstance(dir_id, str) and dir_id not in known_paths
        if needs_lookup:
            smb, _file, _unc = Filesystem.networkpath(Directory.dirs_flat[dir_id])
            host = Filesystem.get_host_from_url(smb)

            # Buffer both locally and in the configuration.
            known_paths[dir_id] = smb
            Config.set("buffered_network_paths." + dir_id, smb)
            known_hosts[dir_id] = host
            Config.set("buffered_network_hosts." + dir_id, host)

        resolved[dir_id] = known_paths[dir_id]

    return jsonify(resolved)
def triggerDirectoryEdition(directory_id, edition_id):
    """Trigger *edition_id* in every pipeline that reads from a directory.

    Returns:
        tuple: ``(None, 404)`` if the directory id is unknown or has no
        path, or if the directory holds no book whose file stem equals
        *edition_id*; otherwise ``(json, 200)`` where the JSON body lists
        the uids of the pipelines that were triggered.
    """
    path = None
    if directory_id in Directory.dirs_flat:
        path = os.path.normpath(Directory.dirs_flat[directory_id])
    if not path:
        return None, 404

    available_editions = [Path(book).stem
                          for book in Filesystem.list_book_dir(path)]
    if edition_id not in available_editions:
        return None, 404

    triggered = []
    for pipeline in Pipeline.pipelines:
        # Skip pipelines without an input directory or with a different one.
        if not pipeline.dir_in or os.path.normpath(pipeline.dir_in) != path:
            continue
        pipeline.trigger(edition_id, auto=False)
        triggered.append(pipeline.uid)

    return jsonify(triggered), 200
def _trigger_epub_catalog_thread(self):
    """Background loop that periodically sends the book list to Bibliofil.

    The loop wakes every 5 seconds while ``self.shouldRun`` is true and
    signals the watchdog each time. When the directories are available and
    at least three days have passed since the previous update, the books in
    ``self.dir_out`` are listed and passed to
    ``Bibliofil.update_list_of_books`` with the format ``"XHTML"``.
    """
    # Check for update every 3 days
    max_update_interval = 60 * 60 * 24 * 3
    last_check = 0

    self.watchdog_bark()
    while self.shouldRun:
        time.sleep(5)
        self.watchdog_bark()

        if self.dirsAvailable() and time.time() - last_check >= max_update_interval:
            last_check = time.time()
            logging.info("Updating formatklar and filesize for ebooks")
            list_books = Filesystem.list_book_dir(self.dir_out)
            Bibliofil.update_list_of_books("XHTML", list_books)
def is_available(self):
    """Tell whether ``self.dir_path`` is currently usable.

    The answer is cached for 10 seconds. A directory counts as available
    when it is an existing directory and, if it is a mount point, it
    contains at least one entry. A ``dir_path`` of ``None`` is always
    reported as available.

    Returns:
        bool: the (possibly cached) availability.
    """
    if self.last_availability_check_time >= time.time() - 10:
        # Log based on the cached *result*; the timestamp is always truthy
        # inside this branch, so testing it would never log anything.
        if not self.last_availability_check_result:
            logging.debug("Directory is not available (cached result)" +
                          (": {}".format(self.dir_path) if self.dir_path else ""))
        return self.last_availability_check_result

    self.last_availability_check_time = time.time()

    if self.dir_path is None:
        self.last_availability_check_result = True
        return self.last_availability_check_result

    # Pessimistic default in case one of the checks below raises.
    self.last_availability_check_result = False

    is_mount = Filesystem.ismount(self.dir_path)
    contains_books = False
    if is_mount:
        # One entry is enough; the context manager closes the iterator.
        with os.scandir(self.dir_path) as entries:
            contains_books = next(entries, None) is not None
    mount_is_mounted = not is_mount or contains_books

    is_dir = os.path.isdir(self.dir_path)
    self.last_availability_check_result = is_dir and mount_is_mounted

    if not self.last_availability_check_result:
        logging.warning("Directory is not available: " + str(self.dir_path))
        logging.debug(
            str(self.dir_path) + " is " + ("" if is_dir else "not ") +
            " a directory.")
        logging.debug(
            str(self.dir_path) + " is " + ("" if is_mount else "not ") +
            " a mounted filesystem.")
        logging.debug(
            str(self.dir_path) + " does " +
            ("" if contains_books else "not ") + " contain books.")

    return self.last_availability_check_result
def _trigger_newsletter_thread(self):
    """Background loop that triggers the newsletter of the current month.

    The loop wakes every 5 seconds while ``self.shouldRun`` is true and
    signals the watchdog each time. At most once per hour, and only when
    the directories are available, the identifier ``"120209" + MMYYYY`` is
    computed; if it is not listed in ``self.dir_out`` the pipeline is
    triggered for it.
    """
    max_update_interval = 60 * 60
    last_check = 0

    # If no newsletter this month, trigger newsletter
    self.watchdog_bark()
    while self.shouldRun:
        time.sleep(5)
        self.watchdog_bark()

        if not self.dirsAvailable() or time.time() - last_check < max_update_interval:
            continue
        last_check = time.time()

        self.newsletter_identifier = "120209" + time.strftime("%m%Y")
        self.year_month = datetime.datetime.today().strftime('%Y-%m')

        existing_books = Filesystem.list_book_dir(self.dir_out)
        if self.newsletter_identifier not in existing_books:
            logging.info("Lager nyhetsbrev for: " + self.year_month)
            self.trigger(self.newsletter_identifier)
def on_book(self):
    """Prepare the current book for e-book distribution and archive it.

    Steps, in order: copy the EPUB to a temporary directory, validate and
    replace MathML, run the hidden-headline and prepare-for-ebook
    stylesheets on the first spine document, add the logo and ``ebok.css``
    to the book and the OPF manifest, add a cover image from the NLB API
    if the book has none, update the OPF properties, validate with
    Epubcheck when available, and finally store the result with
    ``self.utils.filesystem.storeBook``.

    Returns:
        bool: ``True`` when the book was converted and stored, ``False``
        on any failure (the report title is set for most failures).

    Raises:
        Exception: if the NLB API answers 200 without a ``"data"`` member.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    # The title is only decoration for the report; ignore lookup failures.
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] +
            ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    # ---------- make a copy of the EPUB ----------

    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    temp_epub = Epub(self.utils.report, temp_epubdir)

    # ---------- adapt the HTML file with XSLT ----------

    opf_path = temp_epub.opf_path()
    if not opf_path:
        self.utils.report.error(self.book["name"] +
                                ": Klarte ikke å finne OPF-fila i EPUBen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    opf_path = os.path.join(temp_epubdir, opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()

    # The content document is the manifest item referenced by the first
    # spine entry.
    html_file = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
    )
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(self.book["name"] +
                                ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    html_dir = os.path.dirname(opf_path)
    html_file = os.path.join(html_dir, html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] +
                                ": Klarte ikke å finne HTML-fila.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    # Scratch file that every XSLT step writes to before the result is
    # copied back over its source.
    temp_xml_obj = tempfile.NamedTemporaryFile()
    temp_xml = temp_xml_obj.name

    # MATHML to stem
    self.utils.report.info("Erstatter evt. MathML i boka...")
    mathml_validation = Mathml_validator(self, source=html_file)
    if not mathml_validation.success:
        self.utils.report.error(
            "NLBPUB contains MathML errors, aborting...")
        return False

    mathML_result = Mathml_to_text(self,
                                   source=html_file,
                                   target=html_file)
    if not mathML_result.success:
        return False

    self.utils.report.info(
        "Lager skjulte overskrifter der det er nødvendig")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                        "create-hidden-headlines.xsl"),
                source=html_file,
                target=temp_xml,
                parameters={
                    "cover-headlines": "from-type",
                    "frontmatter-headlines": "from-type",
                    "bodymatter-headlines": "from-text",
                    "backmatter-headlines": "from-type"
                })
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, html_file)

    self.utils.report.info("Tilpasser innhold for e-bok...")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                        "prepare-for-ebook.xsl"),
                source=html_file,
                target=temp_xml)
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, html_file)

    # Use library-specific logo and stylesheet if available

    library = temp_epub.meta("schema:library")
    library = library.upper() if library else library
    logo = os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                        "{}_logo.png".format(library))

    if os.path.isfile(logo):
        shutil.copy(logo, os.path.join(html_dir, os.path.basename(logo)))

    PrepareForEbook.update_css()

    stylesheet = PrepareForEbook.css_tempfile_obj.name
    if library is not None and library.lower() == "statped":
        stylesheet = PrepareForEbook.css_tempfile_statped_obj.name
    shutil.copy(stylesheet, os.path.join(html_dir, "ebok.css"))

    # NOTE(review): the logo is added to the manifest even when no logo
    # file was copied above — confirm that this is intended.
    self.utils.report.info("Legger til logoen i OPF-manifestet")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                        "add-to-opf-manifest.xsl"),
                source=opf_path,
                target=temp_xml,
                parameters={
                    "href": os.path.basename(logo),
                    "media-type": "image/png"
                })
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, opf_path)

    self.utils.report.info("Legger til CSS-fila i OPF-manifestet")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                        "add-to-opf-manifest.xsl"),
                source=opf_path,
                target=temp_xml,
                parameters={
                    "href": "ebok.css",
                    "media-type": "text/css"
                })
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, opf_path)

    # add cover if missing

    opf_xml = ElementTree.parse(opf_path).getroot()
    cover_id = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[contains(concat(' ', @properties, ' '), ' cover-image ')]/@id"
    )  # from properties
    if not cover_id:
        # <meta name="cover" content="..."/> is a child of <metadata>,
        # not of <manifest>, so that is where it has to be looked up.
        cover_id = opf_xml.xpath(
            "/*/*[local-name()='metadata']/*[@name='cover']/@content"
        )  # from metadata
    if not cover_id:
        cover_id = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[starts-with(@media-type, 'image/') and contains(@href, 'cover')]/@id"
        )  # from filename
    cover_id = cover_id[0] if cover_id else None

    if not cover_id:
        # cover not found in the book, let's try NLBs API

        # NOTE: identifier at this point is the e-book identifier
        edition_url = "{}/editions/{}?creative-work-metadata=none&edition-metadata=all".format(
            Config.get("nlb_api_url"), epub.identifier())

        response = requests.get(edition_url)
        self.utils.report.debug(
            "looking for cover image in: {}".format(edition_url))
        if response.status_code == 200:
            response_json = response.json()
            if "data" not in response_json:
                self.utils.report.debug("response as JSON:")
                self.utils.report.debug(str(response_json))
                raise Exception(
                    "No 'data' in response: {}".format(edition_url))
            data = response_json["data"]
            # A missing key is treated like an edition without a cover.
            cover_url = data.get("coverUrlLarge")
            if cover_url is not None and cover_url.startswith("http"):
                response = requests.get(cover_url)
                if response.status_code == 200:
                    _, extension = os.path.splitext(cover_url)
                    target_href = "cover" + extension
                    target_dir = os.path.dirname(opf_path)
                    with open(os.path.join(target_dir, target_href),
                              "wb") as target_file:
                        target_file.write(response.content)

                    self.utils.report.info(
                        "Legger til bildet av bokomslaget i OPF-manifestet"
                    )
                    # check for png, just in case. Should always be jpg though.
                    if extension.lower() in [".png"]:
                        media_type = "image/png"
                    else:
                        media_type = "image/jpeg"
                    xslt = Xslt(self,
                                stylesheet=os.path.join(
                                    Xslt.xslt_dir, PrepareForEbook.uid,
                                    "add-to-opf-manifest.xsl"),
                                source=opf_path,
                                target=temp_xml,
                                parameters={
                                    "href": target_href,
                                    "media-type": media_type
                                })
                    if not xslt.success:
                        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
                        return False
                    shutil.copy(temp_xml, opf_path)

                    opf_xml = ElementTree.parse(opf_path).getroot()
                    cover_id = opf_xml.xpath(
                        "/*/*[local-name()='manifest']/*[@href = '{}']/@id"
                        .format(target_href))  # from filename
                    cover_id = cover_id[0] if cover_id else None

    if not cover_id:
        self.utils.report.warn(
            "Klarte ikke å finne bilde av bokomslaget for {}".format(
                epub.identifier()))

    self.utils.report.info("Legger til properties i OPF etter behov")
    temp_epub.update_opf_properties()

    # validate with epubcheck
    if Epubcheck.isavailable():
        epubcheck = Epubcheck(self, opf_path)
        if not epubcheck.success:
            # Keep copies of the OPF and the HTML for debugging.
            tempfile_stored_opf = os.path.join(
                self.utils.report.reportDir(), os.path.basename(opf_path))
            shutil.copy(opf_path, tempfile_stored_opf)
            tempfile_stored = os.path.join(self.utils.report.reportDir(),
                                           os.path.basename(html_file))
            shutil.copy(html_file, tempfile_stored)
            self.utils.report.info(
                f"Validering av EPUB feilet, lagrer temp fil for feilsøking: {tempfile_stored}"
            )
            self.utils.report.attachment(None, tempfile_stored, "DEBUG")
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            # Return False like every other failure path in this method.
            return False
    else:
        self.utils.report.warn(
            "Epubcheck er ikke tilgjengelig, EPUB blir ikke validert!")

    # ---------- store the fileset ----------

    self.utils.report.info(
        "Boken ble konvertert. Kopierer til HTML-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(
        temp_epubdir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Archive the current EPUB if the NLB API says it belongs to Statped.

    The book is copied to a temporary directory, its creative work and
    edition metadata are requested from the API (up to five attempts), and
    the copy is stored with ``self.utils.filesystem.storeBook`` when the
    edition's library is Statped. The source is deleted after storing.

    Returns:
        ``True`` when the book was stored; ``False`` when metadata is
        missing or the book is not a Statped book; ``None`` when the input
        is not an EPUB or has no identifier.
    """
    epub = Epub(self.utils.report, self.book["source"])

    # The title is only decoration for the report; ignore lookup failures.
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] +
            ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    temp_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_dir)

    self.utils.report.info("Henter metadata fra api.nlb.no")
    creative_work_metadata = None
    edition_metadata = None

    # Up to five attempts; stop as soon as the creative work is found.
    for _attempt in range(5):
        creative_work_metadata = Metadata.get_creative_work_from_api(
            self.book["name"],
            editions_metadata="all",
            use_cache_if_possible=True,
            creative_work_metadata="all")
        edition_metadata = Metadata.get_edition_from_api(self.book["name"])
        if creative_work_metadata is not None:
            break

    if creative_work_metadata is None:
        self.utils.report.warning(
            "Klarte ikke finne et åndsverk tilknyttet denne utgaven. Prøver igjen senere."
        )
        return False

    # The edition lookup can fail independently of the creative work
    # lookup, so do not assume that the library is present.
    try:
        library = edition_metadata["library"]
    except (KeyError, TypeError):
        library = None
    if not isinstance(library, str):
        self.utils.report.error(
            self.book["name"] +
            ": Klarte ikke å bestemme bibliotek fra utgave-metadata.")
        return False

    # Compare case-insensitively, in case of wrong upper/lower case.
    if library.lower() != "statped":
        self.utils.report.error("Ikke en Statped bok. Avbryter")
        self.utils.report.should_email = False
        return False

    self.utils.report.info("Kopierer til EPUB master-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(
        temp_dir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
    self.utils.filesystem.deleteSource()
    return True
def on_book(self):
    """Convert the current NLBPUB to a multi-document EPUB and archive it.

    Steps, in order: copy the book to a temporary directory, flatten the
    single content document, split it into several XHTML files, update the
    OPF spine, regenerate the navigation document, update OPF properties,
    validate with Epubcheck when available, store the result and notify
    Bibliofil.

    Returns:
        ``True`` when the book was converted and stored, ``None`` on any
        failure (the report title is then set to a failure message).
    """
    report = self.utils.report
    source = self.book["source"]

    report.attachment(None, source, "DEBUG")
    epub = Epub(report, source)

    # The title is only decoration for the report; ignore lookup failures.
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    def mark_failed(subject, suffix=epubTitle):
        # Set the report title for a failed conversion of *subject*.
        report.title = self.title + ": " + subject + " feilet 😭👎" + suffix

    def stylesheet(filename):
        # Full path to one of this pipeline's stylesheets.
        return os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid, filename)

    # check that this is an EPUB
    if not epub.isepub():
        mark_failed(self.book["name"], suffix="")
        return

    if not epub.identifier():
        report.error(
            self.book["name"] +
            ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        mark_failed(self.book["name"], suffix="")
        return

    # ---------- make a copy of the EPUB ----------

    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(report, source, temp_epubdir)
    temp_epub = Epub(report, temp_epubdir)

    # ---------- adapt the HTML file with XSLT ----------

    relative_opf = temp_epub.opf_path()
    if not relative_opf:
        report.error(self.book["name"] +
                     ": Klarte ikke å finne OPF-fila i EPUBen.")
        mark_failed(self.book["name"])
        return
    opf_path = os.path.join(temp_epubdir, relative_opf)

    # The content document is the manifest item referenced by the first
    # spine entry.
    first_spine_hrefs = ElementTree.parse(opf_path).getroot().xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
    )
    if not first_spine_hrefs or not first_spine_hrefs[0]:
        report.error(self.book["name"] +
                     ": Klarte ikke å finne HTML-fila i OPFen.")
        mark_failed(self.book["name"])
        return

    html_file = os.path.join(os.path.dirname(opf_path), first_spine_hrefs[0])
    if not os.path.isfile(html_file):
        report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        mark_failed(self.book["name"])
        return
    content_dir = os.path.dirname(html_file)

    # Scratch file that the XSLT steps write to.
    temp_xml_obj = tempfile.NamedTemporaryFile()
    temp_xml = temp_xml_obj.name

    report.info("Flater ut NLBPUB")
    flatten = Xslt(self,
                   stylesheet=stylesheet("nlbpub-flatten.xsl"),
                   source=html_file,
                   target=temp_xml)
    if not flatten.success:
        mark_failed(epub.identifier())
        return
    shutil.copy(temp_xml, html_file)

    report.info("Deler opp NLBPUB i flere HTML-filer")
    split = Xslt(self,
                 stylesheet=stylesheet("nlbpub-split.xsl"),
                 source=html_file,
                 target=temp_xml,
                 parameters={"output-dir": content_dir})
    if not split.success:
        mark_failed(epub.identifier())
        return
    os.remove(html_file)

    # Every XHTML file next to the original document, except the
    # navigation document and the original itself, goes into the spine.
    excluded = ["nav.xhtml", os.path.basename(html_file)]
    spine_hrefs = [
        href for href in sorted(os.listdir(content_dir))
        if href.endswith(".xhtml") and href not in excluded
    ]

    report.info("Oppdaterer OPF-fil")
    update_opf = Xslt(self,
                      stylesheet=stylesheet("update-opf.xsl"),
                      source=opf_path,
                      target=temp_xml,
                      parameters={"spine-hrefs": ",".join(spine_hrefs)})
    if not update_opf.success:
        mark_failed(epub.identifier())
        return
    shutil.copy(temp_xml, opf_path)

    nav_path = os.path.join(temp_epubdir, temp_epub.nav_path())

    report.info("Lager nytt navigasjonsdokument")
    generate_nav = Xslt(self,
                        stylesheet=stylesheet("generate-nav.xsl"),
                        source=opf_path,
                        target=nav_path)
    if not generate_nav.success:
        mark_failed(epub.identifier())
        return

    report.info("Legger til properties i OPF etter behov")
    temp_epub.update_opf_properties()

    if not Epubcheck.isavailable():
        report.warn("Epubcheck not available, EPUB will not be validated!")
    elif not Epubcheck(self, opf_path).success:
        mark_failed(epub.identifier())
        return

    report.info("Boken ble konvertert. Kopierer til e-bok-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(
        temp_epubdir, temp_epub.identifier())
    report.attachment(None, archived_path, "DEBUG")
    Bibliofil.book_available(NlbpubToEpub.publication_format,
                             temp_epub.identifier())
    report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Convert the current single-document NLBPUB to a DTBook and archive it.

    Steps, in order: locate the only spine document, copy its directory to
    a temporary directory, delete EPUB-specific files from the copy, run
    MathML validation and replacement, apply a series of XSLT stylesheets
    (each one writes to a scratch file that is then copied back over the
    working file), validate the result with RelaxNG and Schematron, and
    store the directory with ``self.utils.filesystem.storeBook``.

    Returns ``True`` when the book was stored and ``False`` on any failure.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    self.utils.report.info("Locating HTML file")
    epub = Epub(self.utils.report, self.book["source"])
    if not epub.isepub():
        return False
    # NOTE(review): this assertion cannot fail after the check above unless
    # isepub() gives a different answer on the second call.
    assert epub.isepub(), "The input must be an EPUB"
    spine = epub.spine()
    if not len(spine) == 1:
        self.utils.report.warn(
            "There must only be one item in the EPUB spine")
        return False
    html_file = os.path.join(self.book["source"],
                             os.path.dirname(epub.opf_path()),
                             spine[0]["href"])

    identifier = epub.identifier()

    self.utils.report.info("lag en kopi av boka")
    # Work on a copy of the directory that holds the content document.
    temp_resultdir_obj = tempfile.TemporaryDirectory()
    temp_resultdir = temp_resultdir_obj.name
    Filesystem.copy(self.utils.report, os.path.dirname(html_file),
                    temp_resultdir)
    # The working file; it starts as XHTML and ends up as DTBook.
    temp_result = os.path.join(temp_resultdir, identifier + ".xml")

    self.utils.report.info("sletter EPUB-spesifikke filer")
    # Remove content documents, SMIL, audio and OPF files from the copy.
    for root, dirs, files in os.walk(temp_resultdir):
        for file in files:
            if Path(file).suffix.lower() in [
                    ".xhtml", ".html", ".smil", ".mp3", ".wav", ".opf"
            ]:
                os.remove(os.path.join(root, file))
    # Put the content document back under its new name.
    shutil.copy(html_file, temp_result)

    # Scratch file that every XSLT step writes to.
    temp_xslt_output_obj = tempfile.NamedTemporaryFile()
    temp_xslt_output = temp_xslt_output_obj.name

    # MATHML to stem
    self.utils.report.info("Erstatter evt. MathML i boka...")
    mathml_validation = Mathml_validator(self, source=temp_result)
    if not mathml_validation.success:
        return False

    mathML_result = Mathml_to_text(self,
                                   source=temp_result,
                                   target=temp_result)
    if not mathML_result.success:
        return False

    self.utils.report.info("Fikser Webarch-oppmerking")
    self.utils.report.debug("webarch-fixup.xsl")
    self.utils.report.debug(" source = " + temp_result)
    self.utils.report.debug(" target = " + temp_xslt_output)
    # NOTE(review): this stylesheet is taken from NlbpubToNarrationEpub's
    # directory, unlike the others below — presumably shared on purpose.
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir,
                                        NlbpubToNarrationEpub.uid,
                                        "webarch-fixup.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    self.utils.report.info("Setter inn lydbokavtalen...")
    self.utils.report.debug("bokinfo-tts-dtbook.xsl")
    self.utils.report.debug(" source = " + temp_result)
    self.utils.report.debug(" target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir,
                                        NlbpubToTtsDtbook.uid,
                                        "bokinfo-tts-dtbook.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    # Ask the API for the creative work, at most five times. Page numbers
    # are removed when the work is flagged as a magazine.
    creative_work_metadata = None
    timeout = 0

    while creative_work_metadata is None and timeout < 5:

        timeout = timeout + 1
        creative_work_metadata = Metadata.get_creative_work_from_api(
            identifier,
            editions_metadata="all",
            use_cache_if_possible=True,
            creative_work_metadata="all")
        if creative_work_metadata is not None:
            if creative_work_metadata["magazine"] is True:
                self.utils.report.info(
                    "Fjerner sidetall fordi det er et tidsskrift...")
                self.utils.report.debug("remove-pagenum.xsl")
                self.utils.report.debug(" source = " + temp_result)
                self.utils.report.debug(" target = " + temp_xslt_output)
                xslt = Xslt(self,
                            stylesheet=os.path.join(
                                Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                                "remove-pagenum.xsl"),
                            source=temp_result,
                            target=temp_xslt_output)
                if not xslt.success:
                    return False
                shutil.copy(temp_xslt_output, temp_result)
            break

    # Missing metadata is not fatal here; the conversion continues.
    if creative_work_metadata is None:
        self.utils.report.warning(
            "Klarte ikke finne et åndsverk tilknyttet denne utgaven. Konverterer likevel."
        )

    # Copy the library-specific logo into an "images" directory, if a logo
    # file exists for the library named in the book's metadata.
    library = epub.meta("schema:library")
    library = library.upper() if library else library
    logo = os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                        "{}_logo.png".format(library))

    if os.path.isfile(logo):
        # epub_dir = os.path.join(temp_resultdir, "EPUB")
        image_dir = os.path.join(temp_resultdir, "images")
        if not os.path.isdir(image_dir):
            os.mkdir(image_dir)
        shutil.copy(logo, image_dir)

    self.utils.report.info("Konverterer fra XHTML5 til DTBook...")
    self.utils.report.debug("html-to-dtbook.xsl")
    self.utils.report.debug(" source = " + temp_result)
    self.utils.report.debug(" target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir,
                                        NlbpubToTtsDtbook.uid,
                                        "html-to-dtbook.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    self.utils.report.info("Gjør tilpasninger i DTBook")
    self.utils.report.debug("dtbook-cleanup.xsl")
    self.utils.report.debug(" source = " + temp_result)
    self.utils.report.debug(" target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir,
                                        NlbpubToTtsDtbook.uid,
                                        "dtbook-cleanup.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    # Remove this transformation if critical problems arise with the
    # handling of complex content
    self.utils.report.info(
        "Legger inn ekstra informasjon om komplekst innhold")
    self.utils.report.debug("optimaliser-komplekst-innhold.xsl")
    self.utils.report.debug(" source = " + temp_result)
    self.utils.report.debug(" target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(
                    Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                    "optimaliser-komplekst-innhold.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    self.utils.report.info("Validerer DTBook...")

    # NOTE: This RelaxNG schema assumes that we're using DTBook 2005-3 and MathML 3.0
    dtbook_relax = Relaxng(
        self,
        relaxng=os.path.join(
            Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
            "dtbook-schema/rng/dtbook-2005-3.mathml-3.integration.rng"),
        source=temp_result)
    dtbook_sch = Schematron(self,
                            schematron=os.path.join(
                                Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                                "dtbook-schema/sch/dtbook.mathml.sch"),
                            source=temp_result)
    # Both validators always run, so both kinds of error get reported.
    if not dtbook_relax.success:
        self.utils.report.error("Validering av DTBook feilet (RelaxNG)")
    if not dtbook_sch.success:
        self.utils.report.error("Validering av DTBook feilet (Schematron)")
    if not dtbook_relax.success or not dtbook_sch.success:
        # Keep a copy of the invalid file in the report directory.
        tempfile_stored = os.path.join(self.utils.report.reportDir(),
                                       os.path.basename(temp_result))
        shutil.copy(temp_result, tempfile_stored)
        self.utils.report.info(
            f"Validering av DTBook feilet, lagrer temp fil for feilsøking: {tempfile_stored}"
        )
        self.utils.report.attachment(None, tempfile_stored, "DEBUG")
        return False

    self.utils.report.info(
        "Boken ble konvertert. Kopierer til DTBook-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(
        temp_resultdir, identifier)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    return True
def test_copy_locked_files(self):
    """Copy three book folders twice each and check files and log messages.

    book1 has only ordinary files, book2 additionally has an
    "images/Thumbs.db" file and book3 an "images/locked" file. The test
    expects "Thumbs.db" to be absent from the copy, "locked" to be present,
    and no [WARN]/[ERROR] messages, except that the second copy of book3
    must produce at least one [ERROR] message mentioning "/locked".
    """

    def create_book(name, image_names):
        # Build <dir_in>/<name> with an ncc.html and the given image files.
        book_dir = os.path.join(self.dir_in, name)
        os.makedirs(os.path.join(book_dir, "images"))
        Path(os.path.join(book_dir, "ncc.html")).touch()
        for image_name in image_names:
            Path(os.path.join(book_dir, "images/" + image_name)).touch()
        return book_dir

    def messages_with_prefix(prefix):
        # All pipeline messages logged so far that start with the prefix.
        return [m for m in self.pipeline.messages if m.startswith(prefix)]

    def copy_and_check_files(source, target, expected_images):
        # Copy the book and verify the resulting directory listings.
        Filesystem.copy(self.pipeline.utils.report, source, target)
        top_level = sorted(os.listdir(target))
        self.assertEqual(top_level, ["images", "ncc.html"])
        images = sorted(os.listdir(os.path.join(target, "images")))
        self.assertEqual(images, expected_images)

    def copy_and_expect_clean_log(source, target, expected_images):
        copy_and_check_files(source, target, expected_images)
        self.assertTrue(len(messages_with_prefix("[ERROR]")) == 0)
        self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)

    print("creating book without any locked files")
    book1 = create_book("book1", ["Image.png", "zmage.png"])

    print("creating book with a \"Thumbs.db\" file locked by Windows")
    book2 = create_book("book2", ["Image.png", "Thumbs.db", "zmage.png"])

    print("creating book with a \"locked\" file locked by Windows")
    book3 = create_book("book3", ["Image.png", "locked", "zmage.png"])

    target_book1 = os.path.join(self.dir_out, "book1")
    target_book2 = os.path.join(self.dir_out, "book2")
    target_book3 = os.path.join(self.dir_out, "book3")

    plain_images = ["Image.png", "zmage.png"]
    images_with_locked = ["Image.png", "locked", "zmage.png"]

    print("copy book1 to target_book1")
    copy_and_expect_clean_log(book1, target_book1, plain_images)

    print("copy book1 to target_book1 once more")
    copy_and_expect_clean_log(book1, target_book1, plain_images)

    print("copy book2 to target_book2")
    copy_and_expect_clean_log(book2, target_book2, plain_images)

    print("copy book2 to target_book2 once more")
    copy_and_expect_clean_log(book2, target_book2, plain_images)

    print("copy book3 to target_book3")
    copy_and_expect_clean_log(book3, target_book3, images_with_locked)

    print("copy book3 to target_book3 once more")
    copy_and_check_files(book3, target_book3, images_with_locked)
    self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)
    locked_errors = [
        m for m in messages_with_prefix("[ERROR]") if "/locked" in m
    ]
    self.assertTrue(len(locked_errors) >= 1)
def on_book(self):
    """Archive only the files of an EPUB that changed since the last version.

    The incoming book is copied to a temporary directory and compared with
    the most recent stored version of each file below
    ``<dir_out>/<identifier>``. Unchanged files are removed from the copy,
    the rest is stored in a sub-directory named after the current
    timestamp. Three bookkeeping files are maintained:

    * ``changelog.txt`` - appended history of changed/added/deleted files
    * ``deleted.yml``   - file path -> timestamp of the run that deleted it
    * ``files.yml``     - file path -> location of its most recent version

    Returns:
        True when the book was processed, False when the source is not an
        EPUB or has no numeric identifier.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # Best effort: the title is only decoration for the report title.
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    identifier = epub.identifier()
    if not identifier or not identifier.isnumeric():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    self.utils.report.info("Lager en kopi av EPUBen")
    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)

    book_dir = os.path.join(self.dir_out, identifier)
    if not os.path.exists(book_dir):
        os.makedirs(book_dir)

    time_created = time.strftime("%Y-%m-%dT%H:%M:%S")
    dictfiles = {}
    changelog = "changelog.txt"
    deleted = "deleted.yml"
    files = "files.yml"
    extra_files = [changelog, deleted, files, "restore_files.py"]
    changes_made = False
    new_epub = False

    # Overview of deleted files and changelog history
    deleted_path = os.path.join(book_dir, deleted)
    changelog_path = os.path.join(book_dir, changelog)
    deleted_doc = {}
    if os.path.isfile(deleted_path):
        # NOTE(review): yaml.FullLoader is used on a file this pipeline
        # writes itself; consider yaml.safe_load if the archive directory
        # can be modified by untrusted parties.
        with open(deleted_path, 'r') as f:
            deleted_doc = yaml.load(f, Loader=yaml.FullLoader) or {}

    # dictfiles maps each file to its most recent stored version
    # (the lexicographically greatest relative path wins).
    for path, _, file_list in walk(book_dir):
        for file_name in file_list:
            if file_name in extra_files:
                continue
            file_path = os.path.join(path, file_name)
            # relpath instead of str.replace: replace() would also remove
            # the base path if it happened to occur again further down.
            relative_path = os.path.relpath(file_path, book_dir)
            short_path = self.short_path_by_one(relative_path)
            if short_path not in dictfiles or dictfiles[short_path] < relative_path:
                dictfiles[short_path] = relative_path

    new_file_list = []
    changelog_string = ""
    file_added_again = False

    # Compare every incoming file with its latest stored version
    for temp_path, _, temp_file_list in walk(temp_epubdir):
        for temp_file_name in temp_file_list:
            full_temp_file_path = os.path.join(temp_path, temp_file_name)
            temp_file = os.path.relpath(full_temp_file_path, temp_epubdir)
            new_file_list.append(temp_file)

            if temp_file in dictfiles:
                stored_version = os.path.join(book_dir, dictfiles[temp_file])
                # Compare once; the original evaluated filecmp.cmp twice.
                if filecmp.cmp(full_temp_file_path, stored_version):
                    # Unchanged: no need to store another copy.
                    os.remove(full_temp_file_path)
                else:
                    changes_made = True
                    dictfiles[temp_file] = os.path.join(time_created, temp_file)
                    self.utils.report.info("Fil endret: " + temp_file)
                    changelog_string += "\n{}: Fil endret: {}".format(time_created, temp_file)
            else:
                if not dictfiles:
                    # Nothing stored yet: this is the first version.
                    new_epub = True
                changes_made = True
                dictfiles[temp_file] = os.path.join(time_created, temp_file)
                if not new_epub:
                    self.utils.report.info("Fil lagt til: " + temp_file)
                    changelog_string += "\n{}: Fil lagt til: {}".format(time_created, temp_file)

            if temp_file in deleted_doc:
                changes_made = True
                file_added_again = True
                deleted_doc.pop(temp_file, None)
                self.utils.report.info("Fil lagt til på nytt: " + temp_file)
                changelog_string += "\n{}: Fil lagt til på nytt: {}".format(time_created, temp_file)

    subdirs = next(walk(temp_epubdir))[1]
    for subdir in subdirs:
        self.del_empty_dirs(temp_epubdir, subdir)

    if file_added_again:
        # Rewrite deleted.yml without the re-added files. Each remaining
        # entry keeps its ORIGINAL deletion timestamp; stamping them with
        # the current time would falsify the deletion history.
        with open(deleted_path, 'w') as deleted_file:
            for key, deleted_at in deleted_doc.items():
                deleted_file.write("\n'{}': '{}'".format(
                    key.replace("'", "''"),
                    str(deleted_at).replace("'", "''")))

    # Deleted file history saved to deleted.yml
    current_files = set(new_file_list)
    with open(deleted_path, self.append_write(deleted_path)) as deleted_file:
        for key in dictfiles:
            if key not in current_files and key not in deleted_doc and key not in extra_files:
                changes_made = True
                self.utils.report.info("Fil slettet: " + key)
                changelog_string += "\n{}: Fil slettet: {}".format(time_created, key)
                deleted_file.write("\n'{}': '{}'".format(
                    key.replace("'", "''"),
                    time_created.replace("'", "''")))

    # Changelog saved to changelog.txt
    with open(changelog_path, self.append_write(changelog_path)) as changelog_file:
        changelog_file.write(changelog_string)

    # Reload deleted.yml (it may just have been extended) and drop deleted
    # files from the overview that goes into files.yml.
    deleted_doc = {}
    if os.path.isfile(deleted_path):
        with open(deleted_path, 'r') as f:
            deleted_doc = yaml.load(f, Loader=yaml.FullLoader) or {}
    for del_file in deleted_doc:
        try:
            del dictfiles[del_file]
        except Exception:
            self.utils.report.debug(traceback.format_exc(), preformatted=True)

    with open(os.path.join(temp_epubdir, files), 'w') as files_doc:
        for file_key, file_location in dictfiles.items():
            files_doc.write("\n'{}': '{}'".format(
                file_key.replace("'", "''"),
                file_location.replace("'", "''")))

    # Save copy of different files in NLBPUB master. Different versions of files under NLBPUB-tidligere/xxxxxxx/time
    # To restore a certain version copy files from the each folder up to the wanted version to a new folder
    archived_path, stored = self.utils.filesystem.storeBook(
        temp_epubdir, identifier, subdir=time_created)
    self.utils.report.attachment(None, archived_path, "DEBUG")

    if changes_made:
        if new_epub:
            self.utils.report.info(
                "Endringer oppdaget for: " + identifier +
                ", ny epub ble kopiert til NLBpub tidligere versjoner.")
            self.utils.report.title = (self.title + ": " + identifier +
                                       " 👍😄" + epubTitle + " , ny epub ble kopiert")
        else:
            self.utils.report.info(
                "Endringer oppdaget for: " + identifier +
                ", endrede filer ble kopiert til NLBpub tidligere versjoner.")
            self.utils.report.title = (self.title + ": " + identifier +
                                       " 👍😄" + epubTitle + " , endring registrert")
    else:
        self.utils.report.info("Ingen endringer oppdaget for " + identifier)
        self.utils.report.title = (self.title + ": " + identifier +
                                   " 👍😄" + epubTitle + " , ingen endring registrert")
        self.utils.report.should_email = False
    return True
def on_book(self):
    """Validate an incoming Nordic EPUB and store it in the master archive.

    Steps, in order:

    1. Make a copy of the EPUB where all images except ``cover.jpg`` are
       replaced by a single ``images/dummy.jpg``; before that, check that
       image files, OPF image items and HTML image references agree.
    2. Run the ``nordic-epub3-validate`` DAISY Pipeline job on that copy
       and attach its HTML report.
    3. Validate MathML in every XHTML document except the navigation
       document.
    4. Fix file permissions, try to generate an ACE accessibility report
       (failures here are only warnings) and store the book.

    Returns:
        True when the book is valid and stored. False or None when any
        step fails (the early identification/validation failures use a
        bare ``return``, as before).
    """
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # Best effort: the title is only decoration for the report title.
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    self.utils.report.info("Lager en kopi av EPUBen med tomme bildefiler")
    temp_noimages_epubdir_obj = tempfile.TemporaryDirectory()
    temp_noimages_epubdir = temp_noimages_epubdir_obj.name
    Filesystem.copy(self.utils.report, epub.asDir(), temp_noimages_epubdir)

    content_dir = os.path.join(temp_noimages_epubdir, "EPUB")
    images_dir = os.path.join(content_dir, "images")
    if os.path.isdir(images_dir):
        temp_xml_obj = tempfile.NamedTemporaryFile()
        temp_xml = temp_xml_obj.name

        opf_image_references = []
        html_image_references = {}
        for root, dirs, files in os.walk(content_dir):
            for file in files:
                if file.endswith(".opf"):
                    opf_file = os.path.join(root, file)
                    self.utils.report.info(
                        "Fjerner alle bildereferanser fra OPFen, og erstatter med en referanse til dummy.jpg...")
                    opf_xml_document = ElementTree.parse(opf_file)
                    opf_xml = opf_xml_document.getroot()
                    image_items = opf_xml.xpath(
                        "//*[local-name()='item' and starts-with(@media-type, 'image/')]")
                    replaced = False
                    for image_item in image_items:
                        # Remember the original reference before rewriting it
                        if image_item.attrib["href"] not in opf_image_references:
                            opf_image_references.append(image_item.attrib["href"])

                        if image_item.get("href") == "images/cover.jpg":
                            pass  # don't change the reference to cover.jpg
                        elif not replaced:
                            # First image item becomes the single dummy item
                            image_item.attrib["href"] = "images/dummy.jpg"
                            replaced = True
                        else:
                            image_item.getparent().remove(image_item)

                    opf_xml_document.write(opf_file,
                                           method='XML',
                                           xml_declaration=True,
                                           encoding='UTF-8',
                                           pretty_print=False)

                if file.endswith(".xhtml"):
                    html_file = os.path.join(root, file)

                    html_xml_document = ElementTree.parse(html_file)
                    html_xml = html_xml_document.getroot()
                    image_references = html_xml.xpath("//@href | //@src | //@altimg")
                    for reference in image_references:
                        path = reference.split("#")[0]
                        if path.startswith("images/"):
                            if path not in html_image_references:
                                html_image_references[path] = []
                            html_image_references[path].append(file)

                    self.utils.report.info(
                        "Erstatter alle bildereferanser med images/dummy.jpg...")
                    self.utils.report.debug("dummy-jpg.xsl")
                    self.utils.report.debug(" source = " + html_file)
                    self.utils.report.debug(" target = " + temp_xml)
                    xslt = Xslt(self,
                                stylesheet=os.path.join(Xslt.xslt_dir, IncomingNordic.uid, "dummy-jpg.xsl"),
                                source=html_file,
                                target=temp_xml)
                    if not xslt.success:
                        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
                        return False
                    shutil.copy(temp_xml, html_file)

        # validate for the presence of image files here, since epubcheck won't be able to do it anymore after we change the EPUB
        image_files_present = []
        for root, dirs, files in os.walk(images_dir):
            for file in files:
                fullpath = os.path.join(root, file)
                relpath = os.path.relpath(fullpath, content_dir)
                image_files_present.append(relpath)

        image_error = False
        for file in image_files_present:
            if file not in opf_image_references:
                self.utils.report.error("Bildefilen er ikke deklarert i OPFen: " + file)
                image_error = True
        for file in opf_image_references:
            if file not in image_files_present:
                self.utils.report.error("Bildefilen er deklarert i OPFen, men finnes ikke: " + file)
                image_error = True
        for file in html_image_references:
            if file not in opf_image_references:
                self.utils.report.error(
                    "Bildefilen er deklarert i HTMLen, men finnes ikke: " + file +
                    " (deklarert i: " + ", ".join(html_image_references[file]) + ")")
                image_error = True
        if image_error:
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            return False

        for root, dirs, files in os.walk(images_dir):
            for file in files:
                if file == "cover.jpg":
                    continue  # don't delete the cover file
                fullpath = os.path.join(root, file)
                os.remove(fullpath)
        shutil.copy(
            os.path.join(Xslt.xslt_dir, IncomingNordic.uid, "reference-files", "demobilde.jpg"),
            os.path.join(images_dir, "dummy.jpg"))

    temp_noimages_epub = Epub(self.utils.report, temp_noimages_epubdir)

    self.utils.report.info("Validerer EPUB med epubcheck og nordiske retningslinjer...")
    epub_noimages_file = temp_noimages_epub.asFile()
    with DaisyPipelineJob(self,
                          "nordic-epub3-validate",
                          {"epub": os.path.basename(epub_noimages_file)},
                          priority="high",
                          pipeline_and_script_version=[
                              ("1.13.6", "1.4.6"),
                              ("1.13.4", "1.4.5"),
                              ("1.12.1", "1.4.2"),
                              ("1.11.1-SNAPSHOT", "1.3.0"),
                          ],
                          context={
                              os.path.basename(epub_noimages_file): epub_noimages_file
                          }) as dp2_job:

        # get validation report
        report_file = os.path.join(dp2_job.dir_output, "html-report/report.xhtml")
        if os.path.isfile(report_file):
            with open(report_file, 'r') as result_report:
                self.utils.report.attachment(
                    result_report.readlines(),
                    os.path.join(self.utils.report.reportDir(), "report.html"),
                    "SUCCESS" if dp2_job.status == "SUCCESS" else "ERROR")

        if dp2_job.status != "SUCCESS":
            self.utils.report.error("Klarte ikke å validere boken")
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            return

    self.utils.report.debug("Making a copy of the EPUB to work on…")
    epub_fixed, epub_fixed_obj = epub.copy()
    epub_unzipped = epub_fixed.asDir()
    # Normalised so that it can be compared with the paths built below.
    nav_path = os.path.normpath(os.path.join(epub_unzipped, epub_fixed.nav_path()))

    mathML_validation_result = True
    mathml_error_count = 0
    mathml_errors_not_shown = 0
    mathml_report_errors_max = 10
    for root, dirs, files in os.walk(epub_unzipped):
        for f in files:
            file = os.path.join(root, f)
            # Skip non-XHTML files and the navigation document. This must
            # be an equality test: "is" compares object identity and never
            # matched, so the navigation document was always validated.
            if not file.endswith(".xhtml") or os.path.normpath(file) == nav_path:
                continue
            self.utils.report.info("Checking MathML in " + file)
            mathml_validation = Mathml_validator(
                self, source=file, report_errors_max=mathml_report_errors_max)
            if not mathml_validation.success:
                mathml_error_count += mathml_validation.error_count
                mathml_errors_not_shown += max(
                    (mathml_validation.error_count - mathml_report_errors_max), 0)
                if mathml_error_count > mathml_report_errors_max:
                    # don't put any more errors for the other HTML documents in the main report
                    mathml_report_errors_max = 0
                mathML_validation_result = False
    if mathml_errors_not_shown > 0:
        self.utils.report.error(
            "{} additional MathML errors not shown in the main report. Check the log for details."
            .format(mathml_errors_not_shown))
    if mathML_validation_result is False:
        return False

    self.utils.report.debug(
        "Making sure that the EPUB has the correct file and directory permissions…")
    epub_fixed.fix_permissions()

    # The ACE report is informational only: any failure is logged as a
    # warning and the book is stored regardless.
    try:
        self.utils.report.info("Genererer ACE-rapport...")
        ace_dir = os.path.join(self.utils.report.reportDir(), "accessibility-report")
        process = self.utils.filesystem.run(
            [IncomingNordic.ace_cli, "-o", ace_dir, epub_fixed.asFile()])
        if process.returncode == 0:
            self.utils.report.info("ACE-rapporten ble generert.")
        else:
            self.utils.report.warn(
                "En feil oppstod ved produksjon av ACE-rapporten for " + epub.identifier())
            self.utils.report.debug(traceback.format_stack())

        # attach report
        ace_status = None
        with open(os.path.join(ace_dir, "report.json")) as json_report:
            ace_status = json.load(json_report)["earl:result"]["earl:outcome"]
        if ace_status == "pass":
            ace_status = "SUCCESS"
        else:
            ace_status = "WARN"
        self.utils.report.attachment(None, os.path.join(ace_dir, "report.html"), ace_status)

    except subprocess.TimeoutExpired:
        self.utils.report.warn(
            "Det tok for lang tid å lage ACE-rapporten for " + epub.identifier() +
            ", og prosessen ble derfor stoppet.")

    except Exception:
        self.utils.report.warn(
            "En feil oppstod ved produksjon av ACE-rapporten for " + epub.identifier())
        self.utils.report.debug(traceback.format_exc(), preformatted=True)

    self.utils.report.info("Boken er valid. Kopierer til EPUB master-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(epub_fixed.asDir(), epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
    self.utils.filesystem.deleteSource()
    return True
def on_book(self):
    """Validate an incoming NLBPUB and archive it depending on warnings.

    The EPUB is copied to a temporary directory and its content document,
    navigation document and package file are validated with Schematron and
    RelaxNG. Unless ``self.skip_warning`` is set, an additional Schematron
    ("need for manual intervention") decides what happens to a valid book:

    * warnings found: only the pipeline with uid "NLBPUB-incoming-warning"
      stores the book (and enables e-mail/Slack notification)
    * no warnings: only the pipeline with uid "NLBPUB-incoming-validator"
      stores the book and deletes the source

    With ``self.skip_warning`` set, every valid book is stored.

    Returns:
        True when the book is valid, False when a mandatory file is missing
        or validation fails, None when the source is not an identifiable
        EPUB (bare ``return``, as before).
    """
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # Best effort: the title is only decoration for the report title.
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    self.utils.report.should_email = self.should_email_default
    self.utils.report.should_message_slack = self.should_message_slack

    self.utils.report.info("Lager kopi av EPUB...")
    nordic_epubdir_obj = tempfile.TemporaryDirectory()
    nordic_epubdir = nordic_epubdir_obj.name
    # Same report object as everywhere else in this method (the original
    # reached it through self.pipeline here only).
    Filesystem.copy(self.utils.report, epub.asDir(), nordic_epubdir)
    nordic_epub = Epub(self.utils.report, nordic_epubdir)

    html_file = os.path.join(nordic_epubdir, "EPUB", nordic_epub.identifier() + ".xhtml")
    nav_file = os.path.join(nordic_epubdir, "EPUB", "nav" + ".xhtml")
    package_file = os.path.join(nordic_epubdir, "EPUB", "package" + ".opf")
    nlbpub_files = [html_file, nav_file, package_file]

    # Report every missing mandatory file, then stop: running the
    # validators on non-existent files only adds misleading errors.
    missing_files = [path for path in nlbpub_files if not os.path.isfile(path)]
    for missing in missing_files:
        self.utils.report.error(missing + " Not found. This is not a valid NLBPUB")
    if missing_files:
        return False

    self.utils.report.info("Validerer NLBPUB")
    schematron_files = ["nordic2015-1.sch", "nordic2015-1.nav-references.sch", "nordic2015-1.opf.sch"]
    rng_files = "nordic-html5.rng"

    html_sch = Schematron(self,
                          schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", schematron_files[0]),
                          source=html_file)
    nav_sch = Schematron(self,
                         schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", schematron_files[1]),
                         source=nav_file)
    opf_sch = Schematron(self,
                         schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", schematron_files[2]),
                         source=package_file)
    # Run up front, before the validity checks; its result is only
    # consulted further down when warnings are not skipped.
    warning_sch = Schematron(self,
                             schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB",
                                                     "nlbpub-check-need-for-manual-intervention.sch"),
                             source=html_file)
    schematron_list = [html_sch, nav_sch, opf_sch]
    html_relax = Relaxng(self,
                         relaxng=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", rng_files),
                         source=html_file)

    for validation, schematron_name in zip(schematron_list, schematron_files):
        if not validation.success:
            self.utils.report.error("Validering av NLBPUB feilet etter schematron: " + schematron_name)
            return False

    if not html_relax.success:
        self.utils.report.error("Validering av NLBPUB feilet etter RELAXNG: " + rng_files)
        return False

    self.utils.report.info("Boken er valid.")

    if not self.skip_warning:
        if warning_sch.success is False:
            # Valid, but needs manual inspection
            if self.uid == "NLBPUB-incoming-warning":
                archived_path, stored = self.utils.filesystem.storeBook(nordic_epubdir, epub.identifier())
                self.utils.report.attachment(None, archived_path, "DEBUG")
                self.utils.report.title = self.title + ": " + epub.identifier() + " er valid, men må sjekkes manuelt 👍😄" + epubTitle
                self.utils.report.should_email = True
                self.utils.report.should_message_slack = True
                return True
            else:
                self.utils.report.should_email = False
                self.utils.report.should_message_slack = False
                self.utils.report.title = self.title + ": " + epub.identifier() + " er valid, men må sjekkes manuelt 👍😄" + epubTitle
                return True
        else:
            # Valid and without warnings
            if self.uid == "NLBPUB-incoming-validator":
                archived_path, stored = self.utils.filesystem.storeBook(nordic_epubdir, epub.identifier())
                self.utils.report.attachment(None, archived_path, "DEBUG")
                self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
                self.utils.filesystem.deleteSource()
                return True
            else:
                self.utils.report.info(epub.identifier() + " er valid og har ingen advarsler.")
                return True

    # Only reached when warnings are skipped: store every valid book.
    archived_path, stored = self.utils.filesystem.storeBook(nordic_epubdir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
    return True
def on_book(self):
    """Extract back-cover audio and an audio excerpt from a DAISY 2.02 book.

    The book is copied to a temporary directory and ``ncc.html`` is parsed.

    * Back cover: the NCC entry whose text is 'Bokomtale', 'Baksidetekst'
      or 'Omslagstekst' points to a SMIL file; the referenced clip is cut
      out of the corresponding mp3.
    * Excerpt ("abstract"): starting at the middle of the NCC references,
      the first segment longer than 50 seconds is used (capped at 75
      seconds). If that yields less than 20 seconds, an mp3 file from the
      book folder is used instead.
    * Test audio: a copy of the back cover if available, otherwise of the
      excerpt.

    Every file that was produced is stored under the edition identifier
    (first six characters of dc:identifier) and, if different, under the
    full identifier.

    Returns:
        True when at least one of back cover or excerpt exists afterwards,
        otherwise False.
    """
    clip_begin_xpath = ("substring-before(substring-after(((//par[@id='{0}' or text/@id='{0}']//audio)[1]"
                        "/@clip-begin),'='),'s')")
    clip_end_xpath = ("substring-before(substring-after(((//par[@id='{0}' or text/@id='{0}']//audio)[last()]"
                      "/@clip-end),'='),'s')")

    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    temp_absdir_obj = tempfile.TemporaryDirectory()
    temp_absdir = temp_absdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_absdir)

    file_exists = {
        "abstracts": False,
        "back-cover": False,
        "test-audio": False
    }
    back_cover_mp3 = os.path.join(temp_absdir, self.parentdirs["back-cover"] + ".mp3")
    abstract_mp3 = os.path.join(temp_absdir, self.parentdirs["abstracts"] + ".mp3")
    test_audio_mp3 = os.path.join(temp_absdir, self.parentdirs["test-audio"] + ".mp3")

    if not os.path.isfile(os.path.join(temp_absdir, "ncc.html")):
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎. Er dette en daisy 2.02 lydbok med en ncc.html fil?"
        return False

    try:
        nccdoc = ElementTree.parse(os.path.join(temp_absdir, "ncc.html")).getroot()
    except Exception:
        self.utils.report.info("Klarte ikke lese ncc fila. Sjekk loggen for detaljer.")
        self.utils.report.debug(traceback.format_exc(), preformatted=True)
        # Nothing below works without the NCC document. Previously the
        # method carried on and failed on the unbound variable.
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    audio_title = " (" + nccdoc.xpath("string(//*[@name='dc:title']/@content)") + ") "
    issue_identifier = nccdoc.xpath("string(//*[@name='dc:identifier']/@content)")
    edition_identifier = issue_identifier[0:6]

    if edition_identifier == "":
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + "Lydbok feilet 😭👎"
        return False

    try:
        smilFile = nccdoc.xpath(
            "substring-before(//*[text()='Bokomtale' or text()='Baksidetekst' or text()='Omslagstekst']/@href,'#')")
        smilFile_Id = nccdoc.xpath(
            "substring-after(//*[text()='Bokomtale' or text()='Baksidetekst' or text()='Omslagstekst']/@href,'#')")
    except Exception:
        self.utils.report.debug(traceback.format_exc(), preformatted=True)
        self.utils.report.error(
            "Det oppstod en feil for" + edition_identifier +
            " under lasting av smilfilene. Sjekk loggen for detaljer.")
        return False

    # Back-cover
    if smilFile != "":
        try:
            smildoc = ElementTree.parse(os.path.join(temp_absdir, smilFile)).getroot()
            mp3File = smildoc.xpath("string((//audio/@src)[1])")
            mp3File_start = smildoc.xpath(clip_begin_xpath.format(smilFile_Id))
            mp3File_end = smildoc.xpath(clip_end_xpath.format(smilFile_Id))
            if mp3File_start == mp3File_end:
                self.utils.report.info(
                    "Klarte ikke å bestemme start-/slutt-tid for baksidetekst")

            # Creates audio segment in milliseconds from start to end of the abstract file
            mp3 = AudioSegment.from_mp3(os.path.join(temp_absdir, mp3File))
            new_mp3 = mp3[float(mp3File_start) * 1000:float(mp3File_end) * 1000]
            new_mp3.export(back_cover_mp3)
            self.utils.report.info("Baksidetekst eksportert fra: " + mp3File)
            file_exists["back-cover"] = True
        except Exception:
            self.utils.report.debug(traceback.format_exc(), preformatted=True)
            self.utils.report.info(
                "Klarte ikke hente ut baksidetekst for " + edition_identifier +
                " sjekk loggen for detaljer.")
    else:
        self.utils.report.info("Baksidetekst ikke funnet for " + edition_identifier)

    # creates abstract from ncc --> smil --> mp3
    several_smilFiles = []
    several_smilFiles_id = []
    number_of_smilfiles = 0  # stays 0 (search loop is skipped) if the NCC cannot be queried
    try:
        number_of_smilfiles = int(nccdoc.xpath("count(//@href)"))
        for i in range(number_of_smilfiles):
            several_smilFiles.append(
                nccdoc.xpath("substring-before((//@href)[{0}],'#')".format(i + 1)))
            several_smilFiles_id.append(
                nccdoc.xpath("substring-after((//@href)[{0}],'#')".format(i + 1)))
    except Exception:
        self.utils.report.info(traceback.format_exc(), preformatted=True)
        self.utils.report.info("Klarte ikke hente ut .smil filene for " + edition_identifier + audio_title)

    timeout = time.time() + 60 * 2
    duration = 0
    num = 0
    # Result of the search; initialised so that later code never touches
    # an unbound name.
    mp3File_abstract = None
    mp3File_abstract_start = 0.0
    mp3File_abstract_end = 0.0

    try:
        # Walk forward from the middle of the book (at most to 90 %) until
        # a segment longer than 50 seconds is found or two minutes passed.
        while (duration <= 50 and time.time() < timeout
               and int(number_of_smilfiles / 2 + num) < int(number_of_smilfiles * 0.9)):
            index = int(number_of_smilfiles * 0.5 + num)
            smilFile_abstract = several_smilFiles[index]
            smilFile_abstract_id = several_smilFiles_id[index]
            smildoc_abstract = ElementTree.parse(
                os.path.join(temp_absdir, smilFile_abstract)).getroot()
            segment_start = float(
                smildoc_abstract.xpath(clip_begin_xpath.format(smilFile_abstract_id)))
            # If the next reference is in the same SMIL file, extend the
            # segment to the end of that next entry.
            if smilFile_abstract == several_smilFiles[index + 1]:
                smilFile_abstract_id = several_smilFiles_id[index + 1]
            segment_end = float(
                smildoc_abstract.xpath(clip_end_xpath.format(smilFile_abstract_id)))
            segment_file = smildoc_abstract.xpath("string((//audio/@src)[1])")

            # Commit all values together so that a failure in a later
            # iteration cannot leave start/end/file from different segments.
            mp3File_abstract = segment_file
            mp3File_abstract_start = segment_start
            mp3File_abstract_end = segment_end
            duration = segment_end - segment_start
            num = num + 1
    except Exception:
        self.utils.report.info(traceback.format_exc(), preformatted=True)
        self.utils.report.info("Lydutdrag fra smilfiler feilet.")

    if duration >= 75:
        mp3File_abstract_end = mp3File_abstract_start + 75

    # As a last resort, just use an mp3 of sufficient length
    if duration < 20:
        try:
            for item in os.listdir(temp_absdir):
                if item.endswith(".mp3"):
                    try_mp3 = AudioSegment.from_mp3(os.path.join(temp_absdir, item))
                    if len(try_mp3) / 1000 > duration:
                        mp3File_abstract = item
                        mp3File_abstract_start = 0
                        mp3File_abstract_end = len(try_mp3) / 1000
                        duration = mp3File_abstract_end
                        if duration > 75:
                            # Long enough: cap at 75 seconds and stop looking
                            mp3File_abstract_start = 0.0
                            mp3File_abstract_end = 75.0
                            break
        except Exception:
            self.utils.report.debug(traceback.format_exc(), preformatted=True)
            self.utils.report.info(
                "Klarte ikke hente ut lydutdrag basert på mp3 filene i mappa. Sjekk loggen for detaljer.")

    # Export abstract
    try:
        # If no candidate was found mp3File_abstract is None and the join
        # below fails, which is reported through the except branch.
        mp3_abstract = AudioSegment.from_mp3(os.path.join(temp_absdir, mp3File_abstract))
        new_mp3_abstract = mp3_abstract[mp3File_abstract_start * 1000:mp3File_abstract_end * 1000]
        final_mp3 = new_mp3_abstract.fade_out(3000)
        final_mp3.export(abstract_mp3)
        self.utils.report.info("Lydutdrag eksportert fra: " + mp3File_abstract)
        file_exists["abstracts"] = True
    except Exception:
        self.utils.report.info(traceback.format_exc(), preformatted=True)
        self.utils.report.error(
            "Klarte ikke eksportere excerpt.mp3. Har du ffmpeg kodeken for .mp3 filer?")

    # Copies abstract and back cover to dir_out
    if os.path.isfile(back_cover_mp3) or os.path.isfile(abstract_mp3):
        if file_exists["back-cover"]:
            shutil.copy(back_cover_mp3, test_audio_mp3)
            file_exists["test-audio"] = True
            # file_exists, not self.parentdirs: the latter holds names and
            # is always truthy, which made this message unconditional.
            if file_exists["abstracts"]:
                self.utils.report.info(
                    "Baksidetekst og lydutdrag funnet. Kopierer til {}.mp3".format(
                        self.parentdirs["test-audio"]))
            else:
                self.utils.report.info(
                    "Baksidetekst funnet. Kopierer til {}.mp3".format(
                        self.parentdirs["test-audio"]))
        elif file_exists["abstracts"]:
            shutil.copy(abstract_mp3, test_audio_mp3)
            file_exists["test-audio"] = True
            self.utils.report.info("Lydutdrag funnet. Kopierer til " + self.parentdirs["test-audio"])

        for key in self.parentdirs:
            if file_exists[key]:
                produced_file = os.path.join(temp_absdir, self.parentdirs[key] + ".mp3")
                archived_path, stored = self.utils.filesystem.storeBook(
                    produced_file,
                    edition_identifier,
                    parentdir=self.parentdirs[key],
                    file_extension="mp3")
                if edition_identifier != issue_identifier:
                    archived_path, stored = self.utils.filesystem.storeBook(
                        produced_file,
                        issue_identifier,
                        parentdir=self.parentdirs[key],
                        file_extension="mp3")
                self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + edition_identifier + " lydutdrag ble eksportert 👍😄" + audio_title
    else:
        self.utils.report.title = (
            "Klarte ikke hente ut hverken baksidetekst eller lydutdrag 😭👎. ") + audio_title
        return False
    return True
def plot(self, uids, name):
    """Render a graph of the given pipelines and their directories.

    For every uid that matches an entry in ``self.pipelines`` a box node is
    drawn for the (group) pipeline, plus folder nodes for its input and
    output directories, connected by edges. Nodes are grouped into one
    cluster per rank in ``Directory.dirs_ranked``. The graph is rendered as
    ``<name>_.png`` in ``self.report_dir`` and then copied to
    ``<name>.png``; a ``<name>.js`` file with the current timestamp is
    written alongside it, and ``<name>.html`` is created from the dashboard
    template if it does not exist yet.

    Args:
        uids: uids of the pipelines to include in the plot.
        name: base file name (without extension) of the generated files.

    Raises:
        Exception: whatever the final copy/write attempt raised, if all
            ten attempts fail.
    """
    dot = Digraph(name="Produksjonssystem", format="png")
    dot.graph_attr["bgcolor"] = "transparent"

    # one list of node ids per rank; each rank becomes a cluster further down
    node_ranks = {}
    for rank in Directory.dirs_ranked:
        node_ranks[rank["id"]] = []

    # remember edges so that we don't plot them twice
    edges = {}

    for uid in uids:
        # each entry in self.pipelines is indexed as:
        # [0] pipeline object, [1] input dir node id, [2] output dir node id
        pipeline = None
        for p in self.pipelines:
            if p[0].uid == uid:
                pipeline = p
                break
        if not pipeline:
            continue

        main_pipeline = pipeline[0]
        group_pipeline = main_pipeline.get_current_group_pipeline()

        title = group_pipeline.get_group_title()
        pipeline_id = group_pipeline.get_group_id()  # re.sub(r"[^a-z\d]", "", title.lower())

        queue = group_pipeline.get_queue()

        # classify every queued book once instead of once per event type
        events = [Pipeline.get_main_event(book) for book in queue] if queue else []
        queue_autotriggered = events.count("autotriggered")
        queue_parts = []
        for label, event in (("nye", "created"),
                             ("endret", "modified"),
                             ("slettet", "deleted"),
                             ("trigget", "triggered"),
                             ("autotrigget", "autotriggered")):
            count = events.count(event)
            if count:
                queue_parts.append(label + ":" + str(count))
        queue_string = ", ".join(queue_parts)

        queue_size = 0
        if queue:
            queue_size = len(queue)
            if not group_pipeline.should_handle_autotriggered_books():
                queue_size -= queue_autotriggered
        book = Metadata.pipeline_book_shortname(group_pipeline)

        # ---------- input directory ----------
        relpath_in = None
        netpath_in = ""
        rank_in = None
        if main_pipeline.dir_in:
            for rank in Directory.dirs_ranked:
                for dir_id in rank["dirs"]:
                    if os.path.normpath(main_pipeline.dir_in) == os.path.normpath(rank["dirs"][dir_id]):
                        rank_in = rank["id"]
                        break
        if main_pipeline.dir_in and not main_pipeline.dir_base:
            relpath_in = os.path.basename(os.path.dirname(main_pipeline.dir_in))
        elif main_pipeline.dir_in and main_pipeline.dir_base:
            base_path = Filesystem.get_base_path(main_pipeline.dir_in, main_pipeline.dir_base)
            relpath_in = os.path.relpath(main_pipeline.dir_in, base_path)
            if "master" in main_pipeline.dir_base and main_pipeline.dir_base["master"] == base_path:
                pass
            else:
                if main_pipeline.dir_in not in self.buffered_network_paths:
                    smb, _file, unc = Filesystem.networkpath(main_pipeline.dir_in)
                    host = Filesystem.get_host_from_url(smb)
                    # NOTE(review): the input side caches the smb value while the
                    # output side below caches the unc value in the same dict —
                    # confirm which one is intended.
                    self.buffered_network_paths[main_pipeline.dir_in] = smb
                    self.buffered_network_hosts[main_pipeline.dir_in] = host
                netpath_in = self.buffered_network_hosts[main_pipeline.dir_in]
                if not netpath_in:
                    netpath_in = self.buffered_network_paths[main_pipeline.dir_in]
        book_count_in = self.get_book_count(main_pipeline.dir_in)
        label_in = "< <font point-size='24'>{}</font>{}{} >".format(
            relpath_in,
            "\n<br/><i><font point-size='20'>{} {}</font></i>".format(
                book_count_in, "bok" if book_count_in == 1 else "bøker"),
            "\n<br/><i><font point-size='20'>{}</font></i>".format(
                netpath_in.replace("\\", "\\\\")) if netpath_in else "")

        # ---------- output directory ----------
        relpath_out = None
        netpath_out = ""
        rank_out = None
        if main_pipeline.dir_out:
            for rank in Directory.dirs_ranked:
                for dir_id in rank["dirs"]:
                    if os.path.normpath(main_pipeline.dir_out) == os.path.normpath(rank["dirs"][dir_id]):
                        rank_out = rank["id"]
                        break
        if main_pipeline.dir_out and not main_pipeline.dir_base:
            relpath_out = os.path.basename(os.path.dirname(main_pipeline.dir_out))
        elif main_pipeline.dir_out and main_pipeline.dir_base:
            base_path = Filesystem.get_base_path(main_pipeline.dir_out, main_pipeline.dir_base)
            relpath_out = os.path.relpath(main_pipeline.dir_out, base_path)
            if "master" in main_pipeline.dir_base and main_pipeline.dir_base["master"] == base_path:
                pass
            else:
                if main_pipeline.dir_out not in self.buffered_network_paths:
                    smb, _file, unc = Filesystem.networkpath(main_pipeline.dir_out)
                    host = Filesystem.get_host_from_url(smb)
                    self.buffered_network_paths[main_pipeline.dir_out] = unc
                    self.buffered_network_hosts[main_pipeline.dir_out] = host
                netpath_out = self.buffered_network_hosts[main_pipeline.dir_out]
                if not netpath_out:
                    netpath_out = self.buffered_network_paths[main_pipeline.dir_out]
        book_count_out = self.get_book_count(main_pipeline.dir_out, main_pipeline.parentdirs)
        label_out = "< <font point-size='24'>{}</font>{}{} >".format(
            relpath_out,
            "\n<br/><i><font point-size='20'>{} {}</font></i>".format(
                book_count_out, "bok" if book_count_out == 1 else "bøker"),
            "\n<br/><i><font point-size='20'>{}</font></i>".format(
                netpath_out.replace("\\", "\\\\")) if netpath_out else "")

        # place the pipeline node in the rank of its output directory, or
        # else in the rank following the one of its input directory
        if rank_out:
            node_ranks[rank_out].append(pipeline_id)
        elif rank_in:
            next_rank = self.next_rank(rank_in)
            if next_rank:
                node_ranks[next_rank].append(pipeline_id)
            else:
                node_ranks[rank_in].append(pipeline_id)

        state = group_pipeline.get_state()
        status = group_pipeline.get_status()
        progress_text = group_pipeline.get_progress()
        pipeline_label = "< <font point-size='26'>{}</font>{} >".format(
            title,
            "".join(["\n<br/><i><font point-size='22'>{}</font></i>".format(val)
                     for val in [queue_string, progress_text, status] if val]))

        fillcolor = "lightskyblue1"
        if book or queue_size:
            fillcolor = "lightslateblue"
        elif state == "considering":
            fillcolor = "lightskyblue3"
        elif not group_pipeline.running:
            fillcolor = "white"
        elif isinstance(group_pipeline, DummyPipeline):
            fillcolor = "snow"
        dot.attr("node", shape="box", style="filled", fillcolor=fillcolor)
        dot.node(pipeline_id, pipeline_label.replace("\\", "\\\\"))

        if relpath_in:
            fillcolor = "wheat"
            if not main_pipeline.dir_in_obj or not main_pipeline.dir_in_obj.is_available():
                fillcolor = "white"
            dot.attr("node", shape="folder", style="filled", fillcolor=fillcolor)
            dot.node(pipeline[1], label_in)
            if pipeline[1] not in edges:
                edges[pipeline[1]] = []
            if pipeline_id not in edges[pipeline[1]]:
                edges[pipeline[1]].append(pipeline_id)
                dot.edge(pipeline[1], pipeline_id)
            # rank_in stays None when the directory is not listed in
            # Directory.dirs_ranked; skip clustering instead of raising KeyError
            if rank_in in node_ranks:
                node_ranks[rank_in].append(pipeline[1])

        if relpath_out:
            fillcolor = "wheat"
            if not main_pipeline.dir_out_obj or not main_pipeline.dir_out_obj.is_available():
                fillcolor = "white"
            dot.attr("node", shape="folder", style="filled", fillcolor=fillcolor)
            dot.node(pipeline[2], label_out)
            if pipeline_id not in edges:
                edges[pipeline_id] = []
            if pipeline[2] not in edges[pipeline_id]:
                edges[pipeline_id].append(pipeline[2])
                dot.edge(pipeline_id, pipeline[2])
            # same guard as for the input directory
            if rank_out in node_ranks:
                node_ranks[rank_out].append(pipeline[2])

    # one dotted cluster per rank, labelled with the rank name
    for rank in node_ranks:
        subgraph = Digraph("cluster_" + rank, graph_attr={"style": "dotted"})
        subgraph.graph_attr["bgcolor"] = "#FFFFFFAA"

        if node_ranks[rank]:
            subgraph.attr("node", shape="none", style="filled", fillcolor="transparent")
            subgraph.node(
                "_ranklabel_" + rank,
                "< <i><font point-size='28'>{}</font></i> >".format(
                    " <br/>".join(str(self.rank_name(rank)).split(" "))))

        for node_id in node_ranks[rank]:
            subgraph.node(node_id)

        dot.subgraph(subgraph)

    dot.render(os.path.join(self.report_dir, name + "_"))

    # there seems to be some race condition when doing this across a mounted network drive,
    # so if we get an exception we retry a few times and hope that it works.
    # see: https://github.com/nlbdev/produksjonssystem/issues/81
    for attempts_left in reversed(range(10)):
        try:
            shutil.copyfile(os.path.join(self.report_dir, name + "_.png"),
                            os.path.join(self.report_dir, name + ".png"))
            with open(os.path.join(self.report_dir, name + ".js"), "w") as javascript_file:
                javascript_file.write("setTime(\"{}\");".format(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            break
        except Exception:
            logging.debug(" Unable to copy plot image: {}".format(
                os.path.join(self.report_dir, name + "_.png")))
            if attempts_left == 0:
                # out of retries: propagate the last failure without waiting
                raise
            time.sleep(0.5)

    dashboard_file = os.path.join(self.report_dir, name + ".html")
    if not os.path.isfile(dashboard_file):
        dashboard_template = os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../dashboard.html'))
        if not os.path.exists(self.report_dir):
            os.makedirs(self.report_dir)
        shutil.copyfile(dashboard_template, dashboard_file)
def on_book(self):
    """Convert the incoming EPUB into a braille-ready HTML fileset.

    The book is copied to a temporary directory, the content document
    referenced by the first spine item of the OPF is transformed with
    ``prepare-for-braille.xsl`` and renamed to ``<identifier>.html`` (the
    identifier is read from the ``dc:identifier`` meta element of the
    transformed HTML). Navigation, audio and SMIL manifest items and the
    OPF itself are then deleted, and the directory that contained the OPF
    is stored with ``self.utils.filesystem.storeBook``.

    Returns:
        bool: True when the fileset was stored, False on any failure.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    # the title is only used to decorate the report title, so a missing
    # or unreadable dc:title is deliberately ignored
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        # explicit False, consistent with every other failure path
        return False

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    # ---------- make a copy of the EPUB ----------
    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    temp_epub = Epub(self.utils.report, temp_epubdir)

    # ---------- adapt the HTML file using XSLT ----------
    opf_path = temp_epub.opf_path()
    if not opf_path:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    opf_path = os.path.join(temp_epubdir, opf_path)
    opf_dir = os.path.dirname(opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()

    # href of the manifest item referenced by the first spine item
    html_file = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href")
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    html_file = os.path.join(opf_dir, html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    temp_html_obj = tempfile.NamedTemporaryFile()
    temp_html = temp_html_obj.name

    self.utils.report.info("Tilpasser innhold for punktskrift...")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForBraille.uid, "prepare-for-braille.xsl"),
                source=html_file,
                target=temp_html)
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_html, html_file)

    # ---------- get the new book number from /html/head/meta[@name='dc:identifier'] and use it as file name ----------
    html_xml = ElementTree.parse(temp_html).getroot()
    result_identifier = html_xml.xpath("/*/*[local-name()='head']/*[@name='dc:identifier']")
    if result_identifier and "content" in result_identifier[0].attrib:
        result_identifier = result_identifier[0].attrib["content"]
    else:
        result_identifier = None
    if not result_identifier:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne boknummer i ny HTML-fil.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    # temp_html still holds the transformed document (html_file is a copy
    # of it), so the old file can simply be replaced by the renamed one
    os.remove(html_file)
    # Use html instead of xhtml when it is not an EPUB
    html_file = os.path.join(os.path.dirname(html_file), result_identifier + ".html")
    shutil.copy(temp_html, html_file)
    # TODO: insert HTML5 doctype: <!DOCTYPE html>

    # ---------- delete EPUB-specific files ----------
    for item in opf_xml.xpath("/*/*[local-name()='manifest']/*"):
        delete = False

        if "properties" in item.attrib and "nav" in re.split(r'\s+', item.attrib["properties"]):
            delete = True

        if "media-type" in item.attrib:
            if item.attrib["media-type"].startswith("audio/"):
                delete = True
            elif item.attrib["media-type"] == "application/smil+xml":
                delete = True

        if not delete or "href" not in item.attrib:
            continue

        fullpath = os.path.join(opf_dir, item.attrib["href"])
        # a manifest entry may point to a file that is not in the fileset;
        # that must not abort the conversion
        if os.path.isfile(fullpath):
            os.remove(fullpath)
    os.remove(opf_path)

    # ---------- store the HTML fileset ----------
    html_dir = opf_dir
    self.utils.report.info("Boken ble konvertert. Kopierer til arkiv for punkt-klare HTML-filer.")
    archived_path, stored = self.utils.filesystem.storeBook(html_dir, self.book["name"])
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + self.book["name"] + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Insert format-specific metadata into a copy of the EPUB and store it.

    The book must be an EPUB whose ``dc:identifier`` equals the part of the
    file name before the first dot. ``Metadata.should_produce`` decides
    whether the book is to be produced in ``self.publication_format``; if
    so, a temporary copy is updated by ``Metadata.insert_metadata`` and
    stored under the EPUB identifier.

    Returns:
        bool: True when the book was stored, and also when it should not
        be produced in this format; False when the input or its metadata
        is invalid.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    # the title is only used to decorate the report title, so a missing
    # or unreadable dc:title is deliberately ignored
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    # check that this is an EPUB (we only insert metadata into EPUBs)
    if not epub.isepub():
        return False

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        return False

    if epub.identifier() != self.book["name"].split(".")[0]:
        self.utils.report.error(
            self.book["name"] + ": Filnavn stemmer ikke overens med dc:identifier: {}".format(
                epub.identifier()))
        return False

    should_produce, metadata_valid = Metadata.should_produce(
        epub.identifier(), self.publication_format, report=self.utils.report)
    if not metadata_valid:
        self.utils.report.info(
            "{} har feil i metadata for {}. Avbryter.".format(
                epub.identifier(), self.publication_format))
        self.utils.report.title = "{}: {} har feil i metadata for {} 😭👎 {}".format(
            self.title, epub.identifier(), self.publication_format, epubTitle)
        return False
    if not should_produce:
        # not an error: the book is simply not wanted in this format
        self.utils.report.info(
            "{} skal ikke produseres som {}. Avbryter.".format(
                epub.identifier(), self.publication_format))
        self.utils.report.title = "{}: {} Skal ikke produseres som {} 🤷 {}".format(
            self.title, epub.identifier(), self.publication_format, epubTitle)
        return True

    self.utils.report.info("Lager en kopi av EPUBen")
    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    temp_epub = Epub(self.utils.report, temp_epubdir)

    is_valid = Metadata.insert_metadata(
        self.utils.report,
        temp_epub,
        publication_format=self.publication_format,
        report_metadata_errors=False)
    if not is_valid:
        self.utils.report.error("Bibliofil-metadata var ikke valide. Avbryter.")
        return False

    self.utils.report.info(
        "Boken ble oppdatert med format-spesifikk metadata. Kopierer til {}-arkiv."
        .format(self.publication_format))

    archived_path, stored = self.utils.filesystem.storeBook(
        temp_epub.asDir(), epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = "{}: {} har fått {}-spesifikk metadata og er klar til å produseres 👍😄 {}".format(
        self.title, epub.identifier(), self.publication_format, temp_epub.meta("dc:title"))
    return True
def on_book(self):
    """Prepare the incoming EPUB for DOCX conversion and store the result.

    The book is copied to a temporary directory, the content document
    referenced by the first spine item of the OPF is transformed in place
    with ``prepare-for-docx.xsl``, and the whole temporary copy is stored
    under the EPUB identifier.

    Returns:
        bool: True when the book was stored, False on any failure.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    # the title is only used to decorate the report title, so a missing
    # or unreadable dc:title is deliberately ignored
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    # ---------- make a copy of the EPUB ----------
    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    # Epub takes the report object as its first argument, exactly as in the
    # construction of `epub` above (the pipeline itself was passed before)
    temp_epub = Epub(self.utils.report, temp_epubdir)

    # ---------- adapt the HTML file using XSLT ----------
    opf_path = temp_epub.opf_path()
    if not opf_path:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    opf_path = os.path.join(temp_epubdir, opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()

    # href of the manifest item referenced by the first spine item
    html_file = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href")
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    html_dir = os.path.dirname(opf_path)
    html_file = os.path.join(html_dir, html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    temp_html_obj = tempfile.NamedTemporaryFile()
    temp_html = temp_html_obj.name

    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForDocx.uid, "prepare-for-docx.xsl"),
                source=html_file,
                target=temp_html)
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    # replace the original content document with the transformed one
    shutil.copy(temp_html, html_file)

    archived_path, stored = self.utils.filesystem.storeBook(temp_epubdir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Convert a Nordic DTBook into a Nordic EPUB 3 and store it.

    Steps, each aborting with False on failure:

    1. copy the book to a temporary directory, locate the DTBook XML and
       rename all files to lower case;
    2. clean up the DTBook with ``nordic-cleanup-dtbook.xsl`` and validate
       it with the ``nordic-dtbook-validate`` job (a fixed set of
       validation messages is tolerated);
    3. convert it with ``nordic-dtbook-to-html``, clean up the HTML with
       ``nordic-cleanup-html.xsl``, and convert that with
       ``nordic-html-to-epub3``;
    4. validate the EPUB with ``nordic-epub3-validate`` and store it under
       the numeric book identifier.

    Returns:
        bool: True when the EPUB was stored, False otherwise.
    """
    # Pipeline 2 engine/script version pairs passed to every job below
    dp2_versions = (
        ("1.13.6", "1.4.6"),
        ("1.13.4", "1.4.5"),
        ("1.12.1", "1.4.2"),
        ("1.11.1-SNAPSHOT", "1.3.0"),
    )

    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    metadata = Metadata.get_metadata_from_book(self.utils.report, self.book["source"])
    # keep only the digits of the identifier
    metadata["identifier"] = re.sub(r"[^\d]", "", metadata["identifier"])
    if not metadata["identifier"]:
        self.utils.report.error(
            "Klarte ikke å bestemme boknummer for {}".format(self.book["name"]))
        return False
    if metadata["identifier"] != self.book["name"]:
        self.utils.report.info("Boknummer for {} er: {}".format(
            self.book["name"], metadata["identifier"]))

    self.utils.report.info("Lager en kopi av DTBoken")
    temp_dtbookdir_obj = tempfile.TemporaryDirectory()
    temp_dtbookdir = temp_dtbookdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_dtbookdir)

    # find DTBook XML
    dtbook = None
    for root, dirs, files in os.walk(temp_dtbookdir):
        for f in files:
            if f.endswith(".xml"):
                xml = ElementTree.parse(os.path.join(root, f)).getroot()
                if xml.xpath("namespace-uri()") == "http://www.daisy.org/z3986/2005/dtbook/":
                    dtbook = os.path.join(root, f)
                    break
        if dtbook is not None:
            break
    if not dtbook:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne DTBook")
        return False

    # rename all files to lower case
    for root, dirs, files in os.walk(temp_dtbookdir):
        for f in files:
            if f.lower() != f:
                self.utils.report.warn("renaming to lowercase: {}".format(f))
                old_path = os.path.join(root, f)
                new_path = os.path.join(root, f.lower())
                shutil.move(old_path, new_path)
                if old_path == dtbook:
                    # keep pointing at the DTBook after it has been renamed
                    dtbook = new_path

    temp_dtbook_file_obj = tempfile.NamedTemporaryFile()
    temp_dtbook_file = temp_dtbook_file_obj.name

    self.utils.report.info("Rydder opp i nordisk DTBook")
    xslt = Xslt(self,
                stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir, NordicDTBookToEpub.uid,
                                        "nordic-cleanup-dtbook.xsl"),
                source=dtbook,
                target=temp_dtbook_file)
    if not xslt.success:
        return False
    shutil.copy(temp_dtbook_file, dtbook)

    self.utils.report.info("Validerer Nordisk DTBook...")

    # create context for Pipeline 2 job
    dtbook_dir = os.path.dirname(dtbook)
    dtbook_context = {}
    for root, dirs, files in os.walk(dtbook_dir):
        for filename in files:
            fullpath = os.path.join(root, filename)
            relpath = os.path.relpath(fullpath, dtbook_dir)
            dtbook_context[relpath] = fullpath

    # DTBook validation messages that are tolerated
    ignored_prefixes = (
        "[tpb124]",
        "[tpb43]",
        "[tpb10] Meta dc:Publisher",
        "[tpb10] Meta dc:Date",
        "[opf3g]",
    )
    ignored_fragments = (
        'element "h1" not allowed here',
        'element "h2" not allowed here',
        'element "h3" not allowed here',
        'element "h4" not allowed here',
        'element "h5" not allowed here',
        'element "h6" not allowed here',
        'token "toc-brief" invalid',
    )

    with DaisyPipelineJob(self,
                          "nordic-dtbook-validate",
                          {"dtbook": os.path.basename(dtbook), "no-legacy": "false"},
                          pipeline_and_script_version=list(dp2_versions),
                          context=dtbook_context) as dp2_job_dtbook_validate:
        dtbook_validate_status = None
        if dp2_job_dtbook_validate.status == "SUCCESS":
            dtbook_validate_status = "SUCCESS"
        elif dp2_job_dtbook_validate.status in ["VALIDATION_FAIL", "FAIL"]:
            dtbook_validate_status = "WARN"
        else:
            dtbook_validate_status = "ERROR"

        report_file = os.path.join(dp2_job_dtbook_validate.dir_output, "html-report/report.xhtml")

        if dtbook_validate_status == "WARN":
            # a failed validation is only fatal if it contains messages
            # other than the tolerated ones
            report_doc = ElementTree.parse(report_file)
            errors = report_doc.xpath('//*[@class="error" or @class="message-error"]')
            for error in errors:
                error_text = " ".join([e.strip() for e in error.xpath('.//text()')]).strip()
                error_text = " ".join(error_text.split()).strip() if bool(error_text) else error_text

                if (bool(error_text) and (
                        error_text.startswith(ignored_prefixes)
                        or any(fragment in error_text for fragment in ignored_fragments))):
                    continue  # ignore these error messages

                if error_text.startswith("Incorrect file signature"):
                    magic_number = error.xpath(
                        '*[@class="message-details"]/*[last()]/*[last()]/text()')[0]
                    magic_number = " ".join(magic_number.split()).strip() if bool(magic_number) else magic_number

                    # JFIF already allowed: 0xFF 0xD8 0xFF 0xE0 0x?? 0x?? 0x4A 0x46 0x49 0x46
                    if magic_number.startswith("0xFF 0xD8 0xFF 0xDB"):  # Also allow JPEG RAW
                        continue
                    if (magic_number[:19] == "0xFF 0xD8 0xFF 0xE1"
                            and magic_number[30:] == "0x45 0x78 0x69 0x66"):  # Also allow EXIF
                        continue

                dtbook_validate_status = "ERROR"
                self.utils.report.error(error_text)

        # get conversion report
        if os.path.isfile(report_file):
            with open(report_file, 'r') as result_report:
                self.utils.report.attachment(
                    result_report.readlines(),
                    os.path.join(self.utils.report.reportDir(), "report-dtbook.html"),
                    dtbook_validate_status)

        if dtbook_validate_status == "ERROR":
            self.utils.report.error("Klarte ikke å validere boken")
            return False

        if dtbook_validate_status == "WARN":
            self.utils.report.warn("DTBoken er ikke valid, men vi fortsetter alikevel.")

    self.utils.report.info("Konverterer fra Nordisk DTBook til Nordisk HTML...")
    temp_htmldir_obj = tempfile.TemporaryDirectory()
    temp_htmldir = temp_htmldir_obj.name
    temp_htmlfile = None
    with DaisyPipelineJob(self,
                          "nordic-dtbook-to-html",
                          {"dtbook": os.path.basename(dtbook), "fail-on-error": "false", "no-legacy": "false"},
                          pipeline_and_script_version=list(dp2_versions),
                          context=dtbook_context) as dp2_job_dtbook_to_html:
        convert_status = "SUCCESS" if dp2_job_dtbook_to_html.status == "SUCCESS" else "ERROR"

        convert_report_file = os.path.join(dp2_job_dtbook_to_html.dir_output, "html-report/report.xhtml")

        if convert_status != "SUCCESS":
            self.utils.report.error("Klarte ikke å konvertere boken fra DTBook til HTML")

            # get conversion report
            if os.path.isfile(convert_report_file):
                with open(convert_report_file, 'r') as result_report:
                    self.utils.report.attachment(
                        result_report.readlines(),
                        os.path.join(self.utils.report.reportDir(), "report-dtbook-to-html.html"),
                        convert_status)

            return False

        dp2_html_dir = os.path.join(dp2_job_dtbook_to_html.dir_output, "output-dir")

        if not os.path.isdir(dp2_html_dir):
            self.utils.report.error(
                "Finner ikke 'output-dir' for den konverterte boken: {}".format(dp2_html_dir))
            return False

        # copy the result out of the job's output directory before the
        # job context is left
        Filesystem.copy(self.utils.report, dp2_html_dir, temp_htmldir)
        temp_htmlfile = os.path.join(temp_htmldir, metadata["identifier"] + ".xhtml")

    if not os.path.isfile(temp_htmlfile):
        self.utils.report.error("Finner ikke den konverterte boken: {}".format(temp_htmlfile))
        self.utils.report.info("Kanskje filnavnet er forskjellig fra IDen?")
        return False

    self.utils.report.info("Rydder opp i nordisk HTML")
    temp_html_xslt_output_obj = tempfile.NamedTemporaryFile()
    temp_html_xslt_output = temp_html_xslt_output_obj.name
    xslt = Xslt(self,
                stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir, NordicDTBookToEpub.uid,
                                        "nordic-cleanup-html.xsl"),
                source=temp_htmlfile,
                target=temp_html_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_html_xslt_output, temp_htmlfile)

    self.utils.report.info("Konverterer fra Nordisk HTML til Nordisk EPUB3...")

    # create context for Pipeline 2 job
    html_dir = os.path.dirname(temp_htmlfile)
    html_context = {}
    for root, dirs, files in os.walk(html_dir):
        for filename in files:
            fullpath = os.path.join(root, filename)
            relpath = os.path.relpath(fullpath, html_dir)
            html_context[relpath] = fullpath

    temp_epub_file_obj = tempfile.NamedTemporaryFile()
    temp_epub_file = temp_epub_file_obj.name
    with DaisyPipelineJob(self,
                          "nordic-html-to-epub3",
                          {"html": os.path.basename(temp_htmlfile), "fail-on-error": "false"},
                          pipeline_and_script_version=list(dp2_versions),
                          context=html_context) as dp2_job_html_to_epub:
        convert_status = "SUCCESS" if dp2_job_html_to_epub.status == "SUCCESS" else "ERROR"

        convert_report_file = os.path.join(dp2_job_html_to_epub.dir_output, "html-report/report.xhtml")

        if convert_status != "SUCCESS":
            self.utils.report.error("Klarte ikke å konvertere boken")

            # get conversion report
            if os.path.isfile(convert_report_file):
                with open(convert_report_file, 'r') as result_report:
                    self.utils.report.attachment(
                        result_report.readlines(),
                        os.path.join(self.utils.report.reportDir(), "report-html-to-epub3.html"),
                        convert_status)

            return False

        dp2_epub_file = os.path.join(dp2_job_html_to_epub.dir_output, "output-dir",
                                     metadata["identifier"] + ".epub")

        if not os.path.isfile(dp2_epub_file):
            self.utils.report.error(
                "Finner ikke den konverterte boken: {}".format(dp2_epub_file))
            self.utils.report.info("Kanskje filnavnet er forskjellig fra IDen?")
            return False

        self.utils.report.info("Validerer Nordisk EPUB 3...")
        # dp2_epub_file is a plain path to the zipped EPUB (checked with
        # isfile above), so it is passed to the validator as-is; calling
        # .asFile() on the string would raise AttributeError
        epub_file = dp2_epub_file
        with DaisyPipelineJob(self,
                              "nordic-epub3-validate",
                              {"epub": os.path.basename(epub_file)},
                              pipeline_and_script_version=list(dp2_versions),
                              context={os.path.basename(epub_file): epub_file}) as dp2_job_epub_validate:
            epub_validate_status = "SUCCESS" if dp2_job_epub_validate.status == "SUCCESS" else "ERROR"

            report_file = os.path.join(dp2_job_epub_validate.dir_output, "html-report/report.xhtml")

            if epub_validate_status == "ERROR":

                # attach intermediary file from conversion
                with open(temp_htmlfile, 'r') as intermediary_htmlfile:
                    self.utils.report.attachment(
                        intermediary_htmlfile.readlines(),
                        os.path.join(self.utils.report.reportDir(), "intermediary-html.html"),
                        "DEBUG")

                # downgrade to a warning unless a non-tolerated message is
                # found in the report
                epub_validate_status = "WARN"

                report_doc = ElementTree.parse(report_file)
                errors = report_doc.xpath('//*[@class="error" or @class="message-error"]')
                for error in errors:
                    error_text = " ".join([e.strip() for e in error.xpath('.//text()')]).strip()
                    error_text = " ".join(error_text.split()).strip() if bool(error_text) else error_text

                    if (bool(error_text) and (
                            error_text.startswith("[nordic280]")
                            or "PKG-021: Corrupted image file encountered." in error_text)):
                        continue  # ignore these error messages
                    else:
                        self.utils.report.warn("Not ignoring: {}".format(error_text))

                    if error_text.startswith("Incorrect file signature"):
                        magic_number = error.xpath(
                            '*[@class="message-details"]/*[last()]/*[last()]/text()')[0]
                        magic_number = " ".join(magic_number.split()).strip() if bool(magic_number) else magic_number

                        # JFIF already allowed: 0xFF 0xD8 0xFF 0xE0 0x?? 0x?? 0x4A 0x46 0x49 0x46
                        if magic_number.startswith("0xFF 0xD8 0xFF 0xDB"):  # Also allow JPEG RAW
                            continue
                        if (magic_number[:19] == "0xFF 0xD8 0xFF 0xE1"
                                and magic_number[30:] == "0x45 0x78 0x69 0x66"):  # Also allow EXIF
                            continue

                    epub_validate_status = "ERROR"
                    self.utils.report.error(error_text)

            # get conversion report
            if os.path.isfile(report_file):
                with open(report_file, 'r') as result_report:
                    self.utils.report.attachment(
                        result_report.readlines(),
                        os.path.join(self.utils.report.reportDir(), "report-epub3.html"),
                        epub_validate_status)

            if epub_validate_status == "ERROR":
                self.utils.report.error("Klarte ikke å validere EPUB 3-versjonen av boken")
                return False

        # copy the EPUB out of the job's output directory before the
        # conversion job context is left
        Filesystem.copy(self.utils.report, dp2_epub_file, temp_epub_file)

    epub = Epub(self.utils.report, temp_epub_file)
    if not epub.isepub():
        return False

    self.utils.report.info("Boken ble konvertert. Kopierer til EPUB3-fra-DTBook-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(
        epub.asDir(), metadata["identifier"], overwrite=self.overwrite)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = "{}: {} ble konvertert 👍😄 ({})".format(
        self.title, metadata["identifier"], metadata["title"])
    return True
def on_book(self):
    """Convert the HTML book in ``self.book["source"]`` to PEF and archive it.

    The file set is copied to a temporary directory, the HTML file is
    located, and the identifier plus layout hints (line spacing, duplex)
    are read from the ``<head>`` metadata. A DAISY Pipeline 2 job then
    produces the PEF, conversion metadata is copied into the PEF file on a
    best-effort basis, and the result is stored with ``storeBook``.

    Returns:
        True if the book was converted and archived, otherwise False.
    """
    report = self.utils.report
    report.attachment(None, self.book["source"], "DEBUG")

    report.info("Lager en kopi av filsettet")
    html_workdir = tempfile.TemporaryDirectory()
    html_root = html_workdir.name
    Filesystem.copy(report, self.book["source"], html_root)

    report.info("Finner HTML-fila")
    # When several files match, the last one visited by os.walk is used.
    html_candidates = [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(html_root)
        for filename in filenames
        if filename.endswith("html")
    ]
    html_file = html_candidates[-1] if html_candidates else None
    if not html_file or not os.path.isfile(html_file):
        report.error(self.book["name"] + ": Klarte ikke å finne en HTML-fil.")
        report.title = self.title + ": " + self.book["name"] + " feilet "
        return False

    html_xml = ElementTree.parse(html_file).getroot()
    identifier_nodes = html_xml.xpath(
        "/*/*[local-name()='head']/*[@name='dc:identifier']")

    metadata = Metadata.get_metadata_from_book(report, html_root)

    # Layout hints default to single line spacing and double-sided printing.
    linespacing_values = [
        node.attrib["content"] for node in html_xml.xpath(
            "/*/*[local-name()='head']/*[@name='dc:format.linespacing']")
    ]
    line_spacing = "double" if "double" in linespacing_values else "single"
    printing_values = [
        node.attrib["content"] for node in html_xml.xpath(
            "/*/*[local-name()='head']/*[@name='dc:format.printing']")
    ]
    duplex = "false" if "single-sided" in printing_values else "true"

    report.info("Linjeavstand: {}".format(
        "åpen" if line_spacing == "double" else "enkel"))
    report.info("Trykk: {}".format(
        "enkeltsidig" if duplex == "false" else "dobbeltsidig"))

    book_title = " (" + html_xml.xpath(
        "string(/*/*[local-name()='head']/*[local-name()='title']/text())"
    ) + ") "

    identifier = None
    if identifier_nodes and "content" in identifier_nodes[0].attrib:
        identifier = identifier_nodes[0].attrib["content"]
    if not identifier:
        report.error(
            self.book["name"] + ": Klarte ikke å finne boknummer i HTML-fil.")
        report.title = self.title + ": " + self.book["name"] + " feilet "
        return False

    # NOTE(review): the EPUB identifier is read but not used further in this method.
    epub_identifier_nodes = html_xml.xpath(
        "/*/*[local-name()='head']/*[@name='nlbprod:identifier.epub']")
    epub_identifier = None
    if epub_identifier_nodes and "content" in epub_identifier_nodes[0].attrib:
        epub_identifier = epub_identifier_nodes[0].attrib["content"]

    # ---------- convert to PEF ----------

    # Create the context for the Pipeline 2 job: every non-media file next to
    # (or below) the HTML file, keyed by its path relative to the HTML directory.
    html_dir = os.path.dirname(html_file)
    html_context = {}
    for folder, _subdirs, filenames in os.walk(html_dir):
        for filename in filenames:
            mime = mimetypes.guess_type(filename)[0]
            is_media = mime is not None and mime.split("/")[0] in [
                "image", "video", "audio"
            ]
            if not is_media:
                absolute = os.path.join(folder, filename)
                html_context[os.path.relpath(absolute, html_dir)] = absolute

    script_id = "nlb:html-to-pef"
    pipeline_and_script_version = [
        ("1.11.1-SNAPSHOT", "1.10.0-SNAPSHOT"),
    ]
    braille_arguments = {
        "source": os.path.basename(html_file),
        "braille-standard": "(dots:6)(grade:0)",
        "line-spacing": line_spacing,
        "duplex": duplex,
    }

    is_statped = metadata["library"].lower() == "statped"

    # for custom Statped options using NLBs PIP (remove `and False` or replace with `or True` to test)
    if is_statped and False:
        # see: https://github.com/nlbdev/pipeline/blob/nlb/nlb/book-to-pef/src/main/resources/xml/html-to-pef.xpl#L146-L167
        #
        # (1) 'http://www.nlb.no/pipeline/modules/braille/pre-processing.xsl',
        # (2) 'http://www.daisy.org/pipeline/modules/braille/xml-to-pef/generate-toc.xsl',
        # (3) if ($default-table-class = '') then resolve-uri('add-table-classes.xsl') else (),
        # (4) if ($insert-boilerplate = 'true') then 'http://www.nlb.no/pipeline/modules/braille/insert-boilerplate.xsl' else (),
        # (5) if ($apply-default-stylesheet = 'true') then 'http://www.nlb.no/pipeline/modules/braille/default.scss' else (),
        # (6) if ($stylesheet) then tokenize($stylesheet,',') else ()),' ')"/>
        braille_arguments["insert-boilerplate"] = "false"  # disable (4)
        braille_arguments["apply-default-stylesheet"] = "false"  # disable (5)

        # (1-3) will still be included. Specifying (6) lets us include replacements for (4) and (5)
        braille_arguments["stylesheet"] = ",".join([
            "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/insert-boilerplate.xsl",
            "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/scss/braille.scss"
        ])

    # for custom Statped options using DAISYs PIP (remove `and False` or replace with `or True` to test)
    if is_statped and True:
        # use DAISYs version of PIP instead
        script_id = "html-to-pef"
        pipeline_and_script_version = [
            ("1.14.6", None),
            ("1.14.5", None),
            ("1.14.4", "4.2.0"),
            ("1.14.4-SNAPSHOT", "4.1.1"),
            ("1.14.3", "4.1.1"),
            ("1.14.2", "4.1.0"),
            ("1.13.6", "1.4.6"),
            ("1.13.4", "1.4.5"),
            ("1.12.1", "1.4.2"),
            ("1.11.1-SNAPSHOT", "1.3.0"),
        ]
        statped_stylesheets = [
            # 1. better volume breaking, and also removes title page and print toc,
            #    moves the colophon and copyright page to the end of the book
            # "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/pre-processing.xsl",
            "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/pre-processing.xsl",

            # "https://raw.githubusercontent.com/daisy/pipeline/master/modules/braille/xml-to-pef/src/main/resources/xml/xslt/generate-toc.xsl",

            # 3. NLB: Add table classes based on the dimensions of the table, for better handling of tables
            "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/add-table-classes.xsl",

            # 4. NLB: Generate a new title page and about page in the frontmatter
            # "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/insert-boilerplate.xsl",
            "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/insert-boilerplate.xsl",

            # 5. Statped-specific SCSS
            "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/scss/braille.scss",
        ]
        braille_arguments = {
            "html": os.path.basename(html_file),
            "transform": "(formatter:dotify)(translator:liblouis)(dots:6)(grade:0)",
            "stylesheet": " ".join(statped_stylesheets),
            "page-width": '38',
            "page-height": '29',
            "toc-depth": '2',
            "maximum-number-of-sheets": '50',
            "include-production-notes": 'true',
            "hyphenation": 'false',
            "allow-volume-break-inside-leaf-section-factor": '10',
            "prefer-volume-break-before-higher-level-factor": '1',
            "stylesheet-parameters": "(skip-margin-top-of-page:true)",
        }

    pef_workdir = tempfile.TemporaryDirectory()

    report.info("Konverterer fra HTML til PEF...")
    found_pipeline_version = None
    found_script_version = None
    with DaisyPipelineJob(
            self,
            script_id,
            braille_arguments,
            pipeline_and_script_version=pipeline_and_script_version,
            context=html_context) as dp2_job:
        found_pipeline_version = dp2_job.found_pipeline_version
        found_script_version = dp2_job.found_script_version

        # get conversion report
        preview_dir = os.path.join(dp2_job.dir_output, "preview-output-dir")
        if os.path.isdir(preview_dir):
            Filesystem.copy(
                report, preview_dir,
                os.path.join(report.reportDir(), "preview"))
            preview_status = "SUCCESS" if dp2_job.status == "SUCCESS" else "ERROR"
            report.attachment(
                None,
                os.path.join(report.reportDir(),
                             "preview" + "/" + identifier + ".pef.html"),
                preview_status)

        if dp2_job.status != "SUCCESS":
            report.info("Klarte ikke å konvertere boken")
            report.title = self.title + ": " + identifier + " feilet 😭👎" + book_title
            return False

        # Prefer "pef-output-dir"; fall back to "output-dir" only when the
        # former does not exist and the latter does.
        pef_output_dir = os.path.join(dp2_job.dir_output, "pef-output-dir")
        alternative_output_dir = os.path.join(dp2_job.dir_output, "output-dir")
        if not os.path.exists(pef_output_dir) and os.path.exists(
                alternative_output_dir):
            pef_output_dir = alternative_output_dir
        if not os.path.isdir(pef_output_dir):
            report.info("Finner ikke den konverterte boken.")
            report.title = self.title + ": " + identifier + " feilet 😭👎" + book_title
            return False

        Filesystem.copy(report, pef_output_dir, pef_workdir.name)

        report.info("Boken ble konvertert.")

    report.info("Kopierer metadata fra HTML til PEF...")
    try:
        # As with the HTML file, the last matching file found wins.
        pef_candidates = [
            os.path.join(folder, filename)
            for folder, _subdirs, filenames in os.walk(pef_workdir.name)
            for filename in filenames
            if filename.endswith(".pef")
        ]
        pef_file = pef_candidates[-1] if pef_candidates else None
        if not pef_file or not os.path.isfile(pef_file):
            report.error(self.book["name"] + ": Klarte ikke å finne en PEF-fil.")
        else:
            namespace = ("nlbprod", "http://www.nlb.no/production")
            additional_metadata = [
                ("daisy-pipeline-engine-version",) + namespace + (None, found_pipeline_version),
                ("daisy-pipeline-script-id",) + namespace + (None, script_id),
                ("daisy-pipeline-script-version",) + namespace + (None, found_script_version),
            ]
            for argument, raw_value in braille_arguments.items():
                if argument in ["source", "html"]:
                    continue  # skip HTML file path
                values = raw_value if isinstance(raw_value, list) else [raw_value]
                additional_metadata.extend(
                    ("daisy-pipeline-argument",) + namespace + (argument, value)
                    for value in values)
            transfer_metadata_from_html_to_pef(html_file, pef_file,
                                               additional_metadata)
    except Exception:
        # Metadata transfer is best-effort: log the failure and archive anyway.
        report.warning(traceback.format_exc(), preformatted=True)
        report.error(
            "An error occured while trying to insert metadata about the conversion"
        )

    report.info("Kopierer til PEF-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(
        pef_workdir.name, identifier)
    report.attachment(None, archived_path, "DEBUG")
    report.title = self.title + ": " + identifier + " ble konvertert 👍😄" + book_title
    return True
def on_book(self):
    """Validate an incoming DAISY 2.02 audio book and move it to the archive.

    The book folder is copied to a temporary directory and checked in
    several steps: folder name and ``dc:identifier`` must agree, ``ncc.html``
    must be UTF-8, metadata must be available from the API, the identifier
    length must fit the publication type, required NCC metadata and headings
    must be present, and (except for Statped) the Pipeline validator must
    accept the book. Books at or above the size limit must have accompanying
    multi-volume folders, which are validated and archived as well.

    On success the book is stored in ``self.dir_out`` (and in
    ``self.nlbsamba_out`` when configured), a ``.donedaisy`` marker file is
    written and the source is deleted.

    Returns:
        True if the book was validated and transferred, otherwise False.
    """
    self.utils.report.info("Validerer Daisy 2.02 lydbok")

    if self.dp1_home == "" or self.validator_script == "":
        if not self.init_environment():
            self.utils.report.error("Pipeline1 ble ikke funnet. Avbryter..")
            return False

    folder = self.book["name"]
    if not folder.isnumeric():
        self.utils.report.warn(
            f"{folder} er ikke et tall, prosesserer ikke denne boka. Mulig det er en multivolum bok."
        )
        self.utils.report.should_email = False
        return False

    if os.path.isdir(os.path.join(self.dir_out, folder)):
        self.utils.report.error(
            f"{folder} finnes allerede på share, avbryter.")
        return False

    if self.nlbsamba_out == "":
        self.nlbsamba_out = Config.get("nlbsamba.dir")
        if self.nlbsamba_out is None:
            self.nlbsamba_out = ""

    temp_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_dir)

    ncc_path = os.path.join(temp_dir, "ncc.html")
    if not os.path.isfile(ncc_path):
        self.utils.report.error("Finner ikke ncc fila")
        self.utils.report.title = self.title + ": " + self.book[
            "name"] + " feilet 😭👎. Er dette en daisy 2.02 lydbok med en ncc.html fil?"
        return False

    try:
        ncc_tree = ElementTree.parse(ncc_path)
        ncc_encoding = ncc_tree.docinfo.encoding.lower()
        nccdoc = ncc_tree.getroot()
    except Exception:
        self.utils.report.info(
            "Klarte ikke lese ncc fila. Sjekk loggen for detaljer.")
        self.utils.report.debug(traceback.format_exc(), preformatted=True)
        return False

    audio_title = nccdoc.xpath("string(//*[@name='dc:title']/@content)")
    edition_identifier = nccdoc.xpath(
        "string(//*[@name='dc:identifier']/@content)")

    if ncc_encoding != 'utf-8':
        # The detected encoding is interpolated into the message
        # (the original string was missing its f-prefix).
        self.utils.report.error(
            self.book["name"] +
            f": Encodingen til filen er ikke utf-8, ({ncc_encoding}) avbryter."
        )
        self.utils.report.title = self.title + ": " + self.book[
            "name"] + " feilet 😭👎"
        return False

    str_edition_identifier = str(edition_identifier)
    str_book_name = str(self.book["name"])
    if edition_identifier == "" or str_edition_identifier != str_book_name:
        self.utils.report.error(
            self.book["name"] +
            f": Klarte ikke å bestemme boknummer basert på dc:identifier. dc:identifier: {str_edition_identifier} mappenavn: {str_book_name}"
        )
        self.utils.report.title = self.title + ": " + self.book[
            "name"] + " feilet 😭👎"
        return False

    self.utils.report.info("Henter metadata fra api.nlb.no")
    creative_work_metadata = None
    edition_metadata = None
    # Up to five attempts; stop as soon as the creative work is found.
    for _attempt in range(5):
        creative_work_metadata = Metadata.get_creative_work_from_api(
            edition_identifier,
            editions_metadata="all",
            use_cache_if_possible=True,
            creative_work_metadata="all")
        edition_metadata = Metadata.get_edition_from_api(edition_identifier)
        if creative_work_metadata is not None:
            break

    if creative_work_metadata is None:
        self.utils.report.warning(
            "Klarte ikke finne et åndsverk tilknyttet denne utgaven. Prøver igjen senere."
        )
        return False

    # NOTE(review): edition_metadata is not checked for None before use — confirm
    # that get_edition_from_api cannot fail when the creative work lookup succeeds.
    library = edition_metadata["library"].lower()

    # Normalise the casing of known library names.
    library = {"nlb": "NLB", "statped": "Statped", "kabb": "KABB"}.get(
        library, library)

    periodical = False
    if creative_work_metadata["newspaper"] is True or creative_work_metadata[
            "magazine"] is True:
        periodical = True
        if len(edition_identifier) != 12:
            self.utils.report.error(
                f"Boka {edition_identifier} er en avis eller et magasin, men utgavenummeret har ikke 12 siffer"
            )
            return False
    else:
        if len(edition_identifier) != 6:
            self.utils.report.error(
                f"Boka {edition_identifier} har ikke 6 siffer")
            return False

    def directory_size(directory):
        """Return the total size in bytes of all regular files below *directory*."""
        return sum(f.stat().st_size for f in Path(directory).glob('**/*')
                   if f.is_file())

    # 670 MiB minus 20 MiB of headroom.
    max_size = 702545920 - 20971520
    size = directory_size(temp_dir)
    multi_volume = size >= max_size
    if multi_volume:
        self.utils.report.info(
            f"{edition_identifier} er på størrelse {size}, sjekker om det er en multivolum bok."
        )
    else:
        self.utils.report.info(
            f"{edition_identifier} er på størrelse {size} bytes")

    multi_volume_dirs = []
    if multi_volume:
        for volume_name in os.listdir(self.dir_in):
            # Volume folders are named "<book name>..._<digit>".
            is_volume = (volume_name.startswith(self.book["name"])
                         and volume_name[-1].isdigit()
                         and volume_name[-2] == "_")
            if not is_volume:
                continue

            self.utils.report.info(
                f"{volume_name} er en del av multi volum boka {edition_identifier}"
            )
            multi_volume_dirs.append(volume_name)
            multi_volume_directory = Path(
                os.path.join(self.dir_in, volume_name))
            # Kept separate from `size` so the main book's size is not overwritten.
            multi_volume_size = directory_size(multi_volume_directory)
            if multi_volume_size >= max_size:
                self.utils.report.info(
                    f" Multi volum mappen {volume_name} er på størrelse {multi_volume_size}, dette er for stort"
                )
                self.utils.report.title = self.title + ": " + self.book[
                    "name"] + " Lydbok feilet 😭👎"
                return False

            multi_volume_files = os.listdir(multi_volume_directory)
            self.utils.report.info(
                f"Validerer filer til multi volum {volume_name}...")
            if self.check_files(edition_identifier, multi_volume_files,
                                library, multi_volume_directory,
                                multi_volume) is False:
                return False

        if not multi_volume_dirs:
            self.utils.report.error(
                f"{edition_identifier} bør være en multivolum bok, men har ikke flere multivolum mapper. Avbryter."
            )
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " Lydbok feilet 😭👎"
            return False

    files_book = os.listdir(temp_dir)
    if "default.css" in files_book and library != "Statped":
        self.utils.report.info("Erstatter default.css med en tom fil")
        with open(os.path.join(temp_dir, "default.css"), 'w'):
            pass  # truncate the file

    self.utils.report.info("Validerer filer...")
    if self.check_files(edition_identifier, files_book, library, temp_dir,
                        False) is False:
        return False

    dc_creator = nccdoc.xpath("string(//*[@name='dc:creator']/@content)")
    if not dc_creator:
        self.utils.report.error(
            f"{edition_identifier} finner ikke dc:creator, dette må boka ha")
        return False

    dc_narrator = nccdoc.xpath("string(//*[@name='ncc:narrator']/@content)")
    if not dc_narrator:
        self.utils.report.error(
            f"{edition_identifier} finner ikke ncc:narrator, dette må boka ha"
        )
        return False

    multimedia_types = [
        "audioOnly", "audioNcc", "audioPartText", "audioFullText",
        "textPartAudio", "textNcc"
    ]
    ncc_multimedia_type = nccdoc.xpath(
        "string(//*[@name='ncc:multimediaType']/@content)")
    if ncc_multimedia_type not in multimedia_types:
        self.utils.report.error(
            f"{edition_identifier} har ikke en valid ncc:multimediaType, dette må boka ha. Multimediatype er {ncc_multimedia_type}"
        )
        return False

    first_head_class = nccdoc.xpath(
        "string(//*[local-name()='h1'][1]/@class)")
    second_head = nccdoc.xpath("string(//*[local-name()='h1'][2])").lower()
    accepted_second_head = [
        "lydbokavtalen", "audiobook agreement", "the audiobook agreement",
        "tigar announcement", "nlb"
    ]

    if first_head_class != "title":
        self.utils.report.error(
            f"{edition_identifier} første heading {first_head_class} er ikke title"
        )
        return False

    # NOTE(review): the KABB-magazine exemption can never apply here because
    # the condition already requires library == "NLB" — confirm the intent.
    if second_head not in accepted_second_head and library == "NLB" and creative_work_metadata[
            "newspaper"] is False and not (
                creative_work_metadata["magazine"] is True
                and library == "KABB"):
        self.utils.report.error(
            f"{edition_identifier} andre heading {second_head} er ikke Lydbokavtalen, Audiobook agreement, eller Tigar announcement"
        )
        return False

    if library != "Statped":
        status = self.validate_book(ncc_path)
        if status == "ERROR" or status is False:
            self.utils.report.error(
                "Pipeline validator: Boka er ikke valid. Se rapport.")
            return False
        self.utils.report.info("Pipeline validator: Boka er valid")

    if multi_volume:
        for volume_folder in multi_volume_dirs:
            volume_source = os.path.join(self.dir_in, volume_folder)
            self.utils.report.debug(
                f"Flytter multivolum fil {volume_folder}")
            archived_path_multi, stored = self.utils.filesystem.storeBook(
                volume_source, volume_folder)
            self.utils.report.attachment(None, archived_path_multi, "DEBUG")
            if self.nlbsamba_out != "":
                archived_path_samba_multi, stored_samba_multi = self.utils.filesystem.storeBook(
                    volume_source, volume_folder, dir_out=self.nlbsamba_out)
                self.utils.report.attachment(None, archived_path_samba_multi,
                                             "DEBUG")
            shutil.rmtree(volume_source)

    if library == "Statped":
        css_format = "Statped"
    elif edition_metadata["includesText"] is True:
        css_format = "daisy202"
    else:
        css_format = "daisy202-ncc"
    self.utils.report.info(f"Inserting CSS: {css_format}")
    if library != "Statped":
        self.utils.filesystem.insert_css(
            os.path.join(temp_dir, "default.css"), library, css_format)

    files_temp = os.listdir(temp_dir)
    archived_path, stored = self.utils.filesystem.storeBook(
        temp_dir, edition_identifier)
    if self.nlbsamba_out != "":
        archived_path_samba, stored_samba = self.utils.filesystem.storeBook(
            temp_dir, edition_identifier, dir_out=self.nlbsamba_out)
        self.utils.report.attachment(None, archived_path_samba, "DEBUG")

    files_out = os.listdir(os.path.join(self.dir_out, edition_identifier))

    # The transfer counts as complete only when the number of entries in the
    # destination equals the number in the temporary copy; an empty
    # ".donedaisy" marker file is then written.
    if self.nlbsamba_out != "":
        samba_book_dir = os.path.join(self.nlbsamba_out, edition_identifier)
        if len(files_temp) == len(os.listdir(samba_book_dir)):
            with open(os.path.join(samba_book_dir, '.donedaisy'), 'w'):
                self.utils.report.debug(".donedaisy created")
        else:
            self.utils.report.error(
                f"MANGLER FILER i {self.nlbsamba_out}, sjekk utmappa")
            return False

    if len(files_temp) == len(files_out):
        with open(
                os.path.join(self.dir_out, edition_identifier, '.donedaisy'),
                'w'):
            self.utils.report.debug(".donedaisy created")
    else:
        self.utils.report.error(
            f"MANGLER FILER i {self.dir_out}, sjekk utmappa")
        return False

    self.utils.report.info("Boka er godkjent og overført")

    if periodical:
        # Newspapers are announced without a title; magazines use the audio title.
        available_title = ""
        if creative_work_metadata["newspaper"] is False:
            available_title = audio_title
        Bibliofil.book_available("DAISY 2.02",
                                 edition_identifier,
                                 title=available_title)

    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + edition_identifier + " er valid 👍😄" + audio_title
    self.utils.filesystem.deleteSource()
    return True
def email(self, recipients, subject=None, should_email=True, should_message_slack=True, should_attach_log=True, should_escape_chars=True):
    """Render the collected report as an e-mail, optionally send it, and notify Slack.

    Args:
        recipients: A single address string, or a list/tuple of address
            strings. The caller's object is never modified.
        subject: E-mail subject; falls back to ``self.title`` and then
            ``self.pipeline.title`` when empty.
        should_email: When False the e-mail is rendered and saved but not sent.
        should_message_slack: When False no Slack message is posted.
        should_attach_log: When True the complete log is attached and the
            rendered HTML is saved as ``email.html`` in the report directory;
            otherwise it is saved under the daily-report log directory.
        should_escape_chars: When True message text is HTML-escaped and the
            body is rendered from Markdown to HTML; otherwise the text is
            used as-is.

    Raises:
        AssertionError: If no subject is given and neither ``self.title`` nor
            ``self.pipeline`` is available. Assertion failures while building
            or sending the e-mail are logged, not raised.
    """

    def escape_html(text):
        """Escape the characters that are significant in HTML text content."""
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    if not subject:
        assert isinstance(self.title, str) or self.pipeline is not None, "either title or pipeline must be specified when subject is missing"
        subject = self.title if self.title else self.pipeline.title

    smtp = {
        "host": Config.get("email.smtp.host", None),
        "port": Config.get("email.smtp.port", None),
        "user": Config.get("email.smtp.user", None),
        "pass": Config.get("email.smtp.pass", None)
    }
    sender = Address(Config.get("email.sender.name", "undefined"), addr_spec=Config.get("email.sender.address", "*****@*****.**"))

    # 0. Create attachment with complete log (including DEBUG statements)
    if should_attach_log is True:
        self.attachLog()

    # Collect attachments, de-duplicated on their UNC path.
    attachments = []
    for m in self._messages["attachment"]:
        smb, file, unc = Filesystem.networkpath(m["text"])
        base_path = Filesystem.get_base_path(m["text"], self.pipeline.dir_base)
        relpath = os.path.relpath(m["text"], base_path) if base_path else None
        if m["text"].startswith(self.reportDir()):
            relpath = os.path.relpath(m["text"], self.reportDir())
        if not any(a["unc"] == unc for a in attachments):
            attachments.append({
                "title": "{}{}".format(relpath, ("/" if os.path.isdir(m["text"]) else "")),
                "smb": smb,
                "file": file,
                "unc": unc,
                "severity": m["severity"]
            })

    # Determine overall status: ERROR > WARN > SUCCESS > INFO
    status = "INFO"
    for message_type in self._messages:
        for m in self._messages[message_type]:
            if m["severity"] == "SUCCESS" and status in ["INFO"]:
                status = "SUCCESS"
            elif m["severity"] == "WARN" and status in ["INFO", "SUCCESS"]:
                status = "WARN"
            elif m["severity"] == "ERROR":
                status = "ERROR"

    try:
        assert isinstance(smtp, dict), "smtp must be a dict"
        assert isinstance(sender, Address), "sender must be a Address"
        assert isinstance(recipients, str) or isinstance(recipients, list) or isinstance(recipients, tuple), "recipients must be a str, list or tuple"
        assert isinstance(self.title, str) or self.pipeline and isinstance(self.pipeline.title, str), "title or pipeline.title must be a str"

        # Always work on a private list so the caller's object is not mutated
        # when administrators are added below.
        if isinstance(recipients, str):
            recipients = [recipients]
        else:
            recipients = list(recipients)

        if status == "ERROR":
            for key in Config.get("administrators", default=[]):
                if key not in recipients:
                    recipients.append(key)

        # when testing, only allow e-mail addresses defined in the ALLOWED_EMAIL_ADDRESSES_IN_TEST env var
        if Config.get("test"):
            subject = "[test] " + subject
            # Looked up once; a missing setting means nobody is allowed.
            allowed_in_test = Config.get("email.allowed_email_addresses_in_test") or []
            recipients = [recipient for recipient in recipients if recipient in allowed_in_test]

        # 1. join lines with severity SUCCESS/INFO/WARN/ERROR
        markdown_text = []
        for m in self._messages["message"]:
            text = escape_html(m['text']) if should_escape_chars else m['text']
            if m['preformatted'] is True:
                markdown_text.append("<pre>{}</pre>".format(text))
            elif m['severity'] != 'DEBUG':
                markdown_text.append(text)

        if attachments != [] or should_attach_log is True:
            markdown_text.append("\n----\n")
            markdown_text.append("\n# Lenker\n")
            markdown_text.append("\n<ul style=\"list-style: none;\">")

            # Pick icon and style for INFO-attachments
            attachment_styles = {
                "DEBUG": {
                    "icon": "🗎",
                    "style": ""
                },
                "INFO": {
                    "icon": "🛈",
                    "style": ""
                },
                "SUCCESS": {
                    "icon": "😄",
                    "style": "background-color: #bfffbf;"
                },
                "WARN": {
                    "icon": "😟",
                    "style": "background-color: #ffffbf;"
                },
                "ERROR": {
                    "icon": "😭",
                    "style": "background-color: #ffbfbf;"
                }
            }

            for attachment in attachments:
                # UNC links seems to be preserved when viewed in Outlook.
                # file: and smb: URIs are disallowed or removed.
                # So these links will only work in Windows.
                # If we need this to work cross-platform, we would have
                # to map the network share paths to a web server so that
                # the transfers go through http:. This could maybe be mapped
                # using environment variables.
                style = attachment_styles[attachment["severity"]]
                li = "<li>"
                li += "<span style=\"vertical-align: middle; font-size: 200%;\">" + style["icon"] + "</span> "
                li += "<span style=\"vertical-align: middle; " + style["style"] + "\">"
                li += "<a href=\"file:///" + attachment["unc"] + "\">" + attachment["title"] + "</a> "
                li += "<a href=\"" + attachment["smb"] + "\">" + self.img_string + "=\" alt=\"" + attachment["smb"] + "\"/>" + "</a> "
                li += "</span>"
                li += "</li>"
                markdown_text.append(li)
            markdown_text.append("</ul>\n")

        # Footer with pipeline id, labels, format and overall status.
        label_string = "".join("[{}] ".format(label) for label in self.pipeline.labels)
        markdown_text.append("\n[{}] {} [{}] [status:{}]".format(self.pipeline.uid, label_string, self.pipeline.publication_format, status))
        markdown_text = "\n".join(markdown_text)

        # 2. parse string as Markdown and render as HTML
        if should_escape_chars:
            markdown_html = markdown.markdown(markdown_text, extensions=['markdown.extensions.fenced_code', 'markdown.extensions.codehilite'])
        else:
            markdown_html = markdown_text
        markdown_html = (
            "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"utf-8\"/>\n<title>"
            + escape_html(subject)
            + "</title>\n</head>\n<body>\n"
            + markdown_html
            + "\n</body>\n</html>\n"
        )

        if not should_email:
            logging.info("[e-mail] Not sending email")
        else:
            # 3. build e-mail
            msg = EmailMessage()
            msg['Subject'] = re.sub(r"\s", " ", subject).strip()
            msg['From'] = sender
            msg['To'] = Report.emailStringsToAddresses(recipients)
            msg.set_content(markdown_text)
            msg.add_alternative(markdown_html, subtype="html")
            logging.info("[e-mail] E-mail with subject '{}' will be sent to: {}".format(msg['Subject'], ", ".join(recipients)))

            # 4. send e-mail
            if smtp["host"] and smtp["port"]:
                smtp_server = "{}:{}".format(smtp["host"], smtp["port"])
                logging.info("[e-mail] SMTP server: {}".format(smtp_server))
                with smtplib.SMTP(smtp_server) as s:
                    s.ehlo()
                    # s.starttls()
                    if smtp["user"] and smtp["pass"]:
                        s.login(smtp["user"], smtp["pass"])
                    else:
                        logging.debug("[e-mail] user/pass not configured")
                    logging.debug("[e-mail] sending…")
                    s.send_message(msg)
                    logging.debug("[e-mail] sending complete.")
            else:
                logging.warning("[e-mail] host/port not configured")

        # Save a copy of the rendered e-mail. The temporary files are closed
        # (and thereby removed) as soon as the copy has been made.
        with tempfile.NamedTemporaryFile(suffix=".md") as temp_md_obj, \
                tempfile.NamedTemporaryFile(suffix=".html") as temp_html_obj:
            with open(temp_md_obj.name, "w") as f:
                f.write(markdown_text)
            logging.debug("[e-mail] markdown: {}".format(temp_md_obj.name))
            with open(temp_html_obj.name, "w") as f:
                f.write(markdown_html)
            logging.debug("[e-mail] html: {}".format(temp_html_obj.name))

            if should_attach_log is True:
                path_mail = os.path.join(self.reportDir(), "email.html")
            else:
                yesterday = (datetime.now() - timedelta(1)).strftime("%Y-%m-%d")
                path_mail = os.path.join(self.pipeline.dir_reports, "logs", "dagsrapporter", yesterday, self.pipeline.uid + ".html")
            shutil.copy(temp_html_obj.name, path_mail)
            self.mailpath = Filesystem.networkpath(path_mail)

    except AssertionError as e:
        logging.error("[e-mail] " + str(e))

    if not should_message_slack:
        logging.warning("Not sending message to slack")
    else:
        # 5. send message to Slack
        slack_colors = {"SUCCESS": "good", "WARN": "warning", "ERROR": "danger"}
        slack_attachments = [
            {
                "title_link": attachment["smb"],
                "title": attachment["title"],
                "fallback": attachment["title"],
                "color": slack_colors.get(attachment["severity"])
            }
            for attachment in attachments
        ]
        Slack.slack(text=subject, attachments=slack_attachments)
def on_book(self):
    """Convert the EPUB at ``self.book["source"]`` to DOCX and archive it.

    The EPUB is validated and copied to a temporary directory, the first
    spine document is run through the ``nordic-asciimath-epub.xsl``
    stylesheet, converted with ``/usr/bin/ebook-convert``, post-processed
    (styles, indents, list numbering) and finally handed to
    ``self.utils.filesystem.storeBook``.

    Returns:
        bool: True when the book was converted and stored, otherwise False.
        Every failure path except a failed XSLT step also sets
        ``self.utils.report.title`` to a failure title.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    # The title only decorates the report title, so a failed lookup is ignored.
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self._set_failure_title()
        return False

    identifier = epub.identifier()
    if not identifier:
        self.utils.report.error(
            self.book["name"]
            + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self._set_failure_title()
        return False

    # The language must be extracted from the EPUB; without it the converter
    # falls back to its own default language for the DOCX.
    language = ""
    try:
        language = epub.meta("dc:language")
    except Exception:
        pass

    # ---------- make a copy of the EPUB ----------

    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    # Epub is constructed with a report object everywhere else, so pass the
    # report here as well instead of the pipeline itself.
    temp_epub = Epub(self.utils.report, temp_epubdir)

    opf_path = temp_epub.opf_path()
    if not opf_path:
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        self._set_failure_title(epubTitle)
        return False
    opf_path = os.path.join(temp_epubdir, opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()
    # href of the manifest item referenced by the first spine entry
    html_file = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
    )
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self._set_failure_title(epubTitle)
        return False
    html_file = os.path.join(os.path.dirname(opf_path), html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        self._set_failure_title(epubTitle)
        return False

    temp_xml_file_obj = tempfile.NamedTemporaryFile()
    temp_xml_file = temp_xml_file_obj.name

    self.utils.report.info(
        "Konverterer fra ASCIIMath til norsk punktnotasjon…")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NLBpubToDocx.uid,
                                        "nordic-asciimath-epub.xsl"),
                source=html_file,
                target=temp_xml_file)
    if not xslt.success:
        return False
    shutil.copy(temp_xml_file, html_file)

    # ---------- convert the HTML file to DOCX ----------

    temp_docxdir_obj = tempfile.TemporaryDirectory()
    temp_docxdir = temp_docxdir_obj.name

    try:
        self.utils.report.info("Konverterer fra XHTML til DOCX...")
        command = [
            "/usr/bin/ebook-convert",
            html_file,
            os.path.join(temp_docxdir, identifier + "_calibre.docx"),
            "--chapter=/",
            "--chapter-mark=none",
            "--page-breaks-before=/",
            "--no-chapters-in-toc",
            "--toc-threshold=0",
            "--docx-page-size=a4",
            "--extra-css=" + os.path.join(Xslt.xslt_dir, self.uid, 'extra.css'),
            # NOTE: microsoft fonts must be installed:
            # sudo apt-get install ttf-mscorefonts-installer
            "--embed-font-family=Verdana",
            "--docx-page-margin-top=42",
            "--docx-page-margin-bottom=42",
            "--docx-page-margin-left=70",
            "--docx-page-margin-right=56",
        ]
        # Only pass --language when one was found; an empty-string argument
        # would reach ebook-convert as a stray positional argument.
        if language:
            command.append("--language=" + language)
        command += [
            "--base-font-size=13",
            "--font-size-mapping=13,13,13,13,13,13,13,13",
        ]
        process = self.utils.filesystem.run(command)

        if process.returncode == 0:
            self.utils.report.info("Boken ble konvertert.")
            self._postprocess_docx(temp_docxdir, identifier)
        else:
            self.utils.report.error(
                "En feil oppstod ved konvertering til DOCX for " + identifier)
            self.utils.report.debug(traceback.format_stack())
            self._set_failure_title(epubTitle)
            return False

    except subprocess.TimeoutExpired:
        self.utils.report.error(
            "Det tok for lang tid å konvertere " + identifier
            + " til DOCX, og Calibre-prosessen ble derfor stoppet.")
        self._set_failure_title(epubTitle)
        return False

    except Exception:
        self.utils.report.error(
            "En feil oppstod ved konvertering til DOCX for " + identifier)
        self.utils.report.info(traceback.format_exc(), preformatted=True)
        self._set_failure_title(epubTitle)
        return False

    # NOTE(review): temp_docxdir still holds the intermediate "_calibre.docx"
    # and the extracted "temp" tree at this point - confirm that storeBook is
    # meant to archive those as well.
    archived_path, _ = self.utils.filesystem.storeBook(temp_docxdir, identifier)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = (self.title + ": " + identifier
                               + " ble konvertert 👍😄" + epubTitle)
    return True


def _set_failure_title(self, epub_title=""):
    """Set the report title to the "failed" title for the current book."""
    self.utils.report.title = (self.title + ": " + self.book["name"]
                               + " feilet 😭👎" + epub_title)


def _postprocess_docx(self, temp_docxdir, identifier):
    """Restyle ``<identifier>_calibre.docx`` and save it as ``<identifier>.docx``.

    Both files live in *temp_docxdir*. Paragraph and heading styles are
    normalised, STATPED_DUMMYTEXT markers are removed, and the list
    numbering of the saved file is patched afterwards.
    """
    calibre_docx = os.path.join(temp_docxdir, identifier + "_calibre.docx")
    final_docx = os.path.join(temp_docxdir, identifier + ".docx")

    document = Document(calibre_docx)
    normalParagraph = "Normal"
    normalParagraphNoIndent = "NormalNoIndent"
    headingIndent = Cm(1.25)
    fontSize = Pt(13)
    indent = Cm(0.44)
    hangingIndentList = Cm(0.63)

    def delete_paragraph(paragraph):
        p = paragraph._element
        p.getparent().remove(p)
        p._p = p._element = None

    def delete_element(element):
        element.getparent().remove(element)
        element._element = None

    document.styles[normalParagraph].font.size = fontSize
    document.styles[normalParagraph].paragraph_format.first_line_indent = indent
    styleNoIndent = document.styles.add_style('NormalNoIndent',
                                              WD_STYLE_TYPE.PARAGRAPH)
    styleNoIndent.base_style = document.styles[normalParagraph]
    document.styles[normalParagraphNoIndent].paragraph_format.first_line_indent = Cm(0)

    # Delete empty text elements. This is a document-wide query, so it is run
    # once up front rather than once per paragraph.
    for emptyTextElement in document.element.xpath("//w:t[. = '']"):
        delete_element(emptyTextElement)

    # Set style to normal for regular paragraphs, set keep_with_next to false,
    # remove multiple empty paragraphs, and remove empty p after page nr or
    # heading.
    emptyParagraph = False
    for paragraph in document.paragraphs:
        paragraph.paragraph_format.keep_with_next = None
        if re.match("Para 0[1-9]|[0-9] Block|Para [0-9]",
                    paragraph.style.name
                    ) and paragraph.style.font.underline != True:
            paragraph.style = normalParagraph
        if (len(paragraph.text) <= 1
                or re.match(r"^--- \d+ til ", paragraph.text)
                or paragraph.style.name[0:7] == "Heading"):
            # empty p, page nr or heading: remove space at beginning of p
            paragraph.text = re.sub(r"^\s(.*)", r"\1", paragraph.text)
            if len(paragraph.text) == 0 and emptyParagraph:
                # the previous p was also empty, a page nr or a heading
                delete_paragraph(paragraph)
            emptyParagraph = True
        else:
            emptyParagraph = False
        if re.match(r"^\s*STATPED_DUMMYTEXT_LI_OL\s*$", paragraph.text):
            paragraph.text = ""

    # No indent after heading, page nr, or paragraphs starting with "Bilde: ",
    # paragraphs in only bold (text=^_[^_]*_$) and the paragraph after such a
    # paragraph, or on empty p.
    removeIndent = False
    for paragraph in document.paragraphs:
        # remove space at beginning of line after <br/>
        spaceAfterBreakList = paragraph._element.xpath(
            r'w:r/w:br[@w:clear="none"]/following::w:t[@xml:space="preserve"][1]'
        )
        for spaceAfterBreakElement in spaceAfterBreakList:
            text = spaceAfterBreakElement.text or ""
            if text.startswith(" ") and not spaceAfterBreakElement.xpath(
                    r'preceding-sibling::*[1][self::w:t]'):
                spaceAfterBreakElement.text = text[1:]

        # remove break before paragraph end
        breakBeforeParagraphEndList = paragraph._element.xpath(
            r'w:r[last()]/w:br[@w:clear="none" and not(following-sibling::*)]'
        )
        if breakBeforeParagraphEndList:
            delete_element(breakBeforeParagraphEndList[0])

        t = paragraph.text.strip()
        if re.match(
                r"^Bilde: |^Forklaring: |^--- \d+ til |^_[^_]*_$|^STATPED_DUMMYTEXT_LIST_UNSTYLED|^STATPED_DUMMYTEXT_P_BEFORE_DL",
                t) or ((removeIndent or len(t) == 0)
                       and paragraph.style.name == "Normal"):
            paragraph.style = normalParagraphNoIndent

        # remove dummy text and set hanging indent
        if re.match(
                r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL)",
                paragraph.text):
            paragraph.paragraph_format.left_indent = hangingIndentList
            paragraph.paragraph_format.first_line_indent = -hangingIndentList
        if re.match(r"^STATPED_DUMMYTEXT", paragraph.text):
            paragraph.text = re.sub(
                r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL|STATPED_DUMMYTEXT_P_BEFORE_DL)",
                "", paragraph.text)

        if (len(t) == 0 or paragraph.style.name[0:7] == "Heading"
                or re.match(r"^--- \d+ til |^_[^_]*_$", t)):
            removeIndent = True
        else:
            removeIndent = False

    # Remove bold from headings and collect the "Para " styles that are not
    # underlined.
    paraStylesWithoutUnderline = []
    for style in document.styles:
        if style.name[0:7] == "Heading":
            style.font.bold = None
            style.paragraph_format.left_indent = headingIndent
            style.paragraph_format.first_line_indent = -headingIndent
            style.paragraph_format.space_before = Pt(0)
            style.paragraph_format.space_after = Pt(0)
            spacing = style._element.xpath(r'w:pPr/w:spacing')[0]
            spacing.set(qn('w:beforeLines'), "0")
            spacing.set(qn('w:afterLines'), "0")
        if style.name[0:5] == "Para ":
            if style.font.underline != True:
                paraStylesWithoutUnderline.append(style.name)

    # Find every table paragraph that uses one of those styles and switch it
    # to the no-indent style. The matches are collected first and changed
    # afterwards.
    paraStylesInTables = []
    for paraStyleWithoutUnderline in paraStylesWithoutUnderline:
        paraStylesInTables.extend(
            document.element.xpath("//w:tbl//w:p//w:pStyle[@w:val = '"
                                   + paraStyleWithoutUnderline + "']"))
    for paraStyleInTables in paraStylesInTables:
        paraStyleInTables.set(qn('w:val'), normalParagraphNoIndent)

    document.save(final_docx)
    self.utils.report.info("Temp-fil ble lagret: " + final_docx)

    self._rewrite_list_numbering(final_docx, os.path.join(temp_docxdir, "temp"))


def _rewrite_list_numbering(self, docx_path, work_dir):
    """Patch ``word/numbering.xml`` inside *docx_path*, rewriting it in place.

    The archive is extracted to *work_dir*, three list indents are reduced,
    lower-letter list labels are changed from ``a.`` to ``a)``, and the
    extracted tree is zipped back over *docx_path*. No working directory
    change is needed because both paths are used as given.
    """
    with zipfile.ZipFile(docx_path) as archive:
        archive.extractall(work_dir)

    numbering_path = os.path.join(work_dir, "word", "numbering.xml")
    with open(numbering_path, "r", encoding="utf-8") as xml_file:
        xmlText = xml_file.read()

    xmlText = xmlText.replace('w:left="1152"', 'w:left="360"')
    xmlText = xmlText.replace('w:left="1512"', 'w:left="720"')
    xmlText = xmlText.replace('w:left="1872"', 'w:left="1080"')
    # a. as a) in lists
    xmlText = re.sub(
        r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%([1-9])\."/>',
        r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%\1)"/>',
        xmlText)

    with open(numbering_path, "w", encoding="utf-8") as xml_file:
        xml_file.write(xmlText)

    with zipfile.ZipFile(docx_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for root, _dirs, files in os.walk(work_dir):
            for file_name in files:
                full_path = os.path.join(root, file_name)
                archive.write(full_path,
                              arcname=os.path.relpath(full_path, work_dir))