def compile_schematron(schematron, cwd, report):
    """Compile a Schematron file to an XSLT stylesheet and return its path.

    The compiled result is kept in ``Schematron.cache`` (keyed by the
    schematron path, holding the temporary file object), so a schematron
    whose compiled file still exists on disk is not compiled again.

    Args:
        schematron: path to the Schematron file to compile.
        cwd: working directory handed to each XSLT run.
        report: report object used for debug and error messages.

    Returns:
        The path to the compiled stylesheet, or None when any of the
        compilation stages fails or an exception occurs.
    """
    with Schematron._cache_lock:
        cached = Schematron.cache[schematron] if schematron in Schematron.cache else None
        if cached and os.path.isfile(cached.name):
            return Schematron.cache[schematron].name

    # The three stages of the ISO Schematron skeleton, in execution order.
    stage_stylesheets = (
        "iso_dsdl_include.xsl",
        "iso_abstract_expand.xsl",
        "iso_svrl_for_xslt2.xsl",
    )

    try:
        # All temporary files are created up front; the objects are kept
        # referenced so that the files are not deleted while still in use.
        stage_outputs = [tempfile.NamedTemporaryFile() for _ in stage_stylesheets]

        stage_input = schematron
        for stylesheet_name, output_obj in zip(stage_stylesheets, stage_outputs):
            stage_target = output_obj.name
            report.debug("Compiling schematron ({} + {}): {}".format(stylesheet_name,
                                                                     os.path.basename(schematron),
                                                                     os.path.basename(stage_target)))
            stage = Xslt(report=report,
                         cwd=cwd,
                         stylesheet=os.path.join(Schematron.schematron_dir, stylesheet_name),
                         source=stage_input,
                         target=stage_target,
                         stdout_level="DEBUG",
                         stderr_level="DEBUG")
            if not stage.success:
                return None
            # The output of this stage is the input of the next one.
            stage_input = stage_target

        compiled_obj = stage_outputs[-1]
        with Schematron._cache_lock:
            Schematron.cache[schematron] = compiled_obj
            return Schematron.cache[schematron].name

    except Exception:
        report.debug(traceback.format_exc(), preformatted=True)
        report.error("An error occured while compiling the Schematron (" + str(schematron) + ")")
        return None
def on_book(self):
    """Adapt an EPUB for braille production and archive the resulting HTML file set.

    Works on a temporary copy of the book: runs ``prepare-for-braille.xsl``
    on the first spine item, renames that file after the ``dc:identifier``
    found in the transformed HTML, deletes the navigation document, audio
    files, SMIL files and the OPF, and finally stores the remaining
    directory through ``self.utils.filesystem.storeBook``.

    Returns:
        True when the book was converted and stored, False on every
        failure path.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # Deliberately best effort: the title only decorates the report title.
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        # Return False (previously a bare ``return``) so that this failure
        # path reports the same value as all the other ones.
        return False

    if not epub.identifier():
        self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    # ---------- make a copy of the EPUB ----------

    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    temp_epub = Epub(self.utils.report, temp_epubdir)

    # ---------- adapt the HTML file with XSLT ----------

    opf_path = temp_epub.opf_path()
    if not opf_path:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    opf_path = os.path.join(temp_epubdir, opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()

    # href of the manifest item referenced by the first spine item
    html_file = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
    )
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    html_file = os.path.join(os.path.dirname(opf_path), html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    temp_html_obj = tempfile.NamedTemporaryFile()
    temp_html = temp_html_obj.name

    self.utils.report.info("Tilpasser innhold for punktskrift...")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForBraille.uid, "prepare-for-braille.xsl"),
                source=html_file,
                target=temp_html)
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_html, html_file)

    # ---------- get the new book number from /html/head/meta[@name='dc:identifier'] and use it as filename ----------

    html_xml = ElementTree.parse(temp_html).getroot()
    result_identifier = html_xml.xpath("/*/*[local-name()='head']/*[@name='dc:identifier']")
    result_identifier = (result_identifier[0].attrib["content"]
                         if result_identifier and "content" in result_identifier[0].attrib
                         else None)
    if not result_identifier:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne boknummer i ny HTML-fil.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    # Rename the HTML file by round-tripping it through the temporary file.
    shutil.copy(html_file, temp_html)
    os.remove(html_file)
    # Use .html instead of .xhtml since the result is no longer an EPUB
    html_file = os.path.join(os.path.dirname(html_file), result_identifier + ".html")
    shutil.copy(temp_html, html_file)
    # TODO: insert HTML5 doctype: <!DOCTYPE html>

    # ---------- delete EPUB-specific files ----------

    items = opf_xml.xpath("/*/*[local-name()='manifest']/*")
    for item in items:
        delete = False

        # navigation document
        if "properties" in item.attrib and "nav" in re.split(r'\s+', item.attrib["properties"]):
            delete = True

        # audio and media overlays
        if "media-type" in item.attrib:
            if item.attrib["media-type"].startswith("audio/"):
                delete = True
            elif item.attrib["media-type"] == "application/smil+xml":
                delete = True

        if not delete or "href" not in item.attrib:
            continue

        fullpath = os.path.join(os.path.dirname(opf_path), item.attrib["href"])
        os.remove(fullpath)
    os.remove(opf_path)

    # ---------- store the HTML file set ----------

    html_dir = os.path.dirname(opf_path)

    self.utils.report.info("Boken ble konvertert. Kopierer til arkiv for punkt-klare HTML-filer.")

    archived_path, stored = self.utils.filesystem.storeBook(html_dir, self.book["name"])
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + self.book["name"] + " ble konvertert 👍😄" + epubTitle
    return True
def __init__(self, pipeline=None, schematron=None, source=None, report=None, cwd=None, attach_report=True):
    """Validate ``source`` against ``schematron`` and record the outcome in ``self.success``.

    The schematron is compiled (or fetched from the cache) through
    ``Schematron.compile_schematron``, the compiled stylesheet is run on
    ``source`` to produce an SVRL document, and every ``failed-assert`` or
    ``successful-report`` in that document counts as an error.

    Args:
        pipeline: pipeline whose report is used when ``report`` is not given.
        schematron: path (containing a "/") to an existing Schematron file.
        source: path (containing a "/") to the existing file to validate.
        report: report object; takes precedence over ``pipeline``.
        cwd: working directory for the XSLT runs; defaults to
            ``report.pipeline.dir_in``.
        attach_report: when true, an HTML version of the SVRL report is
            attached to the report.

    ``self.success`` is True only when the validation ran and found no
    errors. Exceptions are logged to the report and leave it False.
    """
    assert pipeline or report and (report.pipeline or cwd)
    assert schematron and "/" in schematron and os.path.isfile(schematron)
    assert source and "/" in source and os.path.isfile(source)

    if not report:
        report = pipeline.utils.report

    if not cwd:
        assert report.pipeline.dir_in is not None, (
            "Schematron: for pipelines with no input directory, " +
            "the current working directory needs to be explicitly set."
        )
        cwd = report.pipeline.dir_in

    self.success = False

    svrl_ns = "{http://purl.oclc.org/dsdl/svrl}"
    active_pattern_tag = svrl_ns + "active-pattern"
    error_tags = (svrl_ns + "failed-assert", svrl_ns + "successful-report")

    try:
        compiled_schematron = Schematron.compile_schematron(schematron, cwd, report)
        if not compiled_schematron:
            return

        temp_xml_report_obj = tempfile.NamedTemporaryFile()
        temp_xml_report = temp_xml_report_obj.name

        report.debug("Validating against compiled Schematron ({} + {}): {}".format(
            "iso_svrl_for_xslt2.xsl",
            os.path.basename(source),
            os.path.basename(temp_xml_report)))
        xslt = Xslt(report=report,
                    cwd=cwd,
                    stylesheet=compiled_schematron,
                    source=source,
                    target=temp_xml_report,
                    stdout_level="DEBUG",
                    stderr_level="DEBUG")
        if not xslt.success:
            return

        # Count number of errors
        svrl_schematron_output = ElementTree.parse(temp_xml_report).getroot()
        errors = svrl_schematron_output.findall(error_tags[0])
        errors.extend(svrl_schematron_output.findall(error_tags[1]))
        if not errors:
            self.success = True
        else:
            max_errors = 20  # only this many are reported at error level
            e = 0
            pattern_title = None
            # Iterate over the children directly; Element.getchildren() is deprecated.
            for element in svrl_schematron_output:
                if element.tag == active_pattern_tag:
                    # Remember the pattern so the following errors can be prefixed with it.
                    pattern_title = element.attrib.get("name")
                    continue

                if element.tag in error_tags:
                    location = element.attrib.get("location")
                    test = element.attrib.get("test")
                    text = element.find(svrl_ns + "text")
                    text = text.text if text is not None and text.text else "(missing description)"

                    prefix = pattern_title + ": " if pattern_title else ""
                    if e < max_errors:
                        report.error(prefix + text)
                    # Every error is logged in full at debug level.
                    report.debug(prefix + text +
                                 (" ({})".format(location) if location else "") +
                                 (" ({})".format(test) if test else ""))
                    e += 1

        # Create HTML report
        if temp_xml_report and "/" in temp_xml_report:
            html_report_obj = tempfile.NamedTemporaryFile()
            html_report = html_report_obj.name

            report.debug("Creating HTML report for Schematron validation ({} + {}): {}".format(
                "iso_svrl_for_xslt2.xsl",
                os.path.basename(temp_xml_report),
                os.path.basename(html_report)))
            xslt = Xslt(report=report,
                        cwd=cwd,
                        stylesheet=os.path.join(Xslt.xslt_dir, Schematron.uid, "svrl-to-html.xsl"),
                        source=temp_xml_report,
                        target=html_report)
            if not xslt.success:
                return

            if attach_report:
                schematron_report_dir = os.path.join(report.reportDir(), "schematron")
                os.makedirs(schematron_report_dir, exist_ok=True)
                name = ".".join(os.path.basename(schematron).split(".")[:-1])
                available_path = os.path.join(schematron_report_dir, "{}.html".format(name))
                if os.path.exists(available_path):
                    # Try "<name>-2.html" … "<name>-99999.html" until a free name is found.
                    for i in range(2, 100000):
                        available_path = os.path.join(schematron_report_dir, "{}-{}.html".format(name, i))
                        if not os.path.exists(available_path):
                            break
                if os.path.exists(available_path):
                    report.warn("Klarte ikke å finne et tilgjengelig filnavn for rapporten")
                else:
                    report.debug("Lagrer rapport som {}".format(available_path))
                    with open(html_report, 'r') as result_report:
                        report.attachment(result_report.readlines(),
                                          available_path,
                                          "SUCCESS" if self.success else "ERROR")

    except Exception:
        report.debug(traceback.format_exc(), preformatted=True)
        report.error("An error occured while running the Schematron (" + str(schematron) + ")")
def __init__(self, pipeline=None, reference=None, source=None, report=None, attach_report=True):
    """Compare the markup of one or more files with a reference file.

    A markup overview is generated for the reference and for each source
    file, the two are compared with ``sammenlign-markupoversikter.xsl``,
    and every element with the class ``error`` in the comparison report
    counts as an error.

    Args:
        pipeline: pipeline whose report is used when ``report`` is not given.
        reference: path (containing a "/") to an existing reference file.
        source: one path, or a list of paths, to existing files to compare.
        report: report object; takes precedence over ``pipeline``.
        attach_report: when true, each comparison report is attached to
            the report.

    ``self.success`` is True only when at least one source file was
    compared and none of the comparisons contained errors. A failing XSLT
    run or an exception leaves it False.
    """
    assert pipeline or report, "either a pipeline or a report must be specified"
    assert reference and "/" in reference and os.path.isfile(reference), "reference must point to a reference file"
    assert (isinstance(source, str) and "/" in source and os.path.isfile(source) or
            isinstance(source, list) and False not in ["/" in s and os.path.isfile(s) for s in source]
            ), "source must refer to one or more absolute file paths"

    if not report:
        report = pipeline.utils.report

    if isinstance(source, str):
        source = [source]

    self.success = False

    temp_referanseoversikt_obj = tempfile.NamedTemporaryFile()
    temp_referanseoversikt = temp_referanseoversikt_obj.name

    report.debug("Lager referanseoversikt")
    xslt = Xslt(report=report,
                stylesheet=os.path.join(Xslt.xslt_dir, CompareWithReference.uid, "generer-markupoversikt.xsl"),
                source=reference,
                target=temp_referanseoversikt,
                stdout_level="INFO",
                stderr_level="INFO")
    if not xslt.success:
        return

    try:
        # Aggregated in a local and only published after the loop. The
        # previous ``self.success = self.success and this_success`` started
        # from False and could therefore never become True.
        all_sources_ok = True

        for source_file in source:
            this_success = False

            temp_oversikt_obj = tempfile.NamedTemporaryFile()
            temp_oversikt = temp_oversikt_obj.name

            temp_rapport_obj = tempfile.NamedTemporaryFile()
            temp_rapport = temp_rapport_obj.name

            report.debug("Lager oversikt")
            xslt = Xslt(report=report,
                        stylesheet=os.path.join(Xslt.xslt_dir, CompareWithReference.uid, "generer-markupoversikt.xsl"),
                        source=source_file,
                        target=temp_oversikt,
                        stdout_level="INFO",
                        stderr_level="INFO")
            if not xslt.success:
                return

            report.debug("Sammenligner oversikt med referanseoversikt")
            xslt = Xslt(report=report,
                        stylesheet=os.path.join(Xslt.xslt_dir, CompareWithReference.uid, "sammenlign-markupoversikter.xsl"),
                        template="start",
                        stdout_level="INFO",
                        stderr_level="INFO",
                        parameters={
                            "filA": "file://" + temp_oversikt,
                            "filB": "file://" + temp_referanseoversikt,
                            "rapport": "file://" + temp_rapport
                        })
            if not xslt.success:
                return

            # Count number of errors
            ns = {
                "html": "http://www.w3.org/1999/xhtml",
                "re": "http://exslt.org/regular-expressions"
            }
            # Debugging aid only: a failure to write the copy must not
            # abort the comparison itself.
            try:
                shutil.copy(temp_rapport, "/tmp/rapport.xhtml")
            except OSError:
                report.debug(traceback.format_exc(), preformatted=True)
            report_document = ElementTree.parse(temp_rapport).getroot()
            # Raw string so that \s reaches the regex engine without being
            # treated as an (invalid) Python escape sequence.
            errors = report_document.xpath(r"//*[re:match(@class,'(^|\s)error(\s|$)')]", namespaces=ns)
            if not errors:
                this_success = True
            all_sources_ok = all_sources_ok and this_success

            # Create HTML report
            if temp_rapport and "/" in temp_rapport and attach_report:
                compare_with_reference_report_dir = os.path.join(report.reportDir(), "compare-with-reference")
                os.makedirs(compare_with_reference_report_dir, exist_ok=True)
                name = ".".join(os.path.basename(reference).split(".")[:-1])
                available_path = os.path.join(compare_with_reference_report_dir, "{}.html".format(name))
                if os.path.exists(available_path):
                    # Try "<name>-2.html" … "<name>-99999.html" until a free name is found.
                    for i in range(2, 100000):
                        available_path = os.path.join(compare_with_reference_report_dir,
                                                      "{}-{}.html".format(name, i))
                        if not os.path.exists(available_path):
                            break
                if os.path.exists(available_path):
                    report.warn("Klarte ikke å finne et tilgjengelig filnavn for rapporten")
                else:
                    report.debug("Lagrer rapport som {}".format(available_path))
                    with open(temp_rapport, 'r') as result_report:
                        report.attachment(result_report.readlines(),
                                          available_path,
                                          "SUCCESS" if this_success else "ERROR")

        # An empty list of sources is not a successful comparison.
        self.success = all_sources_ok and bool(source)

    except Exception:
        report.debug(traceback.format_exc(), preformatted=True)
        report.error("An error occured while running the CompareWithReference (" + str(reference) + ")")
def on_book(self):
    """Prepare an EPUB for DOCX production and archive the adapted copy.

    A temporary copy of the book is made, ``prepare-for-docx.xsl`` is run
    on the first spine item (replacing it in place), and the copy is
    stored through ``self.utils.filesystem.storeBook`` under the
    identifier of the EPUB.

    Returns:
        True when the book was converted and stored, otherwise False.
    """
    failed = " feilet 😭👎"

    def set_title(subject, outcome):
        # Every report title has the form "<pipeline title>: <subject><outcome>".
        self.utils.report.title = self.title + ": " + subject + outcome

    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    # The book title is optional decoration for the report title.
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        epubTitle = ""

    # check that this is an EPUB
    if not epub.isepub():
        set_title(self.book["name"], failed)
        return False

    if not epub.identifier():
        self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        set_title(self.book["name"], failed)
        return False

    # ---------- make a copy of the EPUB ----------

    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    # NOTE(review): this call passes the pipeline itself while the call
    # above passes the report - confirm which one Epub expects.
    temp_epub = Epub(self, temp_epubdir)

    # ---------- adapt the HTML file with XSLT ----------

    relative_opf_path = temp_epub.opf_path()
    if not relative_opf_path:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        set_title(self.book["name"], failed + epubTitle)
        return False
    opf_path = os.path.join(temp_epubdir, relative_opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()

    # href of the manifest item referenced by the first spine item
    spine_hrefs = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
    )
    html_file = spine_hrefs[0] if spine_hrefs else None
    if not html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        set_title(self.book["name"], failed + epubTitle)
        return False

    html_dir = os.path.dirname(opf_path)
    html_file = os.path.join(html_dir, html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        set_title(self.book["name"], failed + epubTitle)
        return False

    temp_html_obj = tempfile.NamedTemporaryFile()
    temp_html = temp_html_obj.name

    stylesheet = os.path.join(Xslt.xslt_dir, PrepareForDocx.uid, "prepare-for-docx.xsl")
    xslt = Xslt(self, stylesheet=stylesheet, source=html_file, target=temp_html)
    if not xslt.success:
        set_title(epub.identifier(), failed + epubTitle)
        return False
    # Replace the original HTML with the transformed version.
    shutil.copy(temp_html, html_file)

    archived_path, stored = self.utils.filesystem.storeBook(temp_epubdir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    set_title(epub.identifier(), " ble konvertert 👍😄" + epubTitle)
    return True
def on_book(self):
    """Produce one DTBook per Schibsted newspaper from a dated article directory.

    The name of the book must be an ISO date (YYYY-MM-DD). For each
    newspaper that has article files in the source directory, the
    articles are joined with ``schibsted-join.xsl``, converted with
    ``schibsted-to-dtbook.xsl`` and stored as
    ``<newspaper id><YYMMDD>`` with the file extension ``xml``.

    Returns:
        True when all newspapers with articles were produced, False when
        the directory name is invalid or a transformation fails.
    """
    report = self.utils.report
    date_iso = self.book["name"]

    if re.match(r"^\d\d\d\d-\d\d-\d\d$", date_iso) is None:
        report.error("Ugyldig mappenavn: {}".format(self.book["name"]))
        return False
    # YYYY-MM-DD => YYMMDD
    date_numbers = re.sub(r"^\d\d(\d\d)-(\d\d)-(\d\d)$", r"\1\2\3", date_iso)

    # Keyed by the filename prefix of the articles of each newspaper.
    newspapers = {
        "Aftenposten": {
            "id": "611823",
            "title": "Aftenposten"
        },
        "Bergens_Tidende": {
            "id": "618720",
            "title": "Bergens Tidende"
        },
        "Faedrelandsvennen": {
            "id": "618363",
            "title": "Fædrelandsvennen"
        },
        "Stavanger_Aftenblad": {
            "id": "618360",
            "title": "Stavanger Aftenblad"
        }
    }

    # Collect the article files of each newspaper as a comma-terminated list.
    source_files = os.listdir(self.book["source"])
    for prefix, details in newspapers.items():
        details["files"] = "".join(name + "," for name in source_files if name.startswith(prefix))

    # Use xslt to transform to correct dc:identifier
    for paper, details in newspapers.items():
        if not details["files"]:
            report.info("Ingen artikler for {} i dag. {} blir ikke produsert for {}."
                        .format(details["title"], details["title"], date_iso))
            continue

        report.info("Henter feed for " + paper)
        temp_xml_obj = tempfile.NamedTemporaryFile()
        if os.path.isdir("/tmp"):
            # for easier debugging
            temp_xml = "/tmp/{}_{}_joined.xml".format(date_iso, paper)
        else:
            temp_xml = temp_xml_obj.name

        report.info("Setter sammen feed for " + paper)
        join_step = Xslt(self,
                         parameters={
                             "files": details["files"],
                             "basepath": self.book["source"]
                         },
                         stylesheet=os.path.join(Xslt.xslt_dir, self.uid, "schibsted-join.xsl"),
                         template="main",
                         target=temp_xml)
        if not join_step.success:
            report.error("Transformering av html for {} med xslt feilet".format(paper))
            report.title = self.title + ": feilet 😭👎"
            return False

        temp_dtbook_obj = tempfile.NamedTemporaryFile()
        temp_dtbook = temp_dtbook_obj.name

        report.info("Lager dtbook med xslt for " + paper)
        dtbook_step = Xslt(self,
                           parameters={
                               "identifier": details["id"],
                               "title": details["title"],
                               "date": date_iso
                           },
                           stylesheet=os.path.join(Xslt.xslt_dir, self.uid, "schibsted-to-dtbook.xsl"),
                           source=temp_xml,
                           target=temp_dtbook)
        if not dtbook_step.success:
            report.error("Transformering av html for {} med xslt feilet".format(paper))
            report.title = self.title + ": feilet 😭👎"
            return False

        archived_path, stored = self.utils.filesystem.storeBook(temp_dtbook,
                                                                details["id"] + date_numbers,
                                                                file_extension="xml")
        report.attachment(None, archived_path, "DEBUG")

    report.info("Dagens Schibsted-aviser er ferdig produsert")
    report.title = self.title + ": " + " dagens Schibsted-aviser ble produsert 👍😄"
    return True
def on_book(self):
    """Produce the monthly braille newsletter with DAISY Pipeline 2 and archive it.

    Runs the ``nlb:catalog-month`` script for ``self.year_month``, moves
    the resulting XHTML file into a directory named after
    ``self.newsletter_identifier`` (renamed to ``<identifier>.html``),
    rewrites its identifier with ``newsletter-id.xsl`` and stores that
    directory.

    Returns:
        True when the newsletter was produced and stored, otherwise False.
    """
    report = self.utils.report
    identifier = self.newsletter_identifier

    report.info("Lager nyhetsbrev i punktskrift med Pipeline 2")
    job_options = {
        "month": self.year_month,
        "make-email": "false"
    }
    with DaisyPipelineJob(self,
                          "nlb:catalog-month",
                          job_options,
                          priority="low",
                          pipeline_and_script_version=[
                              ("1.11.1-SNAPSHOT", None),
                          ],
                          ) as dp2_job_newsletter:

        # Nothing to pick up unless the job succeeded.
        if dp2_job_newsletter.status != "SUCCESS":
            report.info("Klarte ikke å konvertere boken")
            report.title = self.title + " feilet 😭👎"
            return False

        # The first XHTML file in the output directory is the newsletter.
        output_dir = os.path.join(dp2_job_newsletter.dir_output, "output-dir")
        newsletter = next((name for name in os.listdir(output_dir) if name.endswith(".xhtml")), None)
        if not newsletter:
            report.error("Could not find html")
            return False

        # Move the file to <output-dir>/<identifier>/<identifier>.html
        book_dir = os.path.join(output_dir, identifier)
        os.mkdir(book_dir)
        html_file = os.path.join(book_dir, identifier + ".html")
        os.rename(os.path.join(output_dir, newsletter), html_file)

        temp_html_obj = tempfile.NamedTemporaryFile()
        temp_html = temp_html_obj.name

        # Use xslt to transform to correct dc:identifier
        xslt = Xslt(self,
                    parameters={"identifier": identifier},
                    stylesheet=os.path.join(Xslt.xslt_dir, self.uid, "newsletter-id.xsl"),
                    source=html_file,
                    target=temp_html)
        if not xslt.success:
            report.error("Transformering av html med xslt feilet")
            report.title = self.title + ": " + identifier + " feilet 😭👎"
            return False
        shutil.copy(temp_html, html_file)

        archived_path, stored = self.utils.filesystem.storeBook(os.path.dirname(html_file), identifier)
        report.attachment(None, archived_path, "DEBUG")
        report.info("Nyhetsbrev punktskrift ble produsert i pipeline2")
        report.title = self.title + ": " + identifier + " ble produsert 👍😄"
        return True
def on_book(self):
    """Convert a Nordic DTBook to a Nordic EPUB 3 and archive the result.

    Steps, each of which aborts with False on failure:

    1. copy the DTBook to a temporary directory, locate the DTBook XML
       and rename all files to lower case,
    2. clean up the DTBook with ``nordic-cleanup-dtbook.xsl``,
    3. validate it with the ``nordic-dtbook-validate`` Pipeline 2 script
       (a fixed set of known validation messages is ignored),
    4. convert DTBook to HTML (``nordic-dtbook-to-html``) and clean up the
       HTML with ``nordic-cleanup-html.xsl``,
    5. convert HTML to EPUB 3 (``nordic-html-to-epub3``) and validate the
       EPUB (``nordic-epub3-validate``),
    6. store the EPUB under the numeric identifier of the book.

    Returns:
        True when the book was converted and stored, otherwise False.
    """
    # Pipeline 2 engine / Nordic script version pairs accepted for every job.
    nordic_versions = [
        ("1.13.6", "1.4.6"),
        ("1.13.4", "1.4.5"),
        ("1.12.1", "1.4.2"),
        ("1.11.1-SNAPSHOT", "1.3.0"),
    ]

    def normalized_text(error):
        """Return the text content of a report element with whitespace collapsed."""
        error_text = " ".join([e.strip() for e in error.xpath('.//text()')]).strip()
        return " ".join(error_text.split()).strip() if bool(error_text) else error_text

    def has_allowed_image_signature(error):
        """Return True when an "Incorrect file signature" error concerns an accepted JPEG variant."""
        magic_number = error.xpath('*[@class="message-details"]/*[last()]/*[last()]/text()')[0]
        magic_number = " ".join(magic_number.split()).strip() if bool(magic_number) else magic_number

        # JFIF already allowed: 0xFF 0xD8 0xFF 0xE0 0x?? 0x?? 0x4A 0x46 0x49 0x46
        if magic_number.startswith("0xFF 0xD8 0xFF 0xDB"):  # Also allow JPEG RAW
            return True
        # Also allow EXIF
        return magic_number[:19] == "0xFF 0xD8 0xFF 0xE1" and magic_number[30:] == "0x45 0x78 0x69 0x66"

    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    metadata = Metadata.get_metadata_from_book(self.utils.report, self.book["source"])
    # Only the digits of the identifier are used as the book number.
    metadata["identifier"] = re.sub(r"[^\d]", "", metadata["identifier"])
    if not metadata["identifier"]:
        self.utils.report.error("Klarte ikke å bestemme boknummer for {}".format(self.book["name"]))
        return False
    if metadata["identifier"] != self.book["name"]:
        self.utils.report.info("Boknummer for {} er: {}".format(self.book["name"], metadata["identifier"]))

    self.utils.report.info("Lager en kopi av DTBoken")
    temp_dtbookdir_obj = tempfile.TemporaryDirectory()
    temp_dtbookdir = temp_dtbookdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_dtbookdir)

    # find DTBook XML
    dtbook = None
    for root, dirs, files in os.walk(temp_dtbookdir):
        for f in files:
            if f.endswith(".xml"):
                xml = ElementTree.parse(os.path.join(root, f)).getroot()
                if xml.xpath("namespace-uri()") == "http://www.daisy.org/z3986/2005/dtbook/":
                    dtbook = os.path.join(root, f)
                    break
        if dtbook is not None:
            break
    if not dtbook:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne DTBook")
        return False

    # rename all files to lower case
    for root, dirs, files in os.walk(temp_dtbookdir):
        for f in files:
            if not f.lower() == f:
                self.utils.report.warn("renaming to lowercase: {}".format(f))
                original_path = os.path.join(root, f)
                lowercase_path = os.path.join(root, f.lower())
                shutil.move(original_path, lowercase_path)
                if original_path == dtbook:
                    # Keep pointing at the DTBook after it has been renamed;
                    # otherwise the path used below would no longer exist.
                    dtbook = lowercase_path

    temp_dtbook_file_obj = tempfile.NamedTemporaryFile()
    temp_dtbook_file = temp_dtbook_file_obj.name

    self.utils.report.info("Rydder opp i nordisk DTBook")
    xslt = Xslt(self,
                stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir, NordicDTBookToEpub.uid, "nordic-cleanup-dtbook.xsl"),
                source=dtbook,
                target=temp_dtbook_file)
    if not xslt.success:
        return False
    shutil.copy(temp_dtbook_file, dtbook)

    self.utils.report.info("Validerer Nordisk DTBook...")

    # create context for Pipeline 2 job
    dtbook_dir = os.path.dirname(dtbook)
    dtbook_context = {}
    for root, dirs, files in os.walk(dtbook_dir):
        for file in files:
            fullpath = os.path.join(root, file)
            relpath = os.path.relpath(fullpath, dtbook_dir)
            dtbook_context[relpath] = fullpath

    # Validation messages that are known and accepted for DTBooks.
    ignored_dtbook_prefixes = (
        "[tpb124]",
        "[tpb43]",
        "[tpb10] Meta dc:Publisher",
        "[tpb10] Meta dc:Date",
        "[opf3g]",
    )
    ignored_dtbook_fragments = (
        'element "h1" not allowed here',
        'element "h2" not allowed here',
        'element "h3" not allowed here',
        'element "h4" not allowed here',
        'element "h5" not allowed here',
        'element "h6" not allowed here',
        'token "toc-brief" invalid',
    )

    with DaisyPipelineJob(self,
                          "nordic-dtbook-validate",
                          {"dtbook": os.path.basename(dtbook), "no-legacy": "false"},
                          pipeline_and_script_version=list(nordic_versions),
                          context=dtbook_context
                          ) as dp2_job_dtbook_validate:
        dtbook_validate_status = None
        if dp2_job_dtbook_validate.status == "SUCCESS":
            dtbook_validate_status = "SUCCESS"
        elif dp2_job_dtbook_validate.status in ["VALIDATION_FAIL", "FAIL"]:
            # Downgraded to a warning unless a non-ignored error is found below.
            dtbook_validate_status = "WARN"
        else:
            dtbook_validate_status = "ERROR"

        report_file = os.path.join(dp2_job_dtbook_validate.dir_output, "html-report/report.xhtml")

        if dtbook_validate_status == "WARN":
            report_doc = ElementTree.parse(report_file)
            errors = report_doc.xpath('//*[@class="error" or @class="message-error"]')
            for error in errors:
                error_text = normalized_text(error)

                if bool(error_text) and (error_text.startswith(ignored_dtbook_prefixes) or
                                         any(fragment in error_text for fragment in ignored_dtbook_fragments)):
                    continue  # ignore these error messages

                if error_text.startswith("Incorrect file signature") and has_allowed_image_signature(error):
                    continue

                dtbook_validate_status = "ERROR"
                self.utils.report.error(error_text)

        # get conversion report
        if os.path.isfile(report_file):
            with open(report_file, 'r') as result_report:
                self.utils.report.attachment(result_report.readlines(),
                                             os.path.join(self.utils.report.reportDir(), "report-dtbook.html"),
                                             dtbook_validate_status)

        if dtbook_validate_status == "ERROR":
            self.utils.report.error("Klarte ikke å validere boken")
            return False

        if dtbook_validate_status == "WARN":
            self.utils.report.warn("DTBoken er ikke valid, men vi fortsetter alikevel.")

    self.utils.report.info("Konverterer fra Nordisk DTBook til Nordisk HTML...")
    temp_htmldir_obj = tempfile.TemporaryDirectory()
    temp_htmldir = temp_htmldir_obj.name
    temp_htmlfile = None
    with DaisyPipelineJob(self,
                          "nordic-dtbook-to-html",
                          {"dtbook": os.path.basename(dtbook), "fail-on-error": "false", "no-legacy": "false"},
                          pipeline_and_script_version=list(nordic_versions),
                          context=dtbook_context
                          ) as dp2_job_dtbook_to_html:
        convert_status = "SUCCESS" if dp2_job_dtbook_to_html.status == "SUCCESS" else "ERROR"

        convert_report_file = os.path.join(dp2_job_dtbook_to_html.dir_output, "html-report/report.xhtml")

        if convert_status != "SUCCESS":
            self.utils.report.error("Klarte ikke å konvertere boken fra DTBook til HTML")

            # get conversion report
            if os.path.isfile(convert_report_file):
                with open(convert_report_file, 'r') as result_report:
                    self.utils.report.attachment(result_report.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "report-dtbook-to-html.html"),
                                                 convert_status)

            return False

        dp2_html_dir = os.path.join(dp2_job_dtbook_to_html.dir_output, "output-dir")

        if not os.path.isdir(dp2_html_dir):
            self.utils.report.error("Finner ikke 'output-dir' for den konverterte boken: {}".format(dp2_html_dir))
            return False

        # Copy the result out of the job directory while the job is still open.
        Filesystem.copy(self.utils.report, dp2_html_dir, temp_htmldir)
        temp_htmlfile = os.path.join(temp_htmldir, metadata["identifier"] + ".xhtml")

    if not os.path.isfile(temp_htmlfile):
        self.utils.report.error("Finner ikke den konverterte boken: {}".format(temp_htmlfile))
        self.utils.report.info("Kanskje filnavnet er forskjellig fra IDen?")
        return False

    self.utils.report.info("Rydder opp i nordisk HTML")
    temp_html_xslt_output_obj = tempfile.NamedTemporaryFile()
    temp_html_xslt_output = temp_html_xslt_output_obj.name
    xslt = Xslt(self,
                stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir, NordicDTBookToEpub.uid, "nordic-cleanup-html.xsl"),
                source=temp_htmlfile,
                target=temp_html_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_html_xslt_output, temp_htmlfile)

    self.utils.report.info("Konverterer fra Nordisk HTML til Nordisk EPUB3...")

    # create context for Pipeline 2 job
    html_dir = os.path.dirname(temp_htmlfile)
    html_context = {}
    for root, dirs, files in os.walk(html_dir):
        for file in files:
            fullpath = os.path.join(root, file)
            relpath = os.path.relpath(fullpath, html_dir)
            html_context[relpath] = fullpath

    temp_epub_file_obj = tempfile.NamedTemporaryFile()
    temp_epub_file = temp_epub_file_obj.name
    with DaisyPipelineJob(self,
                          "nordic-html-to-epub3",
                          {"html": os.path.basename(temp_htmlfile), "fail-on-error": "false"},
                          pipeline_and_script_version=list(nordic_versions),
                          context=html_context
                          ) as dp2_job_html_to_epub:
        convert_status = "SUCCESS" if dp2_job_html_to_epub.status == "SUCCESS" else "ERROR"

        convert_report_file = os.path.join(dp2_job_html_to_epub.dir_output, "html-report/report.xhtml")

        if convert_status != "SUCCESS":
            self.utils.report.error("Klarte ikke å konvertere boken")

            # get conversion report
            if os.path.isfile(convert_report_file):
                with open(convert_report_file, 'r') as result_report:
                    self.utils.report.attachment(result_report.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "report-html-to-epub3.html"),
                                                 convert_status)

            return False

        dp2_epub_file = os.path.join(dp2_job_html_to_epub.dir_output, "output-dir", metadata["identifier"] + ".epub")

        if not os.path.isfile(dp2_epub_file):
            self.utils.report.error("Finner ikke den konverterte boken: {}".format(dp2_epub_file))
            self.utils.report.info("Kanskje filnavnet er forskjellig fra IDen?")
            return False

        self.utils.report.info("Validerer Nordisk EPUB 3...")
        # dp2_epub_file is already a plain path string (built with
        # os.path.join), so it is used directly; calling .asFile() on it
        # raised AttributeError.
        epub_file = dp2_epub_file
        # The validation job and the copy below stay inside this block
        # because they read the EPUB from the output directory of the
        # conversion job (presumably only available while the job is open).
        with DaisyPipelineJob(self,
                              "nordic-epub3-validate",
                              {"epub": os.path.basename(epub_file)},
                              pipeline_and_script_version=list(nordic_versions),
                              context={os.path.basename(epub_file): epub_file}
                              ) as dp2_job_epub_validate:
            epub_validate_status = "SUCCESS" if dp2_job_epub_validate.status == "SUCCESS" else "ERROR"

            report_file = os.path.join(dp2_job_epub_validate.dir_output, "html-report/report.xhtml")

            if epub_validate_status == "ERROR":

                # attach intermediary file from conversion
                with open(temp_htmlfile, 'r') as intermediary_htmlfile:
                    self.utils.report.attachment(intermediary_htmlfile.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "intermediary-html.html"),
                                                 "DEBUG")

                # Downgraded to a warning unless a non-ignored error is found below.
                epub_validate_status = "WARN"
                report_doc = ElementTree.parse(report_file)
                errors = report_doc.xpath('//*[@class="error" or @class="message-error"]')
                for error in errors:
                    error_text = normalized_text(error)

                    if bool(error_text) and (error_text.startswith("[nordic280]") or
                                             "PKG-021: Corrupted image file encountered." in error_text):
                        continue  # ignore these error messages
                    self.utils.report.warn("Not ignoring: {}".format(error_text))

                    if error_text.startswith("Incorrect file signature") and has_allowed_image_signature(error):
                        continue

                    epub_validate_status = "ERROR"
                    self.utils.report.error(error_text)

            # get conversion report
            if os.path.isfile(report_file):
                with open(report_file, 'r') as result_report:
                    self.utils.report.attachment(result_report.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "report-epub3.html"),
                                                 epub_validate_status)

            if epub_validate_status == "ERROR":
                self.utils.report.error("Klarte ikke å validere EPUB 3-versjonen av boken")
                return False

        Filesystem.copy(self.utils.report, dp2_epub_file, temp_epub_file)

    epub = Epub(self.utils.report, temp_epub_file)
    if not epub.isepub():
        return False

    self.utils.report.info("Boken ble konvertert. Kopierer til EPUB3-fra-DTBook-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(epub.asDir(), metadata["identifier"], overwrite=self.overwrite)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = "{}: {} ble konvertert 👍😄 ({})".format(self.title, metadata["identifier"], metadata["title"])
    return True
class Relaxng():
    """Validate an XML document against a RELAX NG schema using jing.

    Constructing an instance runs the validation. Afterwards ``success`` is
    True when the jing process exited with return code 0, otherwise False.
    When ``attach_report`` is true, the output from jing is wrapped in a small
    HTML page and attached to the report.
    """

    uid = "core-utils-relaxng"

    # jing is located through the shared Xslt environment
    # (presumably Xslt.init_environment() populates Xslt.jing_jar - confirm).
    if Xslt.jing_jar is None:
        Xslt.init_environment()
    relaxng_dir = os.path.join(Xslt.xslt_dir, uid)

    def __init__(self, pipeline=None, relaxng=None, source=None, report=None, cwd=None, attach_report=True):
        """Run jing on `source` using the schema `relaxng`.

        Args:
            pipeline: The pipeline requesting the validation. May be omitted
                when both `report` and `cwd` are given.
            relaxng: Path to the RELAX NG schema (must contain a "/" and exist).
            source: Path to the XML document (must contain a "/" and exist).
            report: Report to log to; defaults to `pipeline.utils.report`.
            cwd: Working directory; defaults to `report.pipeline.dir_in`.
                NOTE(review): the value is resolved but not passed on to the
                jing process - confirm whether that is intended.
            attach_report: Whether to attach the HTML report to `report`.
        """
        # Local import: the top-of-file imports are not visible from here, and
        # a local name also avoids clashing with the module name `html`.
        from html import escape

        assert pipeline or report and cwd
        assert relaxng and "/" in relaxng and os.path.isfile(relaxng)
        assert source and "/" in source and os.path.isfile(source)

        if not report:
            report = pipeline.utils.report

        if not cwd:
            assert report.pipeline.dir_in is not None, (
                "RelaxNG: for pipelines with no input directory, " +
                "the current working directory needs to be explicitly set.")
            cwd = report.pipeline.dir_in

        self.success = False

        if Xslt.jing_jar is None:
            # jing is not available; nothing can be validated
            return

        # The constructor explicitly allows `pipeline` to be omitted when a
        # report is given, so fall back to the pipeline owning the report
        # instead of failing on `None.utils`.
        runner = pipeline if pipeline else report.pipeline
        process = runner.utils.filesystem.run(
            ["java", "-jar", Xslt.jing_jar, "-t", relaxng, source])
        if process.returncode == 0:
            self.success = True

        process_string = process.stdout.decode("utf-8").strip()

        # One table row per line of jing output. The output quotes element
        # names and file paths, so it must be escaped before it is embedded
        # in the (X)HTML report.
        process_html = " "
        for line in process_string.splitlines():
            row = ' <tr> <td class="info">' + escape(line) + '</td> </tr> '
            process_html += row + "\n"

        # HTML string to attach
        report_html = (
            '<!DOCTYPE html> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta charset="utf-8"> '
            '<title>Rapport</title><style> '
            'html{font-family:Arial, Helvetica, sans-serif; overflow-y:scroll; min-width:1000px;} '
            'table{text-align:left;min-width:50%;} '
            '</style></head> <body> <h1>Valideringsrapport</h1> <div> <table class="results"> '
            + process_html +
            ' </table> </div> </body> </html> ')

        if attach_report:
            relaxng_report_dir = os.path.join(report.reportDir(), "relaxng")
            os.makedirs(relaxng_report_dir, exist_ok=True)

            # Report file name: the schema file name without its last extension
            name = ".".join(os.path.basename(relaxng).split(".")[:-1])
            available_path = os.path.join(relaxng_report_dir, "{}.html".format(name))
            if os.path.exists(available_path):
                # Find the first unused "<name>-<i>.html";
                # assumes we won't have more than 100000 reports
                for i in range(2, 100000):
                    available_path = os.path.join(relaxng_report_dir, "{}-{}.html".format(name, i))
                    if not os.path.exists(available_path):
                        break

            if os.path.exists(available_path):
                report.warn("Klarte ikke å finne et tilgjengelig filnavn for rapporten")
            else:
                report.debug("Lagrer rapport som {}".format(available_path))
                report.attachment(report_html, available_path, "SUCCESS" if self.success else "ERROR")
def on_book(self):
    """Validate an incoming Nordic EPUB and store it in the master archive.

    Steps, in order:
      1. Check that the source is an EPUB with an identifier.
      2. Make a temporary copy of the EPUB in which all images except
         cover.jpg are replaced by a single dummy image, and check that the
         image files, the OPF and the HTML agree with each other.
      3. Run the "nordic-epub3-validate" DAISY Pipeline job on that copy.
      4. Validate MathML in every .xhtml document except the navigation
         document.
      5. Fix file permissions and try to generate an ACE report. Failures in
         the ACE step only produce warnings.
      6. Store the book and delete the source.

    Returns:
        True when the book was validated and stored. A falsy value otherwise
        (None for the early EPUB/identifier checks and a failed Pipeline job,
        False for the remaining failures).
    """
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # the title is only used to decorate the report title
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    if not epub.identifier():
        self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return

    self.utils.report.info("Lager en kopi av EPUBen med tomme bildefiler")
    temp_noimages_epubdir_obj = tempfile.TemporaryDirectory()
    temp_noimages_epubdir = temp_noimages_epubdir_obj.name
    Filesystem.copy(self.utils.report, epub.asDir(), temp_noimages_epubdir)
    if os.path.isdir(os.path.join(temp_noimages_epubdir, "EPUB", "images")):
        temp_xml_obj = tempfile.NamedTemporaryFile()
        temp_xml = temp_xml_obj.name

        opf_image_references = []   # image hrefs declared in the OPF manifest
        html_image_references = {}  # image path -> names of the HTML files referring to it
        for root, dirs, files in os.walk(os.path.join(temp_noimages_epubdir, "EPUB")):
            for file in files:
                if file.endswith(".opf"):
                    opf_file = os.path.join(root, file)
                    self.utils.report.info("Fjerner alle bildereferanser fra OPFen, og erstatter med en referanse til dummy.jpg...")
                    opf_xml_document = ElementTree.parse(opf_file)
                    opf_xml = opf_xml_document.getroot()
                    image_items = opf_xml.xpath("//*[local-name()='item' and starts-with(@media-type, 'image/')]")
                    replaced = False
                    for image_item in image_items:
                        if image_item.attrib["href"] not in opf_image_references:
                            opf_image_references.append(image_item.attrib["href"])

                        if image_item.get("href") == "images/cover.jpg":
                            pass  # don't change the reference to cover.jpg

                        elif not replaced:
                            # the first non-cover image item becomes the dummy image
                            image_item.attrib["href"] = "images/dummy.jpg"
                            replaced = True

                        else:
                            image_item.getparent().remove(image_item)

                    opf_xml_document.write(opf_file, method='XML', xml_declaration=True, encoding='UTF-8', pretty_print=False)

                if file.endswith(".xhtml"):
                    html_file = os.path.join(root, file)

                    html_xml_document = ElementTree.parse(html_file)
                    html_xml = html_xml_document.getroot()
                    image_references = html_xml.xpath("//@href | //@src | //@altimg")
                    for reference in image_references:
                        path = reference.split("#")[0]
                        if path.startswith("images/"):
                            if path not in html_image_references:
                                html_image_references[path] = []
                            html_image_references[path].append(file)

                    self.utils.report.info("Erstatter alle bildereferanser med images/dummy.jpg...")
                    self.utils.report.debug("dummy-jpg.xsl")
                    self.utils.report.debug(" source = " + html_file)
                    self.utils.report.debug(" target = " + temp_xml)
                    xslt = Xslt(self,
                                stylesheet=os.path.join(Xslt.xslt_dir, IncomingNordic.uid, "dummy-jpg.xsl"),
                                source=html_file,
                                target=temp_xml)
                    if not xslt.success:
                        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
                        return False
                    shutil.copy(temp_xml, html_file)

        # validate for the presence of image files here, since epubcheck won't
        # be able to do it anymore after we change the EPUB
        image_files_present = []
        for root, dirs, files in os.walk(os.path.join(temp_noimages_epubdir, "EPUB", "images")):
            for file in files:
                fullpath = os.path.join(root, file)
                relpath = os.path.relpath(fullpath, os.path.join(temp_noimages_epubdir, "EPUB"))
                image_files_present.append(relpath)

        image_error = False
        for file in image_files_present:
            if file not in opf_image_references:
                self.utils.report.error("Bildefilen er ikke deklarert i OPFen: " + file)
                image_error = True
        for file in opf_image_references:
            if file not in image_files_present:
                self.utils.report.error("Bildefilen er deklarert i OPFen, men finnes ikke: " + file)
                image_error = True
        # NOTE(review): the message says the file does not exist, but the
        # check is against the OPF declarations - confirm which is intended.
        for file in html_image_references:
            if file not in opf_image_references:
                self.utils.report.error("Bildefilen er deklarert i HTMLen, men finnes ikke: " + file
                                        + " (deklarert i: " + ", ".join(html_image_references[file]) + ")")
                image_error = True
        if image_error:
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            return False

        # delete every image except the cover, then add the dummy image
        for root, dirs, files in os.walk(os.path.join(temp_noimages_epubdir, "EPUB", "images")):
            for file in files:
                if file == "cover.jpg":
                    continue  # don't delete the cover file
                fullpath = os.path.join(root, file)
                os.remove(fullpath)
        shutil.copy(os.path.join(Xslt.xslt_dir, IncomingNordic.uid, "reference-files", "demobilde.jpg"),
                    os.path.join(temp_noimages_epubdir, "EPUB", "images", "dummy.jpg"))

    temp_noimages_epub = Epub(self.utils.report, temp_noimages_epubdir)

    self.utils.report.info("Validerer EPUB med epubcheck og nordiske retningslinjer...")
    epub_noimages_file = temp_noimages_epub.asFile()
    with DaisyPipelineJob(self,
                          "nordic-epub3-validate",
                          {"epub": os.path.basename(epub_noimages_file)},
                          priority="high",
                          pipeline_and_script_version=[
                              ("1.13.6", "1.4.6"),
                              ("1.13.4", "1.4.5"),
                              ("1.12.1", "1.4.2"),
                              ("1.11.1-SNAPSHOT", "1.3.0"),
                          ],
                          context={
                              os.path.basename(epub_noimages_file): epub_noimages_file
                          }) as dp2_job:

        # get validation report
        report_file = os.path.join(dp2_job.dir_output, "html-report/report.xhtml")
        if os.path.isfile(report_file):
            with open(report_file, 'r') as result_report:
                self.utils.report.attachment(result_report.readlines(),
                                             os.path.join(self.utils.report.reportDir(), "report.html"),
                                             "SUCCESS" if dp2_job.status == "SUCCESS" else "ERROR")

        if dp2_job.status != "SUCCESS":
            self.utils.report.error("Klarte ikke å validere boken")
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            return

    self.utils.report.debug("Making a copy of the EPUB to work on…")
    epub_fixed, epub_fixed_obj = epub.copy()
    epub_unzipped = epub_fixed.asDir()
    nav_path = os.path.normpath(os.path.join(epub_unzipped, epub_fixed.nav_path()))

    mathML_validation_result = True
    mathml_error_count = 0
    mathml_errors_not_shown = 0
    mathml_report_errors_max = 10
    for root, dirs, files in os.walk(epub_unzipped):
        for f in files:
            file = os.path.join(root, f)
            # Skip non-content files and the navigation document. Paths are
            # compared by value: an identity test (`is`) on two separately
            # built strings is practically never true, so the navigation
            # document would never be skipped.
            if not file.endswith(".xhtml") or os.path.normpath(file) == nav_path:
                continue
            self.utils.report.info("Checking MathML in " + file)
            mathml_validation = Mathml_validator(self, source=file, report_errors_max=mathml_report_errors_max)
            if not mathml_validation.success:
                mathml_error_count += mathml_validation.error_count
                mathml_errors_not_shown += max((mathml_validation.error_count - mathml_report_errors_max), 0)
                if mathml_error_count > mathml_report_errors_max:
                    # don't put any more errors for the other HTML documents in the main report
                    mathml_report_errors_max = 0
                mathML_validation_result = False
    if mathml_errors_not_shown > 0:
        self.utils.report.error("{} additional MathML errors not shown in the main report. Check the log for details."
                                .format(mathml_errors_not_shown))
    if mathML_validation_result is False:
        return False

    self.utils.report.debug("Making sure that the EPUB has the correct file and directory permissions…")
    epub_fixed.fix_permissions()

    # The ACE report is best effort: problems are reported as warnings and do
    # not stop the book from being stored.
    try:
        self.utils.report.info("Genererer ACE-rapport...")
        ace_dir = os.path.join(self.utils.report.reportDir(), "accessibility-report")
        process = self.utils.filesystem.run([IncomingNordic.ace_cli, "-o", ace_dir, epub_fixed.asFile()])
        if process.returncode == 0:
            self.utils.report.info("ACE-rapporten ble generert.")
        else:
            self.utils.report.warn("En feil oppstod ved produksjon av ACE-rapporten for " + epub.identifier())
            self.utils.report.debug(traceback.format_stack())

        # attach report
        ace_status = None
        with open(os.path.join(ace_dir, "report.json")) as json_report:
            ace_status = json.load(json_report)["earl:result"]["earl:outcome"]
        if ace_status == "pass":
            ace_status = "SUCCESS"
        else:
            ace_status = "WARN"
        self.utils.report.attachment(None, os.path.join(ace_dir, "report.html"), ace_status)

    except subprocess.TimeoutExpired:
        self.utils.report.warn("Det tok for lang tid å lage ACE-rapporten for " + epub.identifier()
                               + ", og prosessen ble derfor stoppet.")

    except Exception:
        self.utils.report.warn("En feil oppstod ved produksjon av ACE-rapporten for " + epub.identifier())
        self.utils.report.debug(traceback.format_exc(), preformatted=True)

    self.utils.report.info("Boken er valid. Kopierer til EPUB master-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(epub_fixed.asDir(), epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
    self.utils.filesystem.deleteSource()
    return True
def on_book(self):
    """Convert an EPUB to DOCX using Calibre, then post-process the DOCX.

    Steps, in order:
      1. Check that the source is an EPUB with an identifier and read its
         language.
      2. Copy the EPUB to a temporary directory and locate the first spine
         document through the OPF.
      3. Transform that document with nordic-asciimath-epub.xsl.
      4. Convert it to "<identifier>_calibre.docx" with ebook-convert.
      5. Post-process with python-docx (paragraph styles, indents, headings,
         dummy-text markers) and save as "<identifier>.docx".
      6. Rewrite word/numbering.xml inside the saved DOCX (list indents and
         "a." -> "a)" numbering) and re-zip it.
      7. Store the temporary DOCX directory as the result.

    Returns:
        True when the book was converted and stored, otherwise False.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # the title is only used to decorate the report title
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    if not epub.identifier():
        self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    # The language must be extracted from the EPUB, or else the DOCX default
    # language (nb) will be used in the converted file.
    language = ""
    try:
        language = epub.meta("dc:language")
    except Exception:
        pass

    # ---------- make a copy of the EPUB ----------

    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    # Epub is given the report here, like for the source EPUB above
    temp_epub = Epub(self.utils.report, temp_epubdir)

    opf_path = temp_epub.opf_path()
    if not opf_path:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    opf_path = os.path.join(temp_epubdir, opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()
    # href of the manifest item referenced by the first spine item
    html_file = opf_xml.xpath("/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href")
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    html_file = os.path.join(os.path.dirname(opf_path), html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    temp_xml_file_obj = tempfile.NamedTemporaryFile()
    temp_xml_file = temp_xml_file_obj.name

    self.utils.report.info("Konverterer fra ASCIIMath til norsk punktnotasjon…")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NLBpubToDocx.uid, "nordic-asciimath-epub.xsl"),
                source=html_file,
                target=temp_xml_file)
    if not xslt.success:
        return False
    shutil.copy(temp_xml_file, html_file)

    # ---------- convert the HTML file to DOCX ----------

    temp_docxdir_obj = tempfile.TemporaryDirectory()
    temp_docxdir = temp_docxdir_obj.name

    # Only pass --language when a language was found. An empty string would
    # otherwise be handed to ebook-convert as an extra positional argument.
    language_args = ["--language=" + language] if language else []

    try:
        self.utils.report.info("Konverterer fra XHTML til DOCX...")
        process = self.utils.filesystem.run([
            "/usr/bin/ebook-convert",
            html_file,
            os.path.join(temp_docxdir, epub.identifier() + "_calibre.docx"),
            "--chapter=/",
            "--chapter-mark=none",
            "--page-breaks-before=/",
            "--no-chapters-in-toc",
            "--toc-threshold=0",
            "--docx-page-size=a4",
            "--extra-css=" + os.path.join(Xslt.xslt_dir, self.uid, 'extra.css'),
            # NOTE: microsoft fonts must be installed:
            # sudo apt-get install ttf-mscorefonts-installer
            "--embed-font-family=Verdana",
            "--docx-page-margin-top=42",
            "--docx-page-margin-bottom=42",
            "--docx-page-margin-left=70",
            "--docx-page-margin-right=56",
            *language_args,
            "--base-font-size=13",
            "--font-size-mapping=13,13,13,13,13,13,13,13"
        ])

        if process.returncode == 0:
            self.utils.report.info("Boken ble konvertert.")

            # ------------- script from kvile ---------------
            document = Document(os.path.join(temp_docxdir, epub.identifier() + "_calibre.docx"))
            emptyParagraph = False
            normalParagraph = "Normal"
            normalParagraphNoIndent = "NormalNoIndent"
            headingIndent = Cm(1.25)
            fontSize = Pt(13)
            folder = Path(temp_docxdir)

            def zipdir(src, dst, zip_name):
                # Zip everything below `src` into `zip_name`. A relative
                # `zip_name` is resolved against `dst`; the process-wide
                # working directory is left untouched (it used to be changed
                # into a temporary directory that is later deleted).
                zip_path = os.path.join(dst, zip_name)
                with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as ziph:
                    for root, dirs, files in os.walk(src):
                        for file in files:
                            full_path = os.path.join(root, file)
                            ziph.write(full_path, arcname=os.path.relpath(full_path, src))

            def writeFile(txt, dst):
                # Write `txt` to `dst`, relative to the DOCX temp folder.
                # The only file written here is an OOXML part, hence UTF-8.
                with open(folder / dst, "w", encoding="utf-8") as tempFile:
                    tempFile.write(txt)

            def delete_paragraph(paragraph):
                # Detach the paragraph's XML element from the document
                p = paragraph._element
                p.getparent().remove(p)
                p._p = p._element = None

            def delete_element(element):
                # Detach an XML element from its parent
                element.getparent().remove(element)
                element._element = None

            indent = Cm(0.44)
            hangingIndentList = Cm(0.63)

            document.styles[normalParagraph].font.size = fontSize
            document.styles[normalParagraph].paragraph_format.first_line_indent = indent
            styleNoIndent = document.styles.add_style('NormalNoIndent', WD_STYLE_TYPE.PARAGRAPH)
            styleNoIndent.base_style = document.styles[normalParagraph]
            document.styles[normalParagraphNoIndent].paragraph_format.first_line_indent = Cm(0)

            # set style to normal for regular paragraphs, set keep_with_next
            # to false, remove multiple empty paragraphs, and remove empty p
            # after page nr or heading
            for paragraph in document.paragraphs:
                # deleting empty text-elements
                emptyTextElementList = document.element.xpath("//w:t[. = '']")
                for emptyTextElement in emptyTextElementList:
                    delete_element(emptyTextElement)
                paragraph.paragraph_format.keep_with_next = None
                if (re.match("Para 0[1-9]|[0-9] Block|Para [0-9]", paragraph.style.name)
                        and paragraph.style.font.underline != True):
                    paragraph.style = normalParagraph
                # if empty p or page nr or heading
                if (len(paragraph.text) <= 1
                        or re.match(r"^--- \d+ til ", paragraph.text)
                        or paragraph.style.name[0:7] == "Heading"):
                    # remove space at beginning of p
                    paragraph.text = re.sub(r"^\s(.*)", r"\1", paragraph.text)
                    # if last p also was empty or page nr
                    if len(paragraph.text) == 0 and emptyParagraph:
                        delete_paragraph(paragraph)
                    emptyParagraph = True
                else:
                    emptyParagraph = False
                if re.match(r"^\s*STATPED_DUMMYTEXT_LI_OL\s*$", paragraph.text):
                    paragraph.text = ""

            # no indent after Heading, page-nr, or paragraphs starting with
            # "Bilde: ", paragraphs in only bold (text=^_[^_]*_$) and the
            # paragraph after p in only bold, or on empty p.
            removeIndent = False
            for paragraph in document.paragraphs:
                # remove space at beginning of line after <br/>
                spaceAfterBreakList = paragraph._element.xpath(
                    r'w:r/w:br[@w:clear="none"]/following::w:t[@xml:space="preserve"][1]')
                if len(spaceAfterBreakList) > 0:
                    for spaceAfterBreakElement in spaceAfterBreakList:
                        if (re.match('^ ', spaceAfterBreakElement.text)
                                and not (spaceAfterBreakElement.xpath(r'preceding-sibling::*[1][self::w:t]'))):
                            spaceAfterBreakElement.text = re.sub(r"^ ", r"", spaceAfterBreakElement.text)
                # remove break before paragraph end
                breakBeforeParagraphEndList = paragraph._element.xpath(
                    r'w:r[last()]/w:br[@w:clear="none" and not(following-sibling::*)]')
                if len(breakBeforeParagraphEndList) > 0:
                    delete_element(breakBeforeParagraphEndList[0])
                t = paragraph.text.strip()
                if (re.match(r"^Bilde: |^Forklaring: |^--- \d+ til |^_[^_]*_$|^STATPED_DUMMYTEXT_LIST_UNSTYLED|^STATPED_DUMMYTEXT_P_BEFORE_DL", t)
                        or ((removeIndent or len(t) == 0) and paragraph.style.name == "Normal")):
                    paragraph.style = normalParagraphNoIndent
                # Remove dummy-text and set hanging indent
                if re.match(r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL)", paragraph.text):
                    paragraph.paragraph_format.left_indent = hangingIndentList
                    paragraph.paragraph_format.first_line_indent = -hangingIndentList
                if re.match(r"^STATPED_DUMMYTEXT", paragraph.text):
                    paragraph.text = re.sub(
                        r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL|STATPED_DUMMYTEXT_P_BEFORE_DL)",
                        "", paragraph.text)
                if (len(t) == 0
                        or paragraph.style.name[0:7] == "Heading"
                        or re.match(r"^--- \d+ til |^_[^_]*_$", t)):
                    removeIndent = True
                else:
                    removeIndent = False

            # remove bold from Headings.
            paraStylesWithoutBoldOrUnderline = []  # list of all para-styles without underline or bold
            paraStylesWithoutUnderline = []  # list of all para-styles without underline
            for style in document.styles:
                if style.name[0:7] == "Heading":
                    style.font.bold = None
                    style.paragraph_format.left_indent = headingIndent
                    style.paragraph_format.first_line_indent = -headingIndent
                    style.paragraph_format.space_before = Pt(0)
                    style.paragraph_format.space_after = Pt(0)
                    style_element = style._element
                    spacing = style_element.xpath(r'w:pPr/w:spacing')[0]
                    spacing.set(qn('w:beforeLines'), "0")
                    spacing.set(qn('w:afterLines'), "0")
                if style.name[0:5] == "Para ":
                    if style.font.underline != True:
                        paraStylesWithoutUnderline.append(style.name)
                        if style.font.bold != True:
                            paraStylesWithoutBoldOrUnderline.append(style.name)

            # find all para-styles with wanted properties in tables and change style
            paraStylesInTables = []
            for paraStyleWithoutUnderline in paraStylesWithoutUnderline:
                for element in document.element.xpath(
                        "//w:tbl//w:p//w:pStyle[@w:val = '" + paraStyleWithoutUnderline + "']"):
                    paraStylesInTables.append(element)
            for paraStyleInTables in paraStylesInTables:
                paraStyleInTables.attrib[
                    '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'] = normalParagraphNoIndent  # or normalParagraph

            # NOTE: the tables are edited through XPath above rather than the
            # python-docx table API, because the tables lack the required
            # <w:tblGrid>, which makes python-docx throw
            # docx.oxml.exceptions.InvalidXmlError.

            wordFile = os.path.join(temp_docxdir, epub.identifier() + ".docx")
            document.save(wordFile)
            self.utils.report.info("Temp-fil ble lagret: " + wordFile)

            # Unzip the saved DOCX, patch word/numbering.xml, and zip it again
            tempFolder = "temp"
            with zipfile.ZipFile(wordFile) as zipDocument:
                zipDocument.extractall(folder / tempFolder)
            zippedFile = tempFolder + "/word/numbering.xml"
            with open(folder / zippedFile, 'r', encoding="utf-8") as xmlFile:
                xmlText = xmlFile.read()
            xmlText = re.sub(r'w:left="1152"', r'w:left="360"', xmlText)
            xmlText = re.sub(r'w:left="1512"', r'w:left="720"', xmlText)
            xmlText = re.sub(r'w:left="1872"', r'w:left="1080"', xmlText)
            # a. as a) in lists
            xmlText = re.sub(r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%([1-9])\."/>',
                             r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%\1)"/>',
                             xmlText)
            writeFile(xmlText, zippedFile)
            zipdir(str(folder / tempFolder), str(folder), wordFile)
            # NOTE(review): the extracted "temp" folder and the intermediate
            # "_calibre.docx" are left in temp_docxdir and are therefore
            # stored together with the result below - confirm that this is
            # intended.
            # ---------- end script from kvile -------

        else:
            self.utils.report.error("En feil oppstod ved konvertering til DOCX for " + epub.identifier())
            self.utils.report.debug(traceback.format_stack())
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False

    except subprocess.TimeoutExpired:
        self.utils.report.error("Det tok for lang tid å konvertere " + epub.identifier()
                                + " til DOCX, og Calibre-prosessen ble derfor stoppet.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    except Exception:
        self.utils.report.error("En feil oppstod ved konvertering til DOCX for " + epub.identifier())
        self.utils.report.info(traceback.format_exc(), preformatted=True)
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    archived_path, stored = self.utils.filesystem.storeBook(temp_docxdir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Convert a single-document EPUB to a DTBook for TTS narration.

    Steps, in order:
      1. Check that the source is an EPUB with exactly one spine item.
      2. Copy the directory of that HTML document to a temporary directory,
         delete EPUB-specific files from the copy, and copy the HTML document
         into it as "<identifier>.xml".
      3. Validate MathML and convert it to text.
      4. Apply a series of XSLT stylesheets in place, including the
         XHTML-to-DTBook conversion. Page numbers are removed when the
         creative work is a magazine.
      5. Copy the library logo to "images/" when a logo exists for the
         library.
      6. Validate the DTBook with RelaxNG and Schematron.
      7. Store the result directory.

    Returns:
        True when the book was converted, validated and stored, otherwise
        False.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    self.utils.report.info("Locating HTML file")
    epub = Epub(self.utils.report, self.book["source"])
    if not epub.isepub():
        return False
    spine = epub.spine()
    if not len(spine) == 1:
        self.utils.report.warn("There must only be one item in the EPUB spine")
        return False
    html_file = os.path.join(self.book["source"], os.path.dirname(epub.opf_path()), spine[0]["href"])

    identifier = epub.identifier()

    self.utils.report.info("lag en kopi av boka")
    temp_resultdir_obj = tempfile.TemporaryDirectory()
    temp_resultdir = temp_resultdir_obj.name
    Filesystem.copy(self.utils.report, os.path.dirname(html_file), temp_resultdir)
    temp_result = os.path.join(temp_resultdir, identifier + ".xml")

    self.utils.report.info("sletter EPUB-spesifikke filer")
    for root, dirs, files in os.walk(temp_resultdir):
        for file in files:
            if Path(file).suffix.lower() in [".xhtml", ".html", ".smil", ".mp3", ".wav", ".opf"]:
                os.remove(os.path.join(root, file))
    shutil.copy(html_file, temp_result)

    temp_xslt_output_obj = tempfile.NamedTemporaryFile()
    temp_xslt_output = temp_xslt_output_obj.name

    def apply_stylesheet(message, stylesheet_uid, stylesheet_name):
        # Run one XSLT step on temp_result and, on success, replace
        # temp_result with the output. Returns True on success.
        self.utils.report.info(message)
        self.utils.report.debug(stylesheet_name)
        self.utils.report.debug(" source = " + temp_result)
        self.utils.report.debug(" target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, stylesheet_uid, stylesheet_name),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)
        return True

    # MATHML to stem
    self.utils.report.info("Erstatter evt. MathML i boka...")
    mathml_validation = Mathml_validator(self, source=temp_result)
    if not mathml_validation.success:
        return False

    mathML_result = Mathml_to_text(self, source=temp_result, target=temp_result)
    if not mathML_result.success:
        return False

    if not apply_stylesheet("Fikser Webarch-oppmerking", NlbpubToNarrationEpub.uid, "webarch-fixup.xsl"):
        return False

    if not apply_stylesheet("Setter inn lydbokavtalen...", NlbpubToTtsDtbook.uid, "bokinfo-tts-dtbook.xsl"):
        return False

    # Look up the creative work; try at most five times
    creative_work_metadata = None
    for _attempt in range(5):
        creative_work_metadata = Metadata.get_creative_work_from_api(identifier,
                                                                     editions_metadata="all",
                                                                     use_cache_if_possible=True,
                                                                     creative_work_metadata="all")
        if creative_work_metadata is not None:
            if creative_work_metadata["magazine"] is True:
                if not apply_stylesheet("Fjerner sidetall fordi det er et tidsskrift...",
                                        NlbpubToTtsDtbook.uid,
                                        "remove-pagenum.xsl"):
                    return False
            break

    if creative_work_metadata is None:
        # `warn` is the method used for warnings elsewhere in this file
        self.utils.report.warn("Klarte ikke finne et åndsverk tilknyttet denne utgaven. Konverterer likevel.")

    library = epub.meta("schema:library")
    library = library.upper() if library else library
    logo = os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "{}_logo.png".format(library))

    if os.path.isfile(logo):
        image_dir = os.path.join(temp_resultdir, "images")
        if not os.path.isdir(image_dir):
            os.mkdir(image_dir)
        shutil.copy(logo, image_dir)

    if not apply_stylesheet("Konverterer fra XHTML5 til DTBook...", NlbpubToTtsDtbook.uid, "html-to-dtbook.xsl"):
        return False

    if not apply_stylesheet("Gjør tilpasninger i DTBook", NlbpubToTtsDtbook.uid, "dtbook-cleanup.xsl"):
        return False

    # Remove this transformation if critical problems arise with the handling
    # of complex content
    if not apply_stylesheet("Legger inn ekstra informasjon om komplekst innhold",
                            NlbpubToTtsDtbook.uid,
                            "optimaliser-komplekst-innhold.xsl"):
        return False

    self.utils.report.info("Validerer DTBook...")

    # NOTE: This RelaxNG schema assumes that we're using DTBook 2005-3 and MathML 3.0
    dtbook_relax = Relaxng(self,
                           relaxng=os.path.join(Xslt.xslt_dir,
                                                NlbpubToTtsDtbook.uid,
                                                "dtbook-schema/rng/dtbook-2005-3.mathml-3.integration.rng"),
                           source=temp_result)
    dtbook_sch = Schematron(self,
                            schematron=os.path.join(Xslt.xslt_dir,
                                                    NlbpubToTtsDtbook.uid,
                                                    "dtbook-schema/sch/dtbook.mathml.sch"),
                            source=temp_result)
    if not dtbook_relax.success:
        self.utils.report.error("Validering av DTBook feilet (RelaxNG)")
    if not dtbook_sch.success:
        self.utils.report.error("Validering av DTBook feilet (Schematron)")
    if not dtbook_relax.success or not dtbook_sch.success:
        # keep the invalid DTBook in the report directory for debugging
        tempfile_stored = os.path.join(self.utils.report.reportDir(), os.path.basename(temp_result))
        shutil.copy(temp_result, tempfile_stored)
        self.utils.report.info(f"Validering av DTBook feilet, lagrer temp fil for feilsøking: {tempfile_stored}")
        self.utils.report.attachment(None, tempfile_stored, "DEBUG")
        return False

    self.utils.report.info("Boken ble konvertert. Kopierer til DTBook-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(temp_resultdir, identifier)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    return True
def on_book(self):
    """Convert an NLBPUB into an EPUB with multiple content documents.

    The source EPUB is copied to a temporary directory. Its first spine
    document is flattened and split into several HTML files, the OPF and the
    navigation document are regenerated, and the result is checked with
    Epubcheck when it is available. The result is then stored and announced
    through Bibliofil.

    Returns:
        True when the book was converted and stored. On failure the report
        title is set to a failure message and None is returned.
    """
    report = self.utils.report
    report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # the title is only used to decorate the report title
        pass

    def fail(subject, suffix):
        # Put a failure message for `subject` in the report title
        report.title = self.title + ": " + subject + " feilet 😭👎" + suffix

    def transform(stylesheet_name, **xslt_args):
        # Run one of this pipeline's stylesheets. On failure the report
        # title is set and False is returned.
        stylesheet = os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid, stylesheet_name)
        result = Xslt(self, stylesheet=stylesheet, **xslt_args)
        if result.success:
            return True
        fail(epub.identifier(), epubTitle)
        return False

    # check that this is an EPUB
    if not epub.isepub():
        fail(self.book["name"], "")
        return

    if not epub.identifier():
        report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        fail(self.book["name"], "")
        return

    # ---------- make a copy of the EPUB ----------

    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(report, self.book["source"], temp_epubdir)
    temp_epub = Epub(report, temp_epubdir)

    # ---------- adapt the HTML file using XSLT ----------

    opf_path = temp_epub.opf_path()
    if not opf_path:
        report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        fail(self.book["name"], epubTitle)
        return
    opf_path = os.path.join(temp_epubdir, opf_path)

    # href of the manifest item referenced by the first spine item
    first_spine_hrefs = ElementTree.parse(opf_path).getroot().xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href")
    html_file = first_spine_hrefs[0] if first_spine_hrefs else None
    if not html_file:
        report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        fail(self.book["name"], epubTitle)
        return

    html_dir = os.path.dirname(opf_path)
    html_file = os.path.join(html_dir, html_file)
    if not os.path.isfile(html_file):
        report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        fail(self.book["name"], epubTitle)
        return

    temp_xml_obj = tempfile.NamedTemporaryFile()
    temp_xml = temp_xml_obj.name

    report.info("Flater ut NLBPUB")
    if not transform("nlbpub-flatten.xsl", source=html_file, target=temp_xml):
        return
    shutil.copy(temp_xml, html_file)

    report.info("Deler opp NLBPUB i flere HTML-filer")
    if not transform("nlbpub-split.xsl",
                     source=html_file,
                     target=temp_xml,
                     parameters={"output-dir": os.path.dirname(html_file)}):
        return
    os.remove(html_file)

    # every content document except the navigation document and the
    # original (now removed) single document, in sorted order
    excluded = ["nav.xhtml", os.path.basename(html_file)]
    spine_hrefs = [
        href
        for href in sorted(os.listdir(os.path.dirname(html_file)))
        if href.endswith(".xhtml") and href not in excluded
    ]

    report.info("Oppdaterer OPF-fil")
    if not transform("update-opf.xsl",
                     source=opf_path,
                     target=temp_xml,
                     parameters={"spine-hrefs": ",".join(spine_hrefs)}):
        return
    shutil.copy(temp_xml, opf_path)

    nav_path = os.path.join(temp_epubdir, temp_epub.nav_path())
    report.info("Lager nytt navigasjonsdokument")
    if not transform("generate-nav.xsl", source=opf_path, target=nav_path):
        return

    report.info("Legger til properties i OPF etter behov")
    temp_epub.update_opf_properties()

    if not Epubcheck.isavailable():
        report.warn("Epubcheck not available, EPUB will not be validated!")
    else:
        epubcheck = Epubcheck(self, opf_path)
        if not epubcheck.success:
            fail(epub.identifier(), epubTitle)
            return

    report.info("Boken ble konvertert. Kopierer til e-bok-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(temp_epubdir, temp_epub.identifier())
    report.attachment(None, archived_path, "DEBUG")
    Bibliofil.book_available(NlbpubToEpub.publication_format, temp_epub.identifier())
    report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Prepare the book in ``self.book["source"]`` for e-book use and store it.

    Steps, in order: copy the EPUB to a temporary directory, locate the
    first spine document through the OPF, validate and convert MathML,
    run the ``create-hidden-headlines.xsl`` and ``prepare-for-ebook.xsl``
    stylesheets, copy a library logo (when the file exists) and
    ``ebok.css`` next to the HTML, register them in the OPF manifest, try
    to find or download a cover image, validate with Epubcheck when
    available, and finally store the directory with
    ``self.utils.filesystem.storeBook``.

    Returns:
        True when the book was converted and stored, False on every
        failure path handled here.

    Raises:
        Exception: when the edition API answers 200 without a ``data``
            member. Other exceptions from the helpers are not caught.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # Best effort: the title only decorates the report title below.
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    # ---------- make a copy of the EPUB ----------

    # Keep the *_obj references alive for the whole method; the temporary
    # directory/file is removed when the object is cleaned up.
    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    temp_epub = Epub(self.utils.report, temp_epubdir)

    # ---------- adapt the HTML file with XSLT ----------

    opf_path = temp_epub.opf_path()
    if not opf_path:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    opf_path = os.path.join(temp_epubdir, opf_path)
    opf_xml = ElementTree.parse(opf_path).getroot()
    # href of the manifest item referenced by the first spine entry
    html_file = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
    )
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    html_dir = os.path.dirname(opf_path)
    html_file = os.path.join(html_dir, html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    # Scratch file that receives every XSLT result before it is copied
    # back over its source.
    temp_xml_obj = tempfile.NamedTemporaryFile()
    temp_xml = temp_xml_obj.name

    # MATHML to stem
    self.utils.report.info("Erstatter evt. MathML i boka...")
    mathml_validation = Mathml_validator(self, source=html_file)
    if not mathml_validation.success:
        self.utils.report.error("NLBPUB contains MathML errors, aborting...")
        return False

    mathML_result = Mathml_to_text(self, source=html_file, target=html_file)
    if not mathML_result.success:
        return False

    self.utils.report.info("Lager skjulte overskrifter der det er nødvendig")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid, "create-hidden-headlines.xsl"),
                source=html_file,
                target=temp_xml,
                parameters={
                    "cover-headlines": "from-type",
                    "frontmatter-headlines": "from-type",
                    "bodymatter-headlines": "from-text",
                    "backmatter-headlines": "from-type"
                })
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, html_file)

    self.utils.report.info("Tilpasser innhold for e-bok...")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid, "prepare-for-ebook.xsl"),
                source=html_file,
                target=temp_xml)
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, html_file)

    # Use library-specific logo and stylesheet if available
    library = temp_epub.meta("schema:library")
    library = library.upper() if library else library
    logo = os.path.join(Xslt.xslt_dir, PrepareForEbook.uid, "{}_logo.png".format(library))

    if os.path.isfile(logo):
        shutil.copy(logo, os.path.join(html_dir, os.path.basename(logo)))

    PrepareForEbook.update_css()

    stylesheet = PrepareForEbook.css_tempfile_obj.name
    if library is not None and library.lower() == "statped":
        stylesheet = PrepareForEbook.css_tempfile_statped_obj.name
    shutil.copy(stylesheet, os.path.join(html_dir, "ebok.css"))

    # NOTE(review): the logo href is passed to the stylesheet even when no
    # logo file was copied above - confirm add-to-opf-manifest.xsl copes.
    self.utils.report.info("Legger til logoen i OPF-manifestet")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid, "add-to-opf-manifest.xsl"),
                source=opf_path,
                target=temp_xml,
                parameters={
                    "href": os.path.basename(logo),
                    "media-type": "image/png"
                })
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, opf_path)

    self.utils.report.info("Legger til CSS-fila i OPF-manifestet")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid, "add-to-opf-manifest.xsl"),
                source=opf_path,
                target=temp_xml,
                parameters={
                    "href": "ebok.css",
                    "media-type": "text/css"
                })
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_xml, opf_path)

    # add cover if missing
    opf_xml = ElementTree.parse(opf_path).getroot()
    cover_id = opf_xml.xpath(
        "/*/*[local-name()='manifest']/*[contains(concat(' ', @properties, ' '), ' cover-image ')]/@id"
    )  # from properties
    if not cover_id:
        # <meta name="cover" content="..."/> is a child of <metadata>, not
        # of <manifest>, so the lookup has to target the metadata element.
        cover_id = opf_xml.xpath(
            "/*/*[local-name()='metadata']/*[@name='cover']/@content"
        )  # from metadata
    if not cover_id:
        cover_id = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[starts-with(@media-type, 'image/') and contains(@href, 'cover')]/@id"
        )  # from filename
    cover_id = cover_id[0] if cover_id else None

    if not cover_id:
        # cover not found in the book, let's try NLBs API

        # NOTE: identifier at this point is the e-book identifier
        edition_url = "{}/editions/{}?creative-work-metadata=none&edition-metadata=all".format(
            Config.get("nlb_api_url"), epub.identifier())

        response = requests.get(edition_url)
        self.utils.report.debug("looking for cover image in: {}".format(edition_url))
        if response.status_code == 200:
            response_json = response.json()
            if "data" not in response_json:
                self.utils.report.debug("response as JSON:")
                self.utils.report.debug(str(response_json))
                raise Exception("No 'data' in response: {}".format(edition_url))
            data = response_json["data"]
            cover_url = data["coverUrlLarge"]
            if cover_url is not None and cover_url.startswith("http"):
                response = requests.get(cover_url)
                if response.status_code == 200:
                    _, extension = os.path.splitext(cover_url)
                    target_href = "cover" + extension
                    target_dir = os.path.dirname(opf_path)
                    with open(os.path.join(target_dir, target_href), "wb") as target_file:
                        target_file.write(response.content)

                    self.utils.report.info("Legger til bildet av bokomslaget i OPF-manifestet")
                    # check for png, just in case. Should always be jpg though.
                    if extension.lower() in [".png"]:
                        media_type = "image/png"
                    else:
                        media_type = "image/jpeg"
                    xslt = Xslt(self,
                                stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid, "add-to-opf-manifest.xsl"),
                                source=opf_path,
                                target=temp_xml,
                                parameters={
                                    "href": target_href,
                                    "media-type": media_type
                                })
                    if not xslt.success:
                        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
                        return False
                    shutil.copy(temp_xml, opf_path)

                    # Re-read the OPF to pick up the id of the new item.
                    opf_xml = ElementTree.parse(opf_path).getroot()
                    cover_id = opf_xml.xpath(
                        "/*/*[local-name()='manifest']/*[@href = '{}']/@id".format(target_href)
                    )  # from filename
                    cover_id = cover_id[0] if cover_id else None

    if cover_id is None or len(cover_id) == 0:
        self.utils.report.warn(
            "Klarte ikke å finne bilde av bokomslaget for {}".format(epub.identifier()))

    self.utils.report.info("Legger til properties i OPF etter behov")
    temp_epub.update_opf_properties()

    # validate with epubcheck
    if Epubcheck.isavailable():
        epubcheck = Epubcheck(self, opf_path)
        if not epubcheck.success:
            # Keep the failing OPF and HTML in the report directory so
            # they can be inspected afterwards.
            tempfile_stored_opf = os.path.join(self.utils.report.reportDir(), os.path.basename(opf_path))
            shutil.copy(opf_path, tempfile_stored_opf)
            tempfile_stored = os.path.join(self.utils.report.reportDir(), os.path.basename(html_file))
            shutil.copy(html_file, tempfile_stored)
            # NOTE(review): the message says "DTBook" although this branch
            # reports an Epubcheck failure - left unchanged, confirm wording.
            self.utils.report.info(
                f"Validering av DTBook feilet, lagrer temp fil for feilsøking: {tempfile_stored}"
            )
            self.utils.report.attachment(None, tempfile_stored, "DEBUG")
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            # Return False like every other failure path (was a bare return).
            return False
    else:
        self.utils.report.warn("Epubcheck er ikke tilgjengelig, EPUB blir ikke validert!")

    # ---------- store the fileset ----------

    self.utils.report.info("Boken ble konvertert. Kopierer til HTML-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(temp_epubdir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Convert the Nordic EPUB in ``self.book["source"]`` and store the result.

    The method checks that the file name matches ``dc:identifier``, makes
    one copy of the EPUB with its images and one where every file under
    ``EPUB/images`` is replaced by an empty file, cleans up ``nav.xhtml``
    and the OPF with XSLT, runs the ``nordic-epub3-to-html`` Daisy
    Pipeline job, cleans up the resulting HTML with three stylesheets,
    builds an EPUB from the HTML with ``Epub.from_html``, puts the real
    images back and stores the result with ``storeBook``.

    Returns:
        True when the book was converted and stored, False on every
        failure path handled here.
    """
    log = self.utils.report

    log.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(log, self.book["source"])

    # The title only decorates the final report title; fall back to "".
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        epubTitle = ""

    # check that this is an EPUB
    if not epub.isepub():
        return False

    if not epub.identifier():
        log.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        return False

    # The part of the file name before the first "." must equal the identifier.
    if epub.identifier() != self.book["name"].partition(".")[0]:
        log.error(
            self.book["name"]
            + ": Filnavn stemmer ikke overens med dc:identifier: {}".format(epub.identifier()))
        return False

    # Scratch file that receives every XSLT result before it is copied
    # back; the *_obj references are kept so the temp paths stay valid.
    scratch_obj = tempfile.NamedTemporaryFile()
    scratch_xml = scratch_obj.name

    log.info("Lager en kopi av EPUBen")
    with_images_obj = tempfile.TemporaryDirectory()
    with_images_dir = with_images_obj.name
    Filesystem.copy(log, self.book["source"], with_images_dir)

    log.info("Lager en kopi av EPUBen med tomme bildefiler")
    work_dir_obj = tempfile.TemporaryDirectory()
    work_dir = work_dir_obj.name
    Filesystem.copy(log, with_images_dir, work_dir)
    # Replace every image in the working copy with an empty file.
    for dirpath, _subdirs, filenames in os.walk(os.path.join(work_dir, "EPUB", "images")):
        for filename in filenames:
            placeholder = os.path.join(dirpath, filename)
            os.remove(placeholder)
            Path(placeholder).touch()
    work_epub = Epub(log, work_dir)

    log.info("Rydder opp i nordisk EPUB nav.xhtml")
    nav_path = os.path.join(work_dir, work_epub.nav_path())
    spine_hrefs = " ".join(item["href"] for item in work_epub.spine())
    opf_base = os.path.dirname(os.path.join(work_dir, work_epub.opf_path())) + "/"
    step = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, "nordic-cleanup-nav.xsl"),
                source=nav_path,
                target=scratch_xml,
                parameters={
                    "cover": spine_hrefs,
                    "base": opf_base
                })
    if not step.success:
        return False
    shutil.copy(scratch_xml, nav_path)

    log.info("Rydder opp i nordisk EPUB package.opf")
    opf_path = os.path.join(work_dir, work_epub.opf_path())
    step = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, "nordic-cleanup-opf.xsl"),
                source=opf_path,
                target=scratch_xml)
    if not step.success:
        return False
    shutil.copy(scratch_xml, opf_path)

    html_dir_obj = tempfile.TemporaryDirectory()
    html_dir = html_dir_obj.name
    html_file = os.path.join(html_dir, epub.identifier() + ".xhtml")

    log.info("Finner ut hvilket bibliotek boka tilhører…")
    edition_metadata = Metadata.get_edition_from_api(epub.identifier(), report=log)
    # Fall back to deriving the library from the identifier when the API
    # gave no edition or no library.
    if edition_metadata is None or edition_metadata["library"] is None:
        library = Metadata.get_library_from_identifier(epub.identifier(), log)
    else:
        library = edition_metadata["library"]
    log.info(f"Boka tilhører '{library}'")

    log.info("Zipper oppdatert versjon av EPUBen...")
    work_epub.asFile(rebuild=True)

    log.info("Konverterer fra Nordisk EPUB 3 til Nordisk HTML 5...")
    epub_file = work_epub.asFile()
    job_arguments = {
        "epub": os.path.basename(epub_file),
        "fail-on-error": "false"
    }
    supported_versions = [
        ("1.13.6", "1.4.6"),
        ("1.13.4", "1.4.5"),
        ("1.12.1", "1.4.2"),
        ("1.11.1-SNAPSHOT", "1.3.0"),
    ]
    with DaisyPipelineJob(self,
                          "nordic-epub3-to-html",
                          job_arguments,
                          pipeline_and_script_version=supported_versions,
                          context={os.path.basename(epub_file): epub_file}) as dp2_job_convert:
        if dp2_job_convert.status != "SUCCESS":
            log.error("Klarte ikke å konvertere boken")
            return False

        output_root = os.path.join(dp2_job_convert.dir_output, "output-dir")
        dp2_html_dir = os.path.join(output_root, epub.identifier())
        dp2_html_file = os.path.join(output_root, epub.identifier(), epub.identifier() + ".xhtml")

        if not os.path.isdir(dp2_html_dir):
            log.error("Finner ikke den konverterte boken: {}".format(dp2_html_dir))
            return False

        if not os.path.isfile(dp2_html_file):
            log.error("Finner ikke den konverterte boken: {}".format(dp2_html_file))
            log.info("Kanskje filnavnet er forskjellig fra IDen?")
            return False

        Filesystem.copy(log, dp2_html_dir, html_dir)

    # The three HTML cleanup steps share the same shape: transform into
    # the scratch file, then copy the result back over the HTML file.
    html_cleanup_steps = (
        ("Rydder opp i nordisk HTML", "nordic-cleanup.xsl"),
        ("Rydder opp i ns0 i page-normal", "ns0-cleanup.xsl"),
        ("Rydder opp i innholdsfortegnelsen", "fix-toc-span.xsl"),
    )
    for message, stylesheet_name in html_cleanup_steps:
        log.info(message)
        step = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, stylesheet_name),
                    source=html_file,
                    target=scratch_xml)
        if not step.success:
            return False
        shutil.copy(scratch_xml, html_file)

    log.info("Legger til EPUB-filer (OPF, NAV, container.xml, mediatype)...")
    nlbpub_tempdir_obj = tempfile.TemporaryDirectory()
    nlbpub_tempdir = nlbpub_tempdir_obj.name

    nlbpub = Epub.from_html(self, html_dir, nlbpub_tempdir)
    if nlbpub is None:
        return False

    log.info("Erstatter tomme bildefiler med faktiske bildefiler")
    # Swap each file under EPUB/images for the file at the same relative
    # path in the copy that kept its images.
    for dirpath, _subdirs, filenames in os.walk(os.path.join(nlbpub_tempdir, "EPUB", "images")):
        for filename in filenames:
            placeholder = os.path.join(dirpath, filename)
            relative = os.path.relpath(placeholder, nlbpub_tempdir)
            os.remove(placeholder)
            Filesystem.copy(log, os.path.join(with_images_dir, relative), placeholder)

    work_epub = Epub(log, work_dir)

    nlbpub.update_prefixes()

    log.info("Boken ble konvertert. Kopierer til NLBPUB-arkiv.")
    archived_path, _stored = self.utils.filesystem.storeBook(nlbpub.asDir(),
                                                             work_epub.identifier(),
                                                             overwrite=self.overwrite)
    log.attachment(None, archived_path, "DEBUG")
    log.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True
def on_book(self):
    """Turn the book in ``self.book["source"]`` into a narration EPUB and store it.

    Steps, in order: copy the EPUB to a temporary directory, locate the
    first spine document through the OPF, run a fixed sequence of XSLT
    stylesheets over that HTML file, regenerate the OPF from the HTML,
    rename the HTML file when the new OPF points at a different name,
    regenerate ``nav.xhtml``, copy the logo when the library is
    ``STATPED`` and the logo file exists, and finally store the EPUB file
    with ``self.utils.filesystem.storeBook``.

    Returns:
        True when the book was converted and stored, False on every
        failure path handled here.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    epub = Epub(self.utils.report, self.book["source"])

    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        # Best effort: the title only decorates the report title below.
        pass

    # check that this is an EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    if not epub.identifier():
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    # ---------- make a copy of the EPUB ----------

    # Keep the *_obj references alive for the whole method; the temporary
    # directory/file is removed when the object is cleaned up.
    narration_epubdir_obj = tempfile.TemporaryDirectory()
    narration_epubdir = narration_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], narration_epubdir)
    nlbpub = Epub(self.utils.report, narration_epubdir)

    # ---------- adapt the HTML file with XSLT ----------

    opf_path = nlbpub.opf_path()
    if not opf_path:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    opf_path = os.path.join(narration_epubdir, opf_path)
    xml = ElementTree.parse(opf_path).getroot()
    # href of the manifest item referenced by the first spine entry
    html_file = xml.xpath(
        "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
    )
    html_file = html_file[0] if html_file else None
    if not html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    html_file = os.path.join(os.path.dirname(opf_path), html_file)
    if not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    # Scratch file that receives every XSLT result before it is copied
    # back over the HTML file.
    temp_html_obj = tempfile.NamedTemporaryFile()
    temp_html = temp_html_obj.name

    # (report message, stylesheet base directory, pipeline uid, stylesheet
    # file name) - applied in this order.
    xslt_steps = [
        ("Fjerner elementer som ikke skal være med i lydboka...",
         NlbpubToNarrationEpub.xslt_dir, NlbpubToNarrationEpub.uid, "ta-vekk-innhold.xsl"),
        ("Fikser Webarch-oppmerking",
         Xslt.xslt_dir, NlbpubToNarrationEpub.uid, "webarch-fixup.xsl"),
        ("Fikser dikt-oppmerking",
         Xslt.xslt_dir, NlbpubToNarrationEpub.uid, "unwrap-poem-chapters.xsl"),
        ("Lager usynlige overskrifter der det trengs...",
         Xslt.xslt_dir, PrepareForEbook.uid, "create-hidden-headlines.xsl"),
        ("Tilpasser innhold for innlesing...",
         NlbpubToNarrationEpub.xslt_dir, NlbpubToNarrationEpub.uid, "prepare-for-narration.xsl"),
        ("Lager synkroniseringspunkter...",
         NlbpubToNarrationEpub.xslt_dir, NlbpubToNarrationEpub.uid, "lag-synkroniseringspunkter.xsl"),
        ("Gjør HTMLen litt penere...",
         Xslt.xslt_dir, Epub.uid, "pretty-print.xsl"),
    ]
    for message, xslt_dir, uid, stylesheet_name in xslt_steps:
        self.utils.report.info(message)
        self.utils.report.debug(stylesheet_name)
        self.utils.report.debug(" source = " + html_file)
        self.utils.report.debug(" target = " + temp_html)
        xslt = Xslt(self,
                    stylesheet=os.path.join(xslt_dir, uid, stylesheet_name),
                    source=html_file,
                    target=temp_html)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_html, html_file)

    # ---------- replace metadata in the OPF with metadata from the HTML ----------

    temp_opf_obj = tempfile.NamedTemporaryFile()
    temp_opf = temp_opf_obj.name

    xslt = Epub.html_to_opf(self, opf_path, temp_opf)
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False
    shutil.copy(temp_opf, opf_path)

    # ---------- get the new file name from the OPF (it changes based on the book number) ----------

    try:
        xml = ElementTree.parse(opf_path).getroot()
        new_html_file = xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        new_html_file = os.path.join(os.path.dirname(opf_path), new_html_file[0]) if new_html_file else None
    except Exception:
        self.utils.report.info(traceback.format_exc(), preformatted=True)
        new_html_file = None

    # An OPF without a resolvable spine document is a failure as well;
    # previously None reached shutil.copy() below and raised a TypeError.
    if not new_html_file:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False

    if html_file != new_html_file:
        shutil.copy(html_file, new_html_file)
        os.remove(html_file)
        html_file = new_html_file

    # ---------- regenerate nav.xhtml ----------

    nav_path = nlbpub.nav_path()
    if not nav_path:
        self.utils.report.error(
            self.book["name"] + ": Klarte ikke å finne navigasjonsfila i OPFen.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        return False
    nav_path = os.path.join(narration_epubdir, nav_path)

    xslt = Epub.html_to_nav(self, html_file, nav_path)
    if not xslt.success:
        self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
        return False

    # ---------- add logo ----------

    library = nlbpub.meta("schema:library")
    library = library.upper() if library else library
    logo = os.path.join(Xslt.xslt_dir, PrepareForEbook.uid, "{}_logo.png".format(library))

    # Only the STATPED logo is copied next to the HTML file.
    if os.path.isfile(logo) and library == "STATPED":
        shutil.copy(logo, os.path.join(os.path.dirname(html_file), os.path.basename(logo)))

    # ---------- save EPUB ----------

    self.utils.report.info("Boken ble konvertert. Kopierer til innlesingsklart EPUB-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(nlbpub.asFile(),
                                                            nlbpub.identifier(),
                                                            file_extension="epub",
                                                            move=True)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True