Ejemplo n.º 1
0
    def on_book(self):
        """Turn the EPUB in ``self.book["source"]`` into a braille-ready HTML file set.

        Steps, in order:

        1. Check that the source is an EPUB with a ``dc:identifier``.
        2. Copy the EPUB to a temporary directory.
        3. Run ``prepare-for-braille.xsl`` on the first spine document.
        4. Rename that document to ``<identifier>.html``, where the identifier
           is read from ``meta[@name='dc:identifier']`` in the transformed HTML.
        5. Delete the navigation document, audio files, SMIL files and the OPF.
        6. Store the remaining directory with ``self.utils.filesystem.storeBook``.

        Returns:
            bool: ``True`` when the file set was stored. ``False`` on every
            failure path; the report title is set to a failure message first.
        """

        def failed(label, suffix=""):
            # Every failure path ends the same way: mark the report as failed
            # and tell the caller so with an explicit False.
            self.utils.report.title = (self.title + ": " + label +
                                       " feilet 😭👎" + suffix)
            return False

        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # The title only decorates the report headline, so failing to read it
        # is deliberately not treated as an error.
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # Check that this is an EPUB. Returns False (not None) like every
        # other failure path of this method.
        if not epub.isepub():
            return failed(self.book["name"])

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            return failed(self.book["name"])

        # ---------- make a copy of the EPUB ----------

        # Keep a reference to the TemporaryDirectory object for the whole
        # method; the directory is removed once the object is discarded.
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self.utils.report, temp_epubdir)

        # ---------- adapt the HTML file with XSLT ----------

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            return failed(self.book["name"], epubTitle)
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_dir = os.path.dirname(opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # href of the manifest item referenced by the first spine entry
        html_hrefs = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        html_file = html_hrefs[0] if html_hrefs else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            return failed(self.book["name"], epubTitle)
        html_file = os.path.join(opf_dir, html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            return failed(self.book["name"], epubTitle)

        # Keep a reference to the NamedTemporaryFile object as well; only its
        # path is handed to the XSLT step.
        temp_html_obj = tempfile.NamedTemporaryFile()
        temp_html = temp_html_obj.name

        self.utils.report.info("Tilpasser innhold for punktskrift...")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            PrepareForBraille.uid,
                                            "prepare-for-braille.xsl"),
                    source=html_file,
                    target=temp_html)
        if not xslt.success:
            return failed(epub.identifier(), epubTitle)
        shutil.copy(temp_html, html_file)

        # ---------- read the new book number from
        # /html/head/meta[@name='dc:identifier'] and use it as file name ----------

        html_xml = ElementTree.parse(temp_html).getroot()
        identifier_metas = html_xml.xpath(
            "/*/*[local-name()='head']/*[@name='dc:identifier']")
        result_identifier = None
        if identifier_metas and "content" in identifier_metas[0].attrib:
            result_identifier = identifier_metas[0].attrib["content"]
        if not result_identifier:
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å finne boknummer i ny HTML-fil.")
            return failed(self.book["name"], epubTitle)

        # temp_html still holds the transformed document (it was copied onto
        # html_file above and neither file has been modified since), so the
        # old file can be dropped and the new name written straight from it.
        os.remove(html_file)
        html_file = os.path.join(
            os.path.dirname(html_file), result_identifier +
            ".html")  # use .html instead of .xhtml since this is no longer an EPUB
        shutil.copy(temp_html, html_file)
        # TODO: insert the HTML5 doctype: <!DOCTYPE html>

        # ---------- delete EPUB-specific files ----------

        for item in opf_xml.xpath("/*/*[local-name()='manifest']/*"):
            properties = re.split(r'\s+', item.attrib.get("properties", ""))
            media_type = item.attrib.get("media-type", "")
            delete = ("nav" in properties
                      or media_type.startswith("audio/")
                      or media_type == "application/smil+xml")

            if not delete or "href" not in item.attrib:
                continue

            fullpath = os.path.join(opf_dir, item.attrib["href"])
            # A file that is already gone (e.g. the content document that was
            # renamed above, or a dangling manifest entry) needs no deletion
            # and must not abort the conversion.
            if os.path.isfile(fullpath):
                os.remove(fullpath)
        os.remove(opf_path)

        # ---------- store the HTML file set ----------

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til arkiv for punkt-klare HTML-filer."
        )

        archived_path, stored = self.utils.filesystem.storeBook(
            opf_dir, self.book["name"])
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + self.book[
            "name"] + " ble konvertert 👍😄" + epubTitle
        return True
Ejemplo n.º 2
0
    def on_book(self):
        """Convert the EPUB in ``self.book["source"]`` to DOCX and archive the result.

        Steps, in order:

        1. Check that the source is an EPUB with a ``dc:identifier``.
        2. Copy the EPUB to a temporary directory and locate the first spine
           document through the OPF.
        3. Run ``nordic-asciimath-epub.xsl`` on that document.
        4. Convert the document to ``<identifier>_calibre.docx`` with
           ``/usr/bin/ebook-convert``.
        5. Post-process the DOCX with python-docx (paragraph styles, indents,
           empty paragraphs, headings, table paragraph styles) and save it as
           ``<identifier>.docx``.
        6. Unzip that file, patch list indents and letter-list markers in
           ``word/numbering.xml`` and zip it again.
        7. Store the temporary DOCX directory with
           ``self.utils.filesystem.storeBook``.

        Returns:
            bool: ``True`` when the result was stored. ``False`` on every
            failure path; the report title is set to a failure message first.
        """

        def failed(label, suffix=""):
            # Common tail of every failure path.
            self.utils.report.title = (self.title + ": " + label +
                                       " feilet 😭👎" + suffix)
            return False

        def remove_element(element):
            # Detach an lxml element from its parent.
            element.getparent().remove(element)

        def zipdir(src, zip_name):
            # Write every file below ``src`` into ``zip_name`` with paths
            # relative to ``src``. ``zip_name`` is an absolute path, so the
            # process working directory is left untouched (it must never end
            # up inside a temporary directory that is deleted afterwards).
            with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as ziph:
                for root, dirs, files in os.walk(src):
                    for file in files:
                        fullpath = os.path.join(root, file)
                        ziph.write(fullpath,
                                   arcname=os.path.relpath(fullpath, src))

        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # The title only decorates the report headline, so failing to read it
        # is deliberately not treated as an error.
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # check that this is an EPUB
        if not epub.isepub():
            return failed(self.book["name"])

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            return failed(self.book["name"])

        # The language must be extracted from the EPUB, or else the DOCX
        # default language (nb) will be used in the converted file.
        language = ""
        try:
            language = epub.meta("dc:language")
        except Exception:
            pass

        # ---------- make a copy of the EPUB ----------

        # Keep a reference to the TemporaryDirectory object for the whole
        # method; the directory is removed once the object is discarded.
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self, temp_epubdir)

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            return failed(self.book["name"], epubTitle)
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # href of the manifest item referenced by the first spine entry
        html_hrefs = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        html_file = html_hrefs[0] if html_hrefs else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            return failed(self.book["name"], epubTitle)
        html_file = os.path.join(os.path.dirname(opf_path), html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            return failed(self.book["name"], epubTitle)

        # Keep a reference to the NamedTemporaryFile object; only its path is
        # handed to the XSLT step.
        temp_xml_file_obj = tempfile.NamedTemporaryFile()
        temp_xml_file = temp_xml_file_obj.name

        self.utils.report.info(
            "Konverterer fra ASCIIMath til norsk punktnotasjon…")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NLBpubToDocx.uid,
                                            "nordic-asciimath-epub.xsl"),
                    source=html_file,
                    target=temp_xml_file)
        if not xslt.success:
            # Mark the report as failed here too, like the other failure paths.
            return failed(epub.identifier(), epubTitle)
        shutil.copy(temp_xml_file, html_file)

        # ---------- convert the HTML file to DOCX ----------

        temp_docxdir_obj = tempfile.TemporaryDirectory()
        temp_docxdir = temp_docxdir_obj.name

        try:
            calibre_docx = os.path.join(temp_docxdir,
                                        epub.identifier() + "_calibre.docx")
            final_docx = os.path.join(temp_docxdir,
                                      epub.identifier() + ".docx")

            self.utils.report.info("Konverterer fra XHTML til DOCX...")
            command = [
                "/usr/bin/ebook-convert",
                html_file,
                calibre_docx,
                "--chapter=/",
                "--chapter-mark=none",
                "--page-breaks-before=/",
                "--no-chapters-in-toc",
                "--toc-threshold=0",
                "--docx-page-size=a4",
                "--extra-css=" +
                os.path.join(Xslt.xslt_dir, self.uid, 'extra.css'),

                # NOTE: microsoft fonts must be installed:
                # sudo apt-get install ttf-mscorefonts-installer
                "--embed-font-family=Verdana",
                "--docx-page-margin-top=42",
                "--docx-page-margin-bottom=42",
                "--docx-page-margin-left=70",
                "--docx-page-margin-right=56",
                "--base-font-size=13",
                "--font-size-mapping=13,13,13,13,13,13,13,13",
            ]
            # Only pass --language when one is known; an empty string would be
            # handed to ebook-convert as an extra (empty) argument.
            if language:
                command.append("--language=" + language)
            process = self.utils.filesystem.run(command)

            if process.returncode != 0:
                self.utils.report.error(
                    "En feil oppstod ved konvertering til DOCX for " +
                    epub.identifier())
                self.utils.report.debug(traceback.format_stack())
                return failed(self.book["name"], epubTitle)

            self.utils.report.info("Boken ble konvertert.")

            # ------------- script from kvile ---------------
            document = Document(calibre_docx)
            normalParagraph = "Normal"
            normalParagraphNoIndent = "NormalNoIndent"
            headingIndent = Cm(1.25)
            fontSize = Pt(13)
            indent = Cm(0.44)
            hangingIndentList = Cm(0.63)
            folder = Path(temp_docxdir)

            document.styles[normalParagraph].font.size = fontSize
            document.styles[
                normalParagraph].paragraph_format.first_line_indent = indent
            styleNoIndent = document.styles.add_style(
                'NormalNoIndent', WD_STYLE_TYPE.PARAGRAPH)
            styleNoIndent.base_style = document.styles[normalParagraph]
            document.styles[
                normalParagraphNoIndent].paragraph_format.first_line_indent = Cm(
                    0)

            # Delete empty text elements. This is a document-wide query, so
            # it is done once up front instead of once per paragraph.
            for emptyTextElement in document.element.xpath("//w:t[. = '']"):
                remove_element(emptyTextElement)

            # Set style to normal for regular paragraphs, set keep_with_next
            # to false, remove multiple empty paragraphs, and remove empty p
            # after page nr or heading.
            emptyParagraph = False
            for paragraph in document.paragraphs:
                paragraph.paragraph_format.keep_with_next = None
                if re.match("Para 0[1-9]|[0-9] Block|Para [0-9]",
                            paragraph.style.name
                            ) and paragraph.style.font.underline != True:
                    paragraph.style = normalParagraph
                if (len(paragraph.text) <= 1
                        or re.match(r"^--- \d+ til ", paragraph.text)
                        or paragraph.style.name[0:7] == "Heading"):
                    # empty p, page nr or heading:
                    # remove a space at the beginning of the paragraph
                    paragraph.text = re.sub(r"^\s(.*)", r"\1", paragraph.text)
                    # drop it if the previous p also was empty, page nr or heading
                    if len(paragraph.text) == 0 and emptyParagraph:
                        remove_element(paragraph._element)
                    emptyParagraph = True
                else:
                    emptyParagraph = False
                    if re.match(r"^\s*STATPED_DUMMYTEXT_LI_OL\s*$",
                                paragraph.text):
                        paragraph.text = ""

            # No indent after heading, page nr, or paragraphs starting with
            # "Bilde: ", paragraphs in only bold (text=^_[^_]*_$) and the
            # paragraph after p in only bold, or on empty p.
            removeIndent = False
            for paragraph in document.paragraphs:
                # remove space at beginning of line after <br/>
                spaceAfterBreakList = paragraph._element.xpath(
                    r'w:r/w:br[@w:clear="none"]/following::w:t[@xml:space="preserve"][1]'
                )
                if len(spaceAfterBreakList) > 0:
                    for spaceAfterBreakElement in spaceAfterBreakList:
                        if re.match(
                                '^ ', spaceAfterBreakElement.text
                        ) and not (spaceAfterBreakElement.xpath(
                                r'preceding-sibling::*[1][self::w:t]')):
                            spaceAfterBreakElement.text = re.sub(
                                r"^ ", r"", spaceAfterBreakElement.text)
                    # remove break before paragraph end
                    breakBeforeParagraphEndList = paragraph._element.xpath(
                        r'w:r[last()]/w:br[@w:clear="none" and not(following-sibling::*)]'
                    )
                    if len(breakBeforeParagraphEndList) > 0:
                        remove_element(breakBeforeParagraphEndList[0])

                t = paragraph.text.strip()
                if re.match(
                        r"^Bilde: |^Forklaring: |^--- \d+ til |^_[^_]*_$|^STATPED_DUMMYTEXT_LIST_UNSTYLED|^STATPED_DUMMYTEXT_P_BEFORE_DL",
                        t) or ((removeIndent or len(t) == 0)
                               and paragraph.style.name == "Normal"):
                    paragraph.style = normalParagraphNoIndent
                # remove dummy text and set a hanging indent
                if re.match(
                        r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL)",
                        paragraph.text):
                    paragraph.paragraph_format.left_indent = hangingIndentList
                    paragraph.paragraph_format.first_line_indent = -hangingIndentList
                if re.match(r"^STATPED_DUMMYTEXT", paragraph.text):
                    paragraph.text = re.sub(
                        r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL|STATPED_DUMMYTEXT_P_BEFORE_DL)",
                        "", paragraph.text)
                removeIndent = bool(
                    len(t) == 0 or paragraph.style.name[0:7] == "Heading"
                    or re.match(r"^--- \d+ til |^_[^_]*_$", t))

            # Remove bold from headings and collect the "Para " styles that
            # are not underlined.
            paraStylesWithoutUnderline = []
            for style in document.styles:
                if style.name[0:7] == "Heading":
                    style.font.bold = None
                    style.paragraph_format.left_indent = headingIndent
                    style.paragraph_format.first_line_indent = -headingIndent
                    style.paragraph_format.space_before = Pt(0)
                    style.paragraph_format.space_after = Pt(0)
                    spacing = style._element.xpath(r'w:pPr/w:spacing')[0]
                    spacing.set(qn('w:beforeLines'), "0")
                    spacing.set(qn('w:afterLines'), "0")
                if style.name[0:5] == "Para ":
                    if style.font.underline != True:
                        paraStylesWithoutUnderline.append(style.name)

            # Find all table paragraphs using one of those styles and switch
            # them to the no-indent style. This is done on the XML directly:
            # per the original author's note, python-docx's table API raises
            # InvalidXmlError here because the tables lack <w:tblGrid>.
            paraStylesInTables = []
            for paraStyleWithoutUnderline in paraStylesWithoutUnderline:
                paraStylesInTables.extend(
                    document.element.xpath(
                        "//w:tbl//w:p//w:pStyle[@w:val = '" +
                        paraStyleWithoutUnderline + "']"))
            for paraStyleInTables in paraStylesInTables:
                paraStyleInTables.attrib[qn('w:val')] = normalParagraphNoIndent

            document.save(final_docx)
            self.utils.report.info("Temp-fil ble lagret: " + final_docx)

            # Unpack the DOCX, patch the list definitions and pack it again.
            extracted = folder / "temp"
            with zipfile.ZipFile(final_docx) as zipDocument:
                zipDocument.extractall(extracted)

            # Read and write the XML explicitly as UTF-8 instead of relying on
            # the locale's default encoding, and close the file in between.
            # NOTE(review): assumes numbering.xml is UTF-8 encoded - confirm.
            numbering_path = extracted / "word" / "numbering.xml"
            with open(numbering_path, "r", encoding="utf-8") as xmlFile:
                xmlText = xmlFile.read()
            xmlText = re.sub(r'w:left="1152"', r'w:left="360"', xmlText)
            xmlText = re.sub(r'w:left="1512"', r'w:left="720"', xmlText)
            xmlText = re.sub(r'w:left="1872"', r'w:left="1080"', xmlText)
            xmlText = re.sub(
                r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%([1-9])\."/>',
                r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%\1)"/>',
                xmlText)  # a. as a) in lists
            with open(numbering_path, "w", encoding="utf-8") as xmlFile:
                xmlFile.write(xmlText)

            zipdir(str(extracted), final_docx)
            # ---------- end script from kvile -------

        except subprocess.TimeoutExpired:
            self.utils.report.error(
                "Det tok for lang tid å konvertere " + epub.identifier() +
                " til DOCX, og Calibre-prosessen ble derfor stoppet.")
            return failed(self.book["name"], epubTitle)

        except Exception:
            self.utils.report.error(
                "En feil oppstod ved konvertering til DOCX for " +
                epub.identifier())
            self.utils.report.info(traceback.format_exc(), preformatted=True)
            return failed(self.book["name"], epubTitle)

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_docxdir, epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " ble konvertert 👍😄" + epubTitle
        return True
Ejemplo n.º 3
0
    def on_book(self):
        """Run ``prepare-for-docx.xsl`` on a copy of the EPUB and archive the copy.

        The EPUB in ``self.book["source"]`` is validated and copied to a
        temporary directory. The first spine document of the copy is replaced
        by the XSLT output, and the whole copy is then stored with
        ``self.utils.filesystem.storeBook`` under the EPUB identifier.

        Returns:
            bool: ``True`` when the copy was stored. ``False`` on every
            failure path; the report title is set to a failure message first.
        """
        report = self.utils.report

        def give_up(label, suffix=""):
            # Shared tail of all failure paths: set the headline, report False.
            report.title = self.title + ": " + label + " feilet 😭👎" + suffix
            return False

        report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(report, self.book["source"])

        # Best effort: the title is only appended to the report headline.
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            epubTitle = ""

        # check that this is an EPUB
        if not epub.isepub():
            return give_up(self.book["name"])

        if not epub.identifier():
            report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            return give_up(self.book["name"])

        # ---------- make a copy of the EPUB ----------

        # The handle is kept in a local so the directory stays alive until
        # this method returns.
        workdir_handle = tempfile.TemporaryDirectory()
        workdir = workdir_handle.name
        Filesystem.copy(report, self.book["source"], workdir)
        epub_copy = Epub(self, workdir)

        # ---------- adapt the HTML file with XSLT ----------

        relative_opf = epub_copy.opf_path()
        if not relative_opf:
            report.error(self.book["name"] +
                         ": Klarte ikke å finne OPF-fila i EPUBen.")
            return give_up(self.book["name"], epubTitle)
        opf_file = os.path.join(workdir, relative_opf)
        package = ElementTree.parse(opf_file).getroot()

        # href of the manifest item referenced by the first spine entry
        hrefs = package.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        if not hrefs or not hrefs[0]:
            report.error(self.book["name"] +
                         ": Klarte ikke å finne HTML-fila i OPFen.")
            return give_up(self.book["name"], epubTitle)

        content_doc = os.path.join(os.path.dirname(opf_file), hrefs[0])
        if not os.path.isfile(content_doc):
            report.error(self.book["name"] +
                         ": Klarte ikke å finne HTML-fila.")
            return give_up(self.book["name"], epubTitle)

        # Only the path of the temporary file is passed on; the handle is
        # kept in a local so the file exists for the rest of the method.
        scratch_handle = tempfile.NamedTemporaryFile()
        scratch_path = scratch_handle.name

        stylesheet = os.path.join(Xslt.xslt_dir, PrepareForDocx.uid,
                                  "prepare-for-docx.xsl")
        transform = Xslt(self,
                         stylesheet=stylesheet,
                         source=content_doc,
                         target=scratch_path)
        if not transform.success:
            return give_up(epub.identifier(), epubTitle)

        # Overwrite the content document in the copy with the XSLT output.
        shutil.copy(scratch_path, content_doc)

        archived_path, stored = self.utils.filesystem.storeBook(
            workdir, epub.identifier())
        report.attachment(None, archived_path, "DEBUG")
        report.title = (self.title + ": " + epub.identifier() +
                        " ble konvertert 👍😄" + epubTitle)
        return True
    def on_book(self):
        self.utils.report.attachment(None, self.book["source"], "DEBUG")

        self.utils.report.info("Locating HTML file")
        epub = Epub(self.utils.report, self.book["source"])
        if not epub.isepub():
            return False
        assert epub.isepub(), "The input must be an EPUB"
        spine = epub.spine()
        if not len(spine) == 1:
            self.utils.report.warn(
                "There must only be one item in the EPUB spine")
            return False
        html_file = os.path.join(self.book["source"],
                                 os.path.dirname(epub.opf_path()),
                                 spine[0]["href"])

        identifier = epub.identifier()

        self.utils.report.info("lag en kopi av boka")
        temp_resultdir_obj = tempfile.TemporaryDirectory()
        temp_resultdir = temp_resultdir_obj.name
        Filesystem.copy(self.utils.report, os.path.dirname(html_file),
                        temp_resultdir)
        temp_result = os.path.join(temp_resultdir, identifier + ".xml")

        self.utils.report.info("sletter EPUB-spesifikke filer")
        for root, dirs, files in os.walk(temp_resultdir):
            for file in files:
                if Path(file).suffix.lower() in [
                        ".xhtml", ".html", ".smil", ".mp3", ".wav", ".opf"
                ]:
                    os.remove(os.path.join(root, file))
        shutil.copy(html_file, temp_result)

        temp_xslt_output_obj = tempfile.NamedTemporaryFile()
        temp_xslt_output = temp_xslt_output_obj.name

        # MATHML to stem
        self.utils.report.info("Erstatter evt. MathML i boka...")
        mathml_validation = Mathml_validator(self, source=temp_result)
        if not mathml_validation.success:
            return False

        mathML_result = Mathml_to_text(self,
                                       source=temp_result,
                                       target=temp_result)

        if not mathML_result.success:
            return False

        self.utils.report.info("Fikser Webarch-oppmerking")
        self.utils.report.debug("webarch-fixup.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToNarrationEpub.uid,
                                            "webarch-fixup.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        self.utils.report.info("Setter inn lydbokavtalen...")
        self.utils.report.debug("bokinfo-tts-dtbook.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToTtsDtbook.uid,
                                            "bokinfo-tts-dtbook.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        creative_work_metadata = None
        timeout = 0

        while creative_work_metadata is None and timeout < 5:

            timeout = timeout + 1
            creative_work_metadata = Metadata.get_creative_work_from_api(
                identifier,
                editions_metadata="all",
                use_cache_if_possible=True,
                creative_work_metadata="all")
            if creative_work_metadata is not None:
                if creative_work_metadata["magazine"] is True:
                    self.utils.report.info(
                        "Fjerner sidetall fordi det er et tidsskrift...")
                    self.utils.report.debug("remove-pagenum.xsl")
                    self.utils.report.debug("    source = " + temp_result)
                    self.utils.report.debug("    target = " + temp_xslt_output)
                    xslt = Xslt(self,
                                stylesheet=os.path.join(
                                    Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                                    "remove-pagenum.xsl"),
                                source=temp_result,
                                target=temp_xslt_output)
                    if not xslt.success:
                        return False
                    shutil.copy(temp_xslt_output, temp_result)
                break

        if creative_work_metadata is None:
            self.utils.report.warning(
                "Klarte ikke finne et åndsverk tilknyttet denne utgaven. Konverterer likevel."
            )

        library = epub.meta("schema:library")
        library = library.upper() if library else library
        logo = os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                            "{}_logo.png".format(library))

        if os.path.isfile(logo):
            # epub_dir = os.path.join(temp_resultdir, "EPUB")
            image_dir = os.path.join(temp_resultdir, "images")
            if not os.path.isdir(image_dir):
                os.mkdir(image_dir)
            shutil.copy(logo, image_dir)

        self.utils.report.info("Konverterer fra XHTML5 til DTBook...")
        self.utils.report.debug("html-to-dtbook.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToTtsDtbook.uid,
                                            "html-to-dtbook.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        self.utils.report.info("Gjør tilpasninger i DTBook")
        self.utils.report.debug("dtbook-cleanup.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToTtsDtbook.uid,
                                            "dtbook-cleanup.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        # Fjern denne transformasjonen hvis det oppstår kritiske proplemer med håndteringen av komplekst innhold
        self.utils.report.info(
            "Legger inn ekstra informasjon om komplekst innhold")
        self.utils.report.debug("optimaliser-komplekst-innhold.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(
                        Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                        "optimaliser-komplekst-innhold.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        self.utils.report.info("Validerer DTBook...")
        # NOTE: This RelaxNG schema assumes that we're using DTBook 2005-3 and MathML 3.0
        dtbook_relax = Relaxng(
            self,
            relaxng=os.path.join(
                Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                "dtbook-schema/rng/dtbook-2005-3.mathml-3.integration.rng"),
            source=temp_result)
        dtbook_sch = Schematron(self,
                                schematron=os.path.join(
                                    Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                                    "dtbook-schema/sch/dtbook.mathml.sch"),
                                source=temp_result)
        if not dtbook_relax.success:
            self.utils.report.error("Validering av DTBook feilet (RelaxNG)")
        if not dtbook_sch.success:
            self.utils.report.error("Validering av DTBook feilet (Schematron)")
        if not dtbook_relax.success or not dtbook_sch.success:
            tempfile_stored = os.path.join(self.utils.report.reportDir(),
                                           os.path.basename(temp_result))
            shutil.copy(temp_result, tempfile_stored)
            self.utils.report.info(
                f"Validering av DTBook feilet, lagrer temp fil for feilsøking: {tempfile_stored}"
            )
            self.utils.report.attachment(None, tempfile_stored, "DEBUG")
            return False

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til DTBook-arkiv.")
        archived_path, stored = self.utils.filesystem.storeBook(
            temp_resultdir, identifier)
        self.utils.report.attachment(None, archived_path, "DEBUG")
        return True
Ejemplo n.º 5
0
    def on_book(self):
        """Convert the NLBPUB in ``self.book["source"]`` to a multi-file EPUB.

        The source is copied to a temporary directory, the single HTML file
        referenced first in the OPF spine is flattened and split into several
        XHTML files with XSLT, the OPF and the navigation document are
        regenerated, and the result is validated with Epubcheck when it is
        available. On success the result is passed to ``storeBook`` and
        ``Bibliofil.book_available`` is called.

        Returns:
            bool: True when the book was converted and stored. False on any
            failure; the report title is set to a failure message first.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # Suffix for the report title. Best effort: stays empty when dc:title
        # cannot be read (e.g. meta() returns None and the concatenation fails).
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # Check that this is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        # ---------- make a copy of the EPUB ----------

        # Keep a reference to the TemporaryDirectory object for the whole
        # method; the directory is removed when the object is finalized.
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self.utils.report, temp_epubdir)

        # ---------- adapt the HTML file with XSLT ----------

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # href of the manifest item referenced by the first spine entry
        html_file = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        html_file = html_file[0] if html_file else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        html_file = os.path.join(os.path.dirname(opf_path), html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False

        # Scratch file used as the XSLT target; results are copied back over
        # the real file after each successful transformation.
        temp_xml_obj = tempfile.NamedTemporaryFile()
        temp_xml = temp_xml_obj.name

        self.utils.report.info("Flater ut NLBPUB")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "nlbpub-flatten.xsl"),
                    source=html_file,
                    target=temp_xml)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, html_file)

        self.utils.report.info("Deler opp NLBPUB i flere HTML-filer")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "nlbpub-split.xsl"),
                    source=html_file,
                    target=temp_xml,
                    parameters={"output-dir": os.path.dirname(html_file)})
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        # The split step is given the HTML directory as "output-dir"; the
        # original single-file HTML is no longer wanted after it.
        os.remove(html_file)

        # Every XHTML file in the HTML directory, in sorted filename order,
        # except the navigation document and the original single-file HTML.
        excluded_hrefs = ("nav.xhtml", os.path.basename(html_file))
        spine_hrefs = [
            href for href in sorted(os.listdir(os.path.dirname(html_file)))
            if href.endswith(".xhtml") and href not in excluded_hrefs
        ]

        self.utils.report.info("Oppdaterer OPF-fil")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "update-opf.xsl"),
                    source=opf_path,
                    target=temp_xml,
                    parameters={"spine-hrefs": ",".join(spine_hrefs)})
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, opf_path)

        nav_path = os.path.join(temp_epubdir, temp_epub.nav_path())

        self.utils.report.info("Lager nytt navigasjonsdokument")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "generate-nav.xsl"),
                    source=opf_path,
                    target=nav_path)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False

        self.utils.report.info("Legger til properties i OPF etter behov")
        temp_epub.update_opf_properties()

        # Validation is skipped with a warning when Epubcheck is unavailable.
        if Epubcheck.isavailable():
            epubcheck = Epubcheck(self, opf_path)
            if not epubcheck.success:
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " feilet 😭👎" + epubTitle
                return False
        else:
            self.utils.report.warn(
                "Epubcheck not available, EPUB will not be validated!")

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til e-bok-arkiv.")

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_epubdir, temp_epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        Bibliofil.book_available(NlbpubToEpub.publication_format,
                                 temp_epub.identifier())
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " ble konvertert 👍😄" + epubTitle
        return True
Ejemplo n.º 6
0
    def on_book(self):
        """Prepare the NLBPUB in ``self.book["source"]`` for e-book distribution.

        The source is copied to a temporary directory. MathML is validated and
        replaced, hidden headlines are created and the content is adapted with
        XSLT. A library-specific logo (when one exists) and the e-book
        stylesheet are copied in and registered in the OPF manifest. If the
        OPF declares no cover image, one is looked up through the NLB API.
        The result is validated with Epubcheck when available and then passed
        to ``storeBook``.

        Returns:
            bool: True when the book was converted and stored, False on any
            handled failure.

        Raises:
            Exception: if the API answers 200 without a ``data`` member.
            Errors raised by ``requests`` (including timeouts) propagate.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # Suffix for the report title. Best effort: stays empty when dc:title
        # cannot be read (e.g. meta() returns None and the concatenation fails).
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # Check that this is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        # ---------- make a copy of the EPUB ----------

        # Keep a reference to the TemporaryDirectory object for the whole
        # method; the directory is removed when the object is finalized.
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self.utils.report, temp_epubdir)

        # ---------- adapt the HTML file with XSLT ----------

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # href of the manifest item referenced by the first spine entry
        html_file = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        html_file = html_file[0] if html_file else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        html_dir = os.path.dirname(opf_path)
        html_file = os.path.join(html_dir, html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False

        # Scratch file used as the XSLT target; results are copied back over
        # the real file after each successful transformation.
        temp_xml_obj = tempfile.NamedTemporaryFile()
        temp_xml = temp_xml_obj.name

        # MATHML to stem
        self.utils.report.info("Erstatter evt. MathML i boka...")
        mathml_validation = Mathml_validator(self, source=html_file)
        if not mathml_validation.success:
            self.utils.report.error(
                "NLBPUB contains MathML errors, aborting...")
            return False

        mathML_result = Mathml_to_text(self,
                                       source=html_file,
                                       target=html_file)

        if not mathML_result.success:
            return False

        self.utils.report.info(
            "Lager skjulte overskrifter der det er nødvendig")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                            "create-hidden-headlines.xsl"),
                    source=html_file,
                    target=temp_xml,
                    parameters={
                        "cover-headlines": "from-type",
                        "frontmatter-headlines": "from-type",
                        "bodymatter-headlines": "from-text",
                        "backmatter-headlines": "from-type"
                    })
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, html_file)

        self.utils.report.info("Tilpasser innhold for e-bok...")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                            "prepare-for-ebook.xsl"),
                    source=html_file,
                    target=temp_xml)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, html_file)

        # Use library-specific logo and stylesheet if available

        library = temp_epub.meta("schema:library")
        library = library.upper() if library else library
        logo = os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                            "{}_logo.png".format(library))

        has_logo = os.path.isfile(logo)
        if has_logo:
            shutil.copy(logo, os.path.join(html_dir, os.path.basename(logo)))

        PrepareForEbook.update_css()

        stylesheet = PrepareForEbook.css_tempfile_obj.name
        if library is not None and library.lower() == "statped":
            stylesheet = PrepareForEbook.css_tempfile_statped_obj.name
        shutil.copy(stylesheet, os.path.join(html_dir, "ebok.css"))

        # Only register the logo when it was actually copied; otherwise the
        # manifest would reference a file that is not in the publication
        # (for a missing library the name would be "None_logo.png").
        if has_logo:
            self.utils.report.info("Legger til logoen i OPF-manifestet")
            xslt = Xslt(self,
                        stylesheet=os.path.join(Xslt.xslt_dir,
                                                PrepareForEbook.uid,
                                                "add-to-opf-manifest.xsl"),
                        source=opf_path,
                        target=temp_xml,
                        parameters={
                            "href": os.path.basename(logo),
                            "media-type": "image/png"
                        })
            if not xslt.success:
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " feilet 😭👎" + epubTitle
                return False
            shutil.copy(temp_xml, opf_path)

        self.utils.report.info("Legger til CSS-fila i OPF-manifestet")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                            "add-to-opf-manifest.xsl"),
                    source=opf_path,
                    target=temp_xml,
                    parameters={
                        "href": "ebok.css",
                        "media-type": "text/css"
                    })
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, opf_path)

        # add cover if missing

        opf_xml = ElementTree.parse(opf_path).getroot()
        cover_id = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[contains(concat(' ', @properties, ' '), ' cover-image ')]/@id"
        )  # from properties
        if not cover_id:
            # The legacy <meta name="cover" content="..."/> element is a child
            # of <metadata>, not <manifest>.
            cover_id = opf_xml.xpath(
                "/*/*[local-name()='metadata']/*[@name='cover']/@content"
            )  # from metadata
        if not cover_id:
            cover_id = opf_xml.xpath(
                "/*/*[local-name()='manifest']/*[starts-with(@media-type, 'image/') and contains(@href, 'cover')]/@id"
            )  # from filename
        cover_id = cover_id[0] if cover_id else None

        if not cover_id:
            # cover not found in the book, let's try NLBs API

            # requests has no default timeout; without one a stalled server
            # would block this pipeline indefinitely.
            request_timeout = 60  # seconds

            # NOTE: identifier at this point is the e-book identifier
            edition_url = "{}/editions/{}?creative-work-metadata=none&edition-metadata=all".format(
                Config.get("nlb_api_url"), epub.identifier())

            self.utils.report.debug(
                "looking for cover image in: {}".format(edition_url))
            response = requests.get(edition_url, timeout=request_timeout)
            if response.status_code == 200:
                response_json = response.json()
                if "data" not in response_json:
                    self.utils.report.debug("response as JSON:")
                    self.utils.report.debug(str(response_json))
                    raise Exception(
                        "No 'data' in response: {}".format(edition_url))
                data = response_json["data"]
                # A missing key is treated like an explicit null: no cover.
                cover_url = data.get("coverUrlLarge")
                if cover_url is not None and cover_url.startswith("http"):
                    response = requests.get(cover_url,
                                            timeout=request_timeout)
                    if response.status_code == 200:
                        _, extension = os.path.splitext(cover_url)
                        target_href = "cover" + extension
                        target_dir = os.path.dirname(opf_path)
                        with open(os.path.join(target_dir, target_href),
                                  "wb") as target_file:
                            target_file.write(response.content)

                        self.utils.report.info(
                            "Legger til bildet av bokomslaget i OPF-manifestet"
                        )
                        # check for png, just in case. Should always be jpg though.
                        if extension.lower() == ".png":
                            media_type = "image/png"
                        else:
                            media_type = "image/jpeg"
                        xslt = Xslt(self,
                                    stylesheet=os.path.join(
                                        Xslt.xslt_dir, PrepareForEbook.uid,
                                        "add-to-opf-manifest.xsl"),
                                    source=opf_path,
                                    target=temp_xml,
                                    parameters={
                                        "href": target_href,
                                        "media-type": media_type
                                    })
                        if not xslt.success:
                            self.utils.report.title = self.title + ": " + epub.identifier(
                            ) + " feilet 😭👎" + epubTitle
                            return False
                        shutil.copy(temp_xml, opf_path)

                        opf_xml = ElementTree.parse(opf_path).getroot()
                        cover_id = opf_xml.xpath(
                            "/*/*[local-name()='manifest']/*[@href = '{}']/@id"
                            .format(target_href))  # from filename
                        cover_id = cover_id[0] if cover_id else None

        # A missing cover is only a warning; the conversion continues.
        if not cover_id:
            self.utils.report.warn(
                "Klarte ikke å finne bilde av bokomslaget for {}".format(
                    epub.identifier()))

        self.utils.report.info("Legger til properties i OPF etter behov")
        temp_epub.update_opf_properties()

        # validate with epubcheck
        if Epubcheck.isavailable():
            epubcheck = Epubcheck(self, opf_path)
            if not epubcheck.success:
                # Keep copies of the OPF and the HTML file in the report
                # directory so the failure can be investigated.
                tempfile_stored_opf = os.path.join(
                    self.utils.report.reportDir(), os.path.basename(opf_path))
                shutil.copy(opf_path, tempfile_stored_opf)
                tempfile_stored = os.path.join(self.utils.report.reportDir(),
                                               os.path.basename(html_file))
                shutil.copy(html_file, tempfile_stored)
                self.utils.report.info(
                    f"Validering av EPUB feilet, lagrer temp fil for feilsøking: {tempfile_stored}"
                )
                self.utils.report.attachment(None, tempfile_stored, "DEBUG")
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " feilet 😭👎" + epubTitle
                return False
        else:
            self.utils.report.warn(
                "Epubcheck er ikke tilgjengelig, EPUB blir ikke validert!")

        # ---------- store the fileset ----------

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til HTML-arkiv.")

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_epubdir, epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " ble konvertert 👍😄" + epubTitle
        return True
Ejemplo n.º 7
0
    def on_book(self):
        """Convert the Nordic EPUB in ``self.book["source"]`` to NLBPUB.

        Two temporary copies of the EPUB are made: one untouched and one
        whose files under ``EPUB/images`` are replaced by empty files. The
        second copy has its nav and OPF cleaned with XSLT, is zipped and run
        through the ``nordic-epub3-to-html`` Daisy Pipeline job. The resulting
        HTML is cleaned with three further XSLT steps, wrapped as an EPUB via
        ``Epub.from_html`` and the real image files are copied back in before
        the result is passed to ``storeBook``.

        Returns:
            bool: True when the book was converted and stored, False on any
            handled failure.
        """
        report = self.utils.report
        report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(report, self.book["source"])

        # Suffix for the final report title; left empty if dc:title cannot
        # be read (any exception is deliberately ignored here).
        title_suffix = ""
        try:
            title_suffix = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # Check that this is an EPUB
        if not epub.isepub():
            return False

        if not epub.identifier():
            report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            return False

        # The part of the file name before the first "." must equal the
        # dc:identifier.
        name_stem = self.book["name"].split(".")[0]
        if epub.identifier() != name_stem:
            report.error(
                self.book["name"] +
                ": Filnavn stemmer ikke overens med dc:identifier: {}".format(
                    epub.identifier()))
            return False

        # Scratch file used as the XSLT target; each result is copied back
        # over its source after a successful transformation.
        scratch_obj = tempfile.NamedTemporaryFile()
        scratch_xml = scratch_obj.name

        report.info("Lager en kopi av EPUBen")
        full_copy_obj = tempfile.TemporaryDirectory()
        full_copy_dir = full_copy_obj.name
        Filesystem.copy(report, self.book["source"], full_copy_dir)

        report.info("Lager en kopi av EPUBen med tomme bildefiler")
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(report, full_copy_dir, temp_epubdir)
        # Replace every image with an empty file of the same name
        # (presumably to keep the job upload small -- TODO confirm).
        stripped_images = os.path.join(temp_epubdir, "EPUB", "images")
        for folder, _subdirs, filenames in os.walk(stripped_images):
            for filename in filenames:
                image_path = os.path.join(folder, filename)
                os.remove(image_path)
                Path(image_path).touch()
        temp_epub = Epub(report, temp_epubdir)

        report.info("Rydder opp i nordisk EPUB nav.xhtml")
        nav_path = os.path.join(temp_epubdir, temp_epub.nav_path())
        nav_stylesheet = os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid,
                                      "nordic-cleanup-nav.xsl")
        spine_hrefs = " ".join([entry["href"] for entry in temp_epub.spine()])
        opf_base = os.path.dirname(
            os.path.join(temp_epubdir, temp_epub.opf_path())) + "/"
        xslt = Xslt(self,
                    stylesheet=nav_stylesheet,
                    source=nav_path,
                    target=scratch_xml,
                    parameters={
                        "cover": spine_hrefs,
                        "base": opf_base
                    })
        if not xslt.success:
            return False
        shutil.copy(scratch_xml, nav_path)

        report.info("Rydder opp i nordisk EPUB package.opf")
        opf_path = os.path.join(temp_epubdir, temp_epub.opf_path())
        opf_stylesheet = os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid,
                                      "nordic-cleanup-opf.xsl")
        xslt = Xslt(self,
                    stylesheet=opf_stylesheet,
                    source=opf_path,
                    target=scratch_xml)
        if not xslt.success:
            return False
        shutil.copy(scratch_xml, opf_path)

        html_dir_obj = tempfile.TemporaryDirectory()
        html_dir = html_dir_obj.name
        html_file = os.path.join(html_dir, epub.identifier() + ".xhtml")

        report.info("Finner ut hvilket bibliotek boka tilhører…")
        edition_metadata = Metadata.get_edition_from_api(epub.identifier(),
                                                         report=report)
        # Prefer the library from the edition metadata; otherwise derive it
        # from the identifier.
        has_library = (edition_metadata is not None
                       and edition_metadata["library"] is not None)
        if has_library:
            library = edition_metadata["library"]
        else:
            library = Metadata.get_library_from_identifier(
                epub.identifier(), report)
        report.info(f"Boka tilhører '{library}'")

        report.info("Zipper oppdatert versjon av EPUBen...")
        temp_epub.asFile(rebuild=True)

        report.info("Konverterer fra Nordisk EPUB 3 til Nordisk HTML 5...")
        epub_file = temp_epub.asFile()
        epub_filename = os.path.basename(epub_file)
        job_arguments = {"epub": epub_filename, "fail-on-error": "false"}
        supported_versions = [
            ("1.13.6", "1.4.6"),
            ("1.13.4", "1.4.5"),
            ("1.12.1", "1.4.2"),
            ("1.11.1-SNAPSHOT", "1.3.0"),
        ]
        with DaisyPipelineJob(self,
                              "nordic-epub3-to-html",
                              job_arguments,
                              pipeline_and_script_version=supported_versions,
                              context={os.path.basename(epub_file):
                                       epub_file}) as dp2_job_convert:
            if dp2_job_convert.status != "SUCCESS":
                report.error("Klarte ikke å konvertere boken")
                return False

            output_root = os.path.join(dp2_job_convert.dir_output,
                                       "output-dir")
            dp2_html_dir = os.path.join(output_root, epub.identifier())
            dp2_html_file = os.path.join(output_root, epub.identifier(),
                                         epub.identifier() + ".xhtml")

            if not os.path.isdir(dp2_html_dir):
                report.error("Finner ikke den konverterte boken: {}".format(
                    dp2_html_dir))
                return False

            if not os.path.isfile(dp2_html_file):
                report.error("Finner ikke den konverterte boken: {}".format(
                    dp2_html_file))
                report.info("Kanskje filnavnet er forskjellig fra IDen?")
                return False

            Filesystem.copy(report, dp2_html_dir, html_dir)

        # Three in-place clean-up passes over the converted HTML, each one
        # aborting the conversion if its transformation fails.
        html_cleanup_steps = (
            ("Rydder opp i nordisk HTML", "nordic-cleanup.xsl"),
            ("Rydder opp i ns0 i page-normal", "ns0-cleanup.xsl"),
            ("Rydder opp i innholdsfortegnelsen", "fix-toc-span.xsl"),
        )
        for message, stylesheet_name in html_cleanup_steps:
            report.info(message)
            xslt = Xslt(self,
                        stylesheet=os.path.join(Xslt.xslt_dir,
                                                NordicToNlbpub.uid,
                                                stylesheet_name),
                        source=html_file,
                        target=scratch_xml)
            if not xslt.success:
                return False
            shutil.copy(scratch_xml, html_file)

        report.info(
            "Legger til EPUB-filer (OPF, NAV, container.xml, mediatype)...")
        nlbpub_tempdir_obj = tempfile.TemporaryDirectory()
        nlbpub_tempdir = nlbpub_tempdir_obj.name

        nlbpub = Epub.from_html(self, html_dir, nlbpub_tempdir)
        if nlbpub is None:
            return False

        report.info("Erstatter tomme bildefiler med faktiske bildefiler")
        # Swap each file under EPUB/images for the file at the same relative
        # path in the untouched copy.
        result_images = os.path.join(nlbpub_tempdir, "EPUB", "images")
        for folder, _subdirs, filenames in os.walk(result_images):
            for filename in filenames:
                placeholder = os.path.join(folder, filename)
                relative = os.path.relpath(placeholder, nlbpub_tempdir)
                os.remove(placeholder)
                Filesystem.copy(report,
                                os.path.join(full_copy_dir, relative),
                                placeholder)
        temp_epub = Epub(report, temp_epubdir)

        nlbpub.update_prefixes()

        report.info("Boken ble konvertert. Kopierer til NLBPUB-arkiv.")
        archived_path, stored = self.utils.filesystem.storeBook(
            nlbpub.asDir(), temp_epub.identifier(), overwrite=self.overwrite)
        report.attachment(None, archived_path, "DEBUG")
        report.title = (self.title + ": " + epub.identifier() +
                        " ble konvertert 👍😄" + title_suffix)
        return True
Ejemplo n.º 8
0
    def on_book(self):
        """Convert the current book's EPUB into a narration-ready EPUB.

        The source EPUB (``self.book["source"]``) is copied to a temporary
        directory, its first spine document is run through a fixed sequence
        of XSLT stylesheets, the OPF metadata and the navigation document are
        regenerated from the resulting HTML, and the result is stored through
        ``self.utils.filesystem.storeBook`` as an ``.epub`` file.

        Progress and errors are reported through ``self.utils.report``; the
        report title is updated to reflect the outcome.

        Returns:
            True when the book was converted and stored, False on any
            failure (in which case the report title marks the failure).
        """
        # XPath selecting the href of the manifest item referenced by the
        # first spine entry, i.e. the main HTML document of the publication.
        spine_html_xpath = (
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )

        def fail(label, suffix=""):
            """Mark the report title as failed for *label* and return False."""
            self.utils.report.title = (self.title + ": " + label +
                                       " feilet 😭👎" + suffix)
            return False

        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # The title is only used to decorate the report title, so any problem
        # reading it is deliberately ignored (best effort).
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # check that this is an EPUB
        if not epub.isepub():
            return fail(self.book["name"])

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            return fail(self.book["name"])

        # ---------- make a copy of the EPUB ----------

        # The TemporaryDirectory object is kept in a local so the directory
        # stays alive for the rest of this method.
        narration_epubdir_obj = tempfile.TemporaryDirectory()
        narration_epubdir = narration_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"],
                        narration_epubdir)
        nlbpub = Epub(self.utils.report, narration_epubdir)

        # ---------- adapt the HTML file with XSLT ----------

        opf_path = nlbpub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            return fail(self.book["name"], epubTitle)
        opf_path = os.path.join(narration_epubdir, opf_path)

        xml = ElementTree.parse(opf_path).getroot()
        hrefs = xml.xpath(spine_html_xpath)
        html_file = hrefs[0] if hrefs else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            return fail(self.book["name"], epubTitle)

        html_file = os.path.join(os.path.dirname(opf_path), html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            return fail(self.book["name"], epubTitle)

        # Scratch file that receives the output of each XSLT stage before it
        # is copied back over the HTML file.
        temp_html_obj = tempfile.NamedTemporaryFile()
        temp_html = temp_html_obj.name

        # (progress message, stylesheet base dir, stylesheet sub dir, file).
        # The stages are applied strictly in this order; each one consumes
        # the output of the previous one.
        xslt_stages = (
            ("Fjerner elementer som ikke skal være med i lydboka...",
             NlbpubToNarrationEpub.xslt_dir, NlbpubToNarrationEpub.uid,
             "ta-vekk-innhold.xsl"),
            ("Fikser Webarch-oppmerking",
             Xslt.xslt_dir, NlbpubToNarrationEpub.uid,
             "webarch-fixup.xsl"),
            ("Fikser dikt-oppmerking",
             Xslt.xslt_dir, NlbpubToNarrationEpub.uid,
             "unwrap-poem-chapters.xsl"),
            ("Lager usynlige overskrifter der det trengs...",
             Xslt.xslt_dir, PrepareForEbook.uid,
             "create-hidden-headlines.xsl"),
            ("Tilpasser innhold for innlesing...",
             NlbpubToNarrationEpub.xslt_dir, NlbpubToNarrationEpub.uid,
             "prepare-for-narration.xsl"),
            ("Lager synkroniseringspunkter...",
             NlbpubToNarrationEpub.xslt_dir, NlbpubToNarrationEpub.uid,
             "lag-synkroniseringspunkter.xsl"),
            ("Gjør HTMLen litt penere...",
             Xslt.xslt_dir, Epub.uid,
             "pretty-print.xsl"),
        )

        for message, base_dir, sub_dir, stylesheet in xslt_stages:
            self.utils.report.info(message)
            self.utils.report.debug(stylesheet)
            self.utils.report.debug("    source = " + html_file)
            self.utils.report.debug("    target = " + temp_html)
            xslt = Xslt(self,
                        stylesheet=os.path.join(base_dir, sub_dir, stylesheet),
                        source=html_file,
                        target=temp_html)
            if not xslt.success:
                return fail(epub.identifier(), epubTitle)
            shutil.copy(temp_html, html_file)

        # ---------- replace metadata in the OPF with metadata from the HTML ----------

        temp_opf_obj = tempfile.NamedTemporaryFile()
        temp_opf = temp_opf_obj.name

        xslt = Epub.html_to_opf(self, opf_path, temp_opf)
        if not xslt.success:
            return fail(epub.identifier(), epubTitle)

        shutil.copy(temp_opf, opf_path)

        # ---------- get the new file name from the OPF (it changes based on the book number) ----------
        try:
            xml = ElementTree.parse(opf_path).getroot()
            hrefs = xml.xpath(spine_html_xpath)
            new_html_file = os.path.join(os.path.dirname(opf_path),
                                         hrefs[0]) if hrefs else None
        except Exception:
            self.utils.report.info(traceback.format_exc(), preformatted=True)
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            return fail(self.book["name"], epubTitle)

        # The regenerated OPF may lack a usable spine entry. Without this
        # guard the rename below would call shutil.copy() with None as the
        # destination and raise instead of reporting a failed conversion.
        if not new_html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            return fail(self.book["name"], epubTitle)

        if html_file != new_html_file:
            shutil.copy(html_file, new_html_file)
            os.remove(html_file)
            html_file = new_html_file

        # ---------- regenerate nav.xhtml ----------

        nav_path = nlbpub.nav_path()
        if not nav_path:
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å finne navigasjonsfila i OPFen.")
            return fail(self.book["name"], epubTitle)
        nav_path = os.path.join(narration_epubdir, nav_path)

        xslt = Epub.html_to_nav(self, html_file, nav_path)
        if not xslt.success:
            return fail(epub.identifier(), epubTitle)

        # ---------- add logo ----------
        # Only the STATPED library gets a logo copied next to the HTML file,
        # and only if the logo file actually exists.
        library = nlbpub.meta("schema:library")
        library = library.upper() if library else library
        if library == "STATPED":
            logo = os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                "{}_logo.png".format(library))
            if os.path.isfile(logo):
                shutil.copy(
                    logo,
                    os.path.join(os.path.dirname(html_file),
                                 os.path.basename(logo)))

        # ---------- save EPUB ----------

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til innlesingsklart EPUB-arkiv.")

        archived_path, stored = self.utils.filesystem.storeBook(
            nlbpub.asFile(),
            nlbpub.identifier(),
            file_extension="epub",
            move=True)
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " ble konvertert 👍😄" + epubTitle
        return True