Example #1
0
    def extract_content(self, output_dir):
        '''
        Unpack the book into ``output_dir`` and return the OPF path.

        The text pages are concatenated and converted with PML_HTMLizer,
        optional "Footnotes" and "Sidebar" sections are appended, and the
        result is written to ``index.html``. Every image record is written
        into an ``images/`` sub-directory. The value returned is whatever
        ``self.create_opf`` returns for the output directory, the list of
        image names and the table of contents from the HTMLizer.
        '''
        from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        output_dir = os.path.abspath(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        header = self.header_record

        book_title = self.mi.title
        if not isinstance(book_title, unicode_type):
            book_title = book_title.decode('utf-8', 'replace')

        # HTML is collected piecewise and joined once before writing.
        fragments = [u'<html><head><title>%s</title></head><body>' % book_title]

        # Text pages are numbered from 1 up to and including num_text_pages.
        text_pages = []
        for page_no in range(1, header.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % page_no)
            text_pages.append(self.get_text_page(page_no))
        pml = u''.join(text_pages)

        hizer = PML_HTMLizer()
        fragments.append(hizer.parse_pml(pml, 'index.html'))
        toc = hizer.get_toc()

        if header.footnote_count > 0:
            fragments.append('<br /><h1>%s</h1>' % _('Footnotes'))
            # The record at footnote_offset holds the ids: each id is a run
            # of word characters followed by a NUL byte.
            id_record = self.section_data(header.footnote_offset)
            codec = 'cp1252' if self.encoding is None else self.encoding
            footnote_ids = re.findall('\\w+(?=\x00)', id_record.decode(codec))
            first_record = header.footnote_offset + 1
            stop_record = header.footnote_offset + header.footnote_count
            for position, record in enumerate(range(first_record, stop_record)):
                self.log.debug('Extracting footnote page %i' % record)
                # Fall back to an empty id when the id list is too short.
                note_id = footnote_ids[position] if position < len(footnote_ids) else ''
                fragments.append(footnote_to_html(note_id, self.decompress_text(record)))

        if header.sidebar_count > 0:
            fragments.append('<br /><h1>%s</h1>' % _('Sidebar'))
            # Same layout as the footnotes: an id record followed by pages.
            id_record = self.section_data(header.sidebar_offset)
            codec = 'cp1252' if self.encoding is None else self.encoding
            sidebar_ids = re.findall('\\w+(?=\x00)', id_record.decode(codec))
            first_record = header.sidebar_offset + 1
            stop_record = header.sidebar_offset + header.sidebar_count
            for position, record in enumerate(range(first_record, stop_record)):
                self.log.debug('Extracting sidebar page %i' % record)
                bar_id = sidebar_ids[position] if position < len(sidebar_ids) else ''
                fragments.append(sidebar_to_html(bar_id, self.decompress_text(record)))

        fragments.append('</body></html>')
        html = u''.join(fragments)

        with CurrentDir(output_dir), open('index.html', 'wb') as index:
            self.log.debug('Writing text to index.html')
            index.write(html.encode('utf-8'))

        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)

        images = []
        with CurrentDir(images_dir):
            for image_no in range(header.num_image_pages):
                name, img = self.get_image(header.image_data_offset + image_no)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        return self.create_opf(output_dir, images, toc)
Example #2
0
    def extract_content(self, output_dir):
        '''
        Extract the book's text and images into ``output_dir``.

        The text pages are joined and converted to HTML with PML_HTMLizer.
        If the header reports footnotes or sidebars, a "Footnotes" and/or
        "Sidebar" section is appended. The HTML is written UTF-8 encoded to
        ``index.html`` and each image record is written to ``images/``
        inside ``output_dir`` (both directories are created when missing).

        :param output_dir: Directory that receives ``index.html`` and the
            ``images/`` folder; it is made absolute before use.
        :return: The value returned by ``self.create_opf`` for the output
            directory, the image names and the HTMLizer's table of contents.
        '''
        from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        title = self.mi.title
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so the
        # check does not rely on the Python 2-only ``unicode`` builtin.
        if not isinstance(title, type(u'')):
            title = title.decode('utf-8', 'replace')
        html = u'<html><head><title>%s</title></head><body>' % title

        pml = u''
        # Text pages are numbered 1..num_text_pages inclusive.
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)
        hizer = PML_HTMLizer()
        html += hizer.parse_pml(pml, 'index.html')
        toc = hizer.get_toc()

        if self.header_record.footnote_count > 0:
            html += '<br /><h1>%s</h1>' % _('Footnotes')
            # The record at footnote_offset lists the ids, each a run of word
            # characters terminated by a NUL byte. The backslash is escaped
            # explicitly: '\w' in a normal string is an invalid escape.
            footnoteids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
            # NOTE(review): this reads footnote_count - 1 pages, presumably
            # because the count includes the id record -- confirm.
            for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
                self.log.debug('Extracting footnote page %i' % i)
                # Map the page position to its id; use '' if ids run out.
                if fid < len(footnoteids):
                    fid = footnoteids[fid]
                else:
                    fid = ''
                html += footnote_to_html(fid, self.decompress_text(i))

        if self.header_record.sidebar_count > 0:
            html += '<br /><h1>%s</h1>' % _('Sidebar')
            # Same layout as the footnotes: an id record followed by pages.
            sidebarids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
                self.log.debug('Extracting sidebar page %i' % i)
                if sid < len(sidebarids):
                    sid = sidebarids[sid]
                else:
                    sid = ''
                html += sidebar_to_html(sid, self.decompress_text(i))

        html += '</body></html>'

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        # Images are written relative to the images/ directory; the names
        # come from self.get_image and are later handed to create_opf.
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images, toc)

        return opf_path