コード例 #1
0
ファイル: pdf_input.py プロジェクト: 089git/calibre
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts, self.log = options, log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(os.getcwdu(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        mi = get_metadata(stream, 'pdf')
        opf = OPFCreator(os.getcwdu(), mi)

        manifest = [(u'index.html', None)]

        images = os.listdir(os.getcwdu())
        images.remove('index.html')
        for i in images:
            manifest.append((i, None))
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine([u'index.html'])
        log.debug('Rendering manifest...')
        with open(u'metadata.opf', 'wb') as opffile:
            opf.render(opffile)

        return os.path.join(os.getcwdu(), u'metadata.opf')
コード例 #2
0
ファイル: pml_input.py プロジェクト: j-howell/calibre
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.metadata.toc import TOC
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.utils.zipfile import ZipFile

        self.options = options
        self.log = log
        pages, images = [], []
        toc = TOC()

        if file_ext == 'pmlz':
            log.debug('De-compressing content to temporary directory...')
            with TemporaryDirectory('_unpmlz') as tdir:
                zf = ZipFile(stream)
                zf.extractall(tdir)

                pmls = glob.glob(os.path.join(tdir, '*.pml'))
                for pml in pmls:
                    html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
                    html_path = os.path.join(getcwd(), html_name)

                    pages.append(html_name)
                    log.debug('Processing PML item %s...' % pml)
                    ttoc = self.process_pml(pml, html_path)
                    toc += ttoc
                images = self.get_images(stream, tdir, True)
        else:
            toc = self.process_pml(stream, 'index.html')
            pages.append('index.html')

            if hasattr(stream, 'name'):
                images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))

        # We want pages to be orded alphabetically.
        pages.sort()

        manifest_items = []
        for item in pages+images:
            manifest_items.append((item, None))

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Reading metadata from input file...')
        mi = get_metadata(stream, 'pml')
        if 'images/cover.png' in images:
            mi.cover = 'images/cover.png'
        opf = OPFCreator(getcwd(), mi)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest_items)
        opf.create_spine(pages)
        opf.set_toc(toc)
        with lopen('metadata.opf', 'wb') as opffile:
            with lopen('toc.ncx', 'wb') as tocfile:
                opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(getcwd(), 'metadata.opf')
コード例 #3
0
ファイル: reader.py プロジェクト: MarioJC/calibre
    def create_opf(self, output_dir, pages, images):
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = []
            for page in pages+images:
                manifest.append((page, None))

            opf.create_manifest(manifest)
            opf.create_spine(pages)
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')
コード例 #4
0
ファイル: reader202.py プロジェクト: 089git/calibre
    def create_opf(self, output_dir, images):
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images/', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')
コード例 #5
0
ファイル: reader132.py プロジェクト: JimmXinu/calibre
    def create_opf(self, output_dir, images, toc):
        with CurrentDir(output_dir):
            if 'cover.png' in images:
                self.mi.cover = os.path.join('images', 'cover.png')

            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            opf.set_toc(toc)
            with open('metadata.opf', 'wb') as opffile:
                with open('toc.ncx', 'wb') as tocfile:
                    opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(output_dir, 'metadata.opf')
コード例 #6
0
ファイル: pdf_input.py プロジェクト: j-howell/calibre
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts, self.log = options, log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        mi = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), mi)

        manifest = [('index.html', None)]

        images = os.listdir(getcwd())
        images.remove('index.html')
        for i in images:
            manifest.append((i, None))
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine(['index.html'])
        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opffile:
            opf.render(opffile)
        if os.path.exists('toc.ncx'):
            ncxid = opf.manifest.id_for_path('toc.ncx')
            if ncxid:
                with lopen('metadata.opf', 'r+b') as f:
                    raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
                    f.seek(0)
                    f.write(raw)

        return os.path.join(getcwd(), 'metadata.opf')
コード例 #7
0
ファイル: input.py プロジェクト: 089git/calibre
    def __call__(self, stream, odir, log):
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.metadata.odt import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator

        if not os.path.exists(odir):
            os.makedirs(odir)
        with CurrentDir(odir):
            log('Extracting ODT file...')
            stream.seek(0)
            mi = get_metadata(stream, 'odt')
            if not mi.title:
                mi.title = _('Unknown')
            if not mi.authors:
                mi.authors = [_('Unknown')]
            self.filter_load(stream, mi, log)
            html = self.xhtml()
            # A blanket img specification like this causes problems
            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
            # odf2xhtml creates empty title tag
            html = html.replace('<title></title>','<title>%s</title>'%(mi.title,))
            try:
                html = self.fix_markup(html, log)
            except:
                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(html.encode('utf-8'))
            zf = ZipFile(stream, 'r')
            self.extract_pictures(zf)
            opf = OPFCreator(os.path.abspath(os.getcwdu()), mi)
            opf.create_manifest([(os.path.abspath(f2), None) for f2 in
                walk(os.getcwdu())])
            opf.create_spine([os.path.abspath('index.xhtml')])
            with open('metadata.opf', 'wb') as f:
                opf.render(f)
            return os.path.abspath('metadata.opf')
コード例 #8
0
ファイル: processor.py プロジェクト: AEliu/calibre
def opf_writer(path, opf_name, manifest, spine, mi):
    opf = OPFCreator(path, mi)
    opf.create_manifest(manifest)
    opf.create_spine(spine)
    with open(os.path.join(path, opf_name), 'wb') as opffile:
        opf.render(opffile)
コード例 #9
0
    def convert(self, stream, opts, file_ext, log, accelerators):
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log= opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        for i, x in enumerate(comics_):
            title, fname = x
            cdir = u'comic_%d'%(i+1) if len(comics_) > 1 else u'.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s'%stream.name)

        mi  = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
            [_('Unknown')])
        opf = OPFCreator(os.getcwdu(), mi)
        entries = []

        def href(x):
            if len(comics) == 1:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        for comic in comics:
            pages, wrappers = comic[1:]
            entries += [(w, None) for w in map(href, wrappers)] + \
                    [(x, None) for x in map(href, pages)]
        opf.create_manifest(entries)
        spine = []
        for comic in comics:
            spine.extend(map(href, comic[2]))
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)
        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            for i, x in enumerate(wrappers):
                toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                        play_order=i)
        else:
            po = 0
            for comic in comics:
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]),
                        None, comic[0], play_order=po)
                if not opts.dont_add_comic_pages_to_toc:
                    for i, x in enumerate(wrappers):
                        stoc.add_item(href(x), None,
                                _('Page')+' %d'%(i+1), play_order=po)
                        po += 1
        opf.set_toc(toc)
        m, n = open(u'metadata.opf', 'wb'), open('toc.ncx', 'wb')
        opf.render(m, n, u'toc.ncx')
        return os.path.abspath(u'metadata.opf')
コード例 #10
0
ファイル: fb2_input.py プロジェクト: bjhemens/calibre
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from lxml import etree
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
        self.log = log
        log.debug('Parsing XML...')
        raw = stream.read().replace('\0', '')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
            assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
                if doc is None:
                    raise Exception('parse failed')
            except:
                doc = etree.fromstring(raw.replace('& ', '&amp;'),
                        parser=RECOVER_PARSER)
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''
        for s in stylesheets:
            css += etree.tostring(s, encoding=unicode, method='text',
                    with_tail=False) + '\n\n'
        if css:
            import cssutils, logging
            parser = cssutils.CSSParser(fetcher=None,
                    log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = XHTML_NS
            css = unicode(stylesheet.cssText).replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        ss = open(P('templates/fb2.xsl'), 'rb').read()
        if options.no_inline_fb2_toc:
            log('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                    re.DOTALL).sub('', ss)

        styledoc = etree.fromstring(ss)

        transform = etree.XSLT(styledoc)
        result = transform(doc)
        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        open(u'index.xhtml', 'wb').write(index)
        open(u'inline-styles.css', 'wb').write(css)
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open(u'fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath(u'fb2_cover_calibre_mi.jpg')
        else:
            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(os.getcwdu(), mi)
        entries = [(f2, guess_type(f)[0]) for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine([u'index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open(u'metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(os.getcwdu(), u'metadata.opf')
コード例 #11
0
def opf_writer(path, opf_name, manifest, spine, mi):
    opf = OPFCreator(path, mi)
    opf.create_manifest(manifest)
    opf.create_spine(spine)
    with open(os.path.join(path, opf_name), 'wb') as opffile:
        opf.render(opffile)
コード例 #12
0
ファイル: mobi6.py プロジェクト: HaraldGustafsson/calibre
    def create_opf(self, htmlfile, guide=None, root=None):
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        if mi is None:
            mi = MetaInformation(self.book_header.title, [_('Unknown')])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = u'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
        elif mi.cover is not None:
            opf.cover = mi.cover
        else:
            opf.cover = u'images/%05d.jpg' % 1
            if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                * opf.cover.split('/'))):
                opf.cover = None

        cover = opf.cover
        cover_copied = None
        if cover is not None:
            cover = cover.replace('/', os.sep)
            if os.path.exists(cover):
                ncover = u'images'+os.sep+u'calibre_cover.jpg'
                if os.path.exists(ncover):
                    os.remove(ncover)
                shutil.copyfile(cover, ncover)
                cover_copied = os.path.abspath(ncover)
                opf.cover = ncover.replace(os.sep, '/')

        manifest = [(htmlfile, 'application/xhtml+xml'),
            (os.path.abspath(u'styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        added = set([])
        for i in getattr(self, 'image_names', []):
            path = os.path.join(bp, 'images', i)
            added.add(path)
            manifest.append((path, 'image/jpeg'))
        if cover_copied is not None:
            manifest.append((cover_copied, 'image/jpeg'))

        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
        if guide is not None:
            opf.create_guide(guide)
            for ref in opf.guide:
                if ref.type.lower() == 'toc':
                    toc = ref.href()

        ncx_manifest_entry = None
        if toc:
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                found = False
                reached = False
                for x in root.iter():
                    if x == elems[-1]:
                        reached = True
                        continue
                    if reached and x.tag == 'a':
                        href = x.get('href', '')
                        if href and re.match('\w+://', href) is None:
                            try:
                                text = u' '.join([t.strip() for t in
                                    x.xpath('descendant::text()')])
                            except:
                                text = ''
                            text = ent_pat.sub(entity_to_unicode, text)
                            item = tocobj.add_item(toc.partition('#')[0], href[1:],
                                text)
                            item.left_space = int(self.get_left_whitespace(x))
                            found = True
                    if reached and found and x.get('class', None) == 'mbp_pagebreak':
                        break
            if tocobj is not None:
                tocobj = self.structure_toc(tocobj)
                opf.set_toc(tocobj)

        return opf, ncx_manifest_entry
コード例 #13
0
ファイル: comic_input.py プロジェクト: qykth-git/calibre
    def convert(self, stream, opts, file_ext, log, accelerators):
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log = opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        num_pages_per_comic = []
        for i, x in enumerate(comics_):
            title, fname = x
            cdir = 'comic_%d' % (i + 1) if len(comics_) > 1 else '.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            num_pages_per_comic.append(len(pages))
            if self.for_viewer:
                comics.append(
                    (title, pages, [self.create_viewer_wrapper(pages, cdir)]))
            else:
                wrappers = self.create_wrappers(pages)
                comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s' % stream.name)

        mi = MetaInformation(
            os.path.basename(stream.name).rpartition('.')[0], [_('Unknown')])
        opf = OPFCreator(os.getcwd(), mi)
        entries = []

        def href(x):
            if len(comics) == 1:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        cover_href = None
        for comic in comics:
            pages, wrappers = comic[1:]
            page_entries = [(x, None) for x in map(href, pages)]
            entries += [(w, None) for w in map(href, wrappers)] + page_entries
            if cover_href is None and page_entries:
                cover_href = page_entries[0][0]
        opf.create_manifest(entries)
        spine = []
        for comic in comics:
            spine.extend(map(href, comic[2]))
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)
        if self.for_viewer and cover_href:
            if os.path.isabs(cover_href):
                cover_href = os.path.relpath(cover_href).replace(os.sep, '/')
            opf.guide.set_cover(cover_href)
        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            if self.for_viewer:
                wrapper_page_href = href(wrappers[0])
                for i in range(num_pages_per_comic[0]):
                    toc.add_item('{}#page_{}'.format(wrapper_page_href, i + 1),
                                 None,
                                 _('Page') + ' %d' % (i + 1),
                                 play_order=i)

            else:
                for i, x in enumerate(wrappers):
                    toc.add_item(href(x),
                                 None,
                                 _('Page') + ' %d' % (i + 1),
                                 play_order=i)
        else:
            po = 0
            for num_pages, comic in zip(num_pages_per_comic, comics):
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]),
                                    None,
                                    comic[0],
                                    play_order=po)
                if not opts.dont_add_comic_pages_to_toc:
                    if self.for_viewer:
                        wrapper_page_href = href(wrappers[0])
                        for i in range(num_pages):
                            stoc.add_item('{}#page_{}'.format(
                                wrapper_page_href, i + 1),
                                          None,
                                          _('Page') + ' %d' % (i + 1),
                                          play_order=po)
                            po += 1
                    else:
                        for i, x in enumerate(wrappers):
                            stoc.add_item(href(x),
                                          None,
                                          _('Page') + ' %d' % (i + 1),
                                          play_order=po)
                            po += 1
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
            opf.render(m, n, 'toc.ncx')
        return os.path.abspath('metadata.opf')
コード例 #14
0
ファイル: rtf_input.py プロジェクト: Eksmo/calibre
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from lxml import etree
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        from calibre.ebooks.rtf.input import InlineClass
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.\n%s')%e)

        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            imap = {}
            try:
                imap = self.extract_images(d[0])
            except:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.fromstring(xml, parser=parser)
        border_styles = self.convert_borders(doc)
        for pict in doc.xpath('//rtf:pict[@num]',
                namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        styledoc = etree.fromstring(P('templates/rtf.xsl', data=True))
        extensions = { ('calibre', 'inline-class') : inline_class }
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = u'index.xhtml'
        with open(html, 'wb') as f:
            res = transform.tostring(result)
            # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
            #clean multiple \n
            res = re.sub('\n+', '\n', res)
            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
            # res = re.sub('\s*<body>', '<body>', res)
            # res = re.sub('(?<=\n)\n{2}',
                    # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(os.getcwdu(), mi)
        opf.create_manifest([(u'index.xhtml', None)])
        opf.create_spine([u'index.xhtml'])
        opf.render(open(u'metadata.opf', 'wb'))
        return os.path.abspath(u'metadata.opf')
コード例 #15
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from lxml import etree
        from calibre.utils.xml_parse import safe_xml_fromstring
        from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
        from calibre.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = get_fb2_data(stream)[0]
        raw = raw.replace(b'\0', b'')
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             assume_utf8=True,
                             resolve_entities=True)[0]
        try:
            doc = safe_xml_fromstring(raw)
        except etree.XMLSyntaxError:
            doc = safe_xml_fromstring(raw.replace('& ', '&amp;'))
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS

        NAMESPACES = {'f': fb_ns, 'l': XLINK_NS}
        stylesheets = doc.xpath(
            '//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''
        for s in stylesheets:
            css += etree.tostring(
                s, encoding='unicode', method='text', with_tail=False) + '\n\n'
        if css:
            import css_parser, logging
            parser = css_parser.CSSParser(fetcher=None,
                                          log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = XHTML_NS
            css = stylesheet.cssText
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        with open(P('templates/fb2.xsl'), 'rb') as f:
            ss = f.read().decode('utf-8')
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                            re.DOTALL).sub('', ss)

        styledoc = safe_xml_fromstring(ss)

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {
            a.get('href')[1:]: a
            for a in result.xpath('//a[@link_note and @href]')
            if a.get('href').startswith('#')
        }
        cites = {
            a.get('link_cite'): a
            for a in result.xpath('//a[@link_cite]') if not a.get('href', '')
        }
        all_ids = {x for x in result.xpath('//*/@id')}
        for cite, a in iteritems(cites):
            note = notes.get(cite, None)
            if note:
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        with open('index.xhtml', 'wb') as f:
            f.write(index.encode('utf-8'))
        with open('inline-styles.css', 'wb') as f:
            f.write(css.encode('utf-8'))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
        else:
            for img in doc.xpath('//f:coverpage/f:image',
                                 namespaces=NAMESPACES):
                href = img.get('{%s}href' % XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(getcwd(), mi)
        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(getcwd(), 'metadata.opf')
コード例 #16
0
ファイル: fb2_input.py プロジェクト: j-howell/calibre
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from lxml import etree
        from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = get_fb2_data(stream)[0]
        raw = raw.replace(b'\0', b'')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
            assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
                if doc is None:
                    raise Exception('parse failed')
            except:
                doc = etree.fromstring(raw.replace('& ', '&amp;'),
                        parser=RECOVER_PARSER)
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS

        NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''
        for s in stylesheets:
            css += etree.tostring(s, encoding='unicode', method='text',
                    with_tail=False) + '\n\n'
        if css:
            import css_parser, logging
            parser = css_parser.CSSParser(fetcher=None,
                    log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = XHTML_NS
            css = stylesheet.cssText
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        with open(P('templates/fb2.xsl'), 'rb') as f:
            ss = f.read().decode('utf-8')
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                    re.DOTALL).sub('', ss)

        styledoc = etree.fromstring(ss)

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
        cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
        all_ids = {x for x in result.xpath('//*/@id')}
        for cite, a in iteritems(cites):
            note = notes.get(cite, None)
            if note:
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        with open(u'index.xhtml', 'wb') as f:
            f.write(index.encode('utf-8'))
        with open(u'inline-styles.css', 'wb') as f:
            f.write(css.encode('utf-8'))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open(u'fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath(u'fb2_cover_calibre_mi.jpg')
        else:
            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(getcwd(), mi)
        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine([u'index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open(u'metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(getcwd(), u'metadata.opf')
コード例 #17
0
    def create_opf(self, htmlfile, guide=None, root=None):
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        if mi is None:
            mi = MetaInformation(self.book_header.title, [_('Unknown')])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = u'images/%05d.jpg' % (
                self.book_header.exth.cover_offset + 1)
        elif mi.cover is not None:
            opf.cover = mi.cover
        else:
            opf.cover = u'images/%05d.jpg' % 1
            if not os.path.exists(
                    os.path.join(os.path.dirname(htmlfile),
                                 *opf.cover.split('/'))):
                opf.cover = None

        cover = opf.cover
        cover_copied = None
        if cover is not None:
            cover = cover.replace('/', os.sep)
            if os.path.exists(cover):
                ncover = u'images' + os.sep + u'calibre_cover.jpg'
                if os.path.exists(ncover):
                    os.remove(ncover)
                shutil.copyfile(cover, ncover)
                cover_copied = os.path.abspath(ncover)
                opf.cover = ncover.replace(os.sep, '/')

        manifest = [(htmlfile, 'application/xhtml+xml'),
                    (os.path.abspath(u'styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        added = set([])
        for i in getattr(self, 'image_names', []):
            path = os.path.join(bp, 'images', i)
            added.add(path)
            manifest.append((path, 'image/jpeg'))
        if cover_copied is not None:
            manifest.append((cover_copied, 'image/jpeg'))

        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
        if guide is not None:
            opf.create_guide(guide)
            for ref in opf.guide:
                if ref.type.lower() == 'toc':
                    toc = ref.href()

        ncx_manifest_entry = None
        if toc:
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                found = False
                reached = False
                for x in root.iter():
                    if x == elems[-1]:
                        reached = True
                        continue
                    if reached and x.tag == 'a':
                        href = x.get('href', '')
                        if href and re.match('\w+://', href) is None:
                            try:
                                text = u' '.join([
                                    t.strip()
                                    for t in x.xpath('descendant::text()')
                                ])
                            except:
                                text = ''
                            text = ent_pat.sub(entity_to_unicode, text)
                            item = tocobj.add_item(
                                toc.partition('#')[0], href[1:], text)
                            item.left_space = int(self.get_left_whitespace(x))
                            found = True
                    if reached and found and x.get('class',
                                                   None) == 'mbp_pagebreak':
                        break
            if tocobj is not None:
                tocobj = self.structure_toc(tocobj)
                opf.set_toc(tocobj)

        return opf, ncx_manifest_entry
コード例 #18
0
ファイル: comic_input.py プロジェクト: smartass101/calibre
    def convert(self, stream, opts, file_ext, log, accelerators):
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log= opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        for i, x in enumerate(comics_):
            title, fname = x
            cdir = u'comic_%d'%(i+1) if len(comics_) > 1 else u'.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages: continue
            wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s'%stream.name)

        mi  = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
            [_('Unknown')])
        opf = OPFCreator(os.getcwdu(), mi)
        entries = []

        def href(x):
            if len(comics) == 1: return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        for comic in comics:
            pages, wrappers = comic[1:]
            entries += [(w, None) for w in map(href, wrappers)] + \
                    [(x, None) for x in map(href, pages)]
        opf.create_manifest(entries)
        spine = []
        for comic in comics:
            spine.extend(map(href, comic[2]))
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)
        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            for i, x in enumerate(wrappers):
                toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                        play_order=i)
        else:
            po = 0
            for comic in comics:
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]),
                        None, comic[0], play_order=po)
                if not opts.dont_add_comic_pages_to_toc:
                    for i, x in enumerate(wrappers):
                        stoc.add_item(href(x), None,
                                _('Page')+' %d'%(i+1), play_order=po)
                        po += 1
        opf.set_toc(toc)
        m, n = open(u'metadata.opf', 'wb'), open('toc.ncx', 'wb')
        opf.render(m, n, u'toc.ncx')
        return os.path.abspath(u'metadata.opf')
コード例 #19
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from lxml import etree
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        from calibre.ebooks.rtf.input import InlineClass
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError(
                _('This RTF file has a feature calibre does not '
                  'support. Convert it to HTML first and then try it.\n%s') %
                e)

        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            imap = {}
            try:
                imap = self.extract_images(d[0])
            except:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.fromstring(xml, parser=parser)
        border_styles = self.convert_borders(doc)
        for pict in doc.xpath(
                '//rtf:pict[@num]',
                namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        styledoc = etree.fromstring(P('templates/rtf.xsl', data=True))
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = u'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
            # clean multiple \n
            res = re.sub(b'\n+', b'\n', res)
            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
            # res = re.sub('\s*<body>', '<body>', res)
            # res = re.sub('(?<=\n)\n{2}',
            # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(getcwd(), mi)
        opf.create_manifest([(u'index.xhtml', None)])
        opf.create_spine([u'index.xhtml'])
        opf.render(open(u'metadata.opf', 'wb'))
        return os.path.abspath(u'metadata.opf')