Ejemplo n.º 1
0
 def test_toc_detection(self):
     """Check ToC detection for an EPUB 2 book, then for the same book as EPUB 3.

     A book is created with ``create_book`` and must yield a non-empty ToC.
     The OPF version is then set to 3.0 and the ToC must still be found.
     Finally a nav document is added and the first ToC entry must carry the
     title from that nav document.
     """
     ep = os.path.join(self.tdir, 'book.epub')
     create_book(Metadata('Test ToC'), ep)
     c = get_container(ep, tdir=os.path.join(self.tdir, 'container'), tweak_mode=True)
     self.assertEqual(2, c.opf_version_parsed.major)
     self.assertTrue(len(get_toc(c)))
     c.opf.set('version', '3.0')
     self.assertEqual(3, c.opf_version_parsed.major)
     self.assertTrue(len(get_toc(c)))  # detect NCX toc even in epub 3 files
     # Both fragments must be bytes literals: Python 3 refuses to implicitly
     # concatenate a bytes literal with a str literal.
     c.add_file('nav.html', b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
                b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>',
                process_manifest_item=lambda item:item.set('properties', 'nav'))
     toc = get_toc(c)
     self.assertTrue(len(toc))
     self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')
Ejemplo n.º 2
0
    def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
        """Extract a book into ``tdir`` and write its render manifest.

        The book is extracted with ``extract_book``, every spine document is
        parsed and marked dirty, resources are virtualized, and a
        ``calibre-book-manifest.json`` file is written into ``self.root``.

        :param path_to_ebook: Path of the book, passed to ``extract_book``.
        :param tdir: Directory the book is extracted into.
        :param log: Logger to use; ``default_log`` when falsy.
        :param book_hash: Stored verbatim under ``'book_hash'`` in the manifest.
        """
        log = log or default_log
        book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
        ContainerBase.__init__(self, tdir, opfpath, log)
        # Files that are left out of the manifest and deleted from disk below:
        # the OPF, NCX-typed files, everything under META-INF/ and 'mimetype'.
        # .items() is used instead of the Python 2-only .iteritems() so that
        # this also runs on Python 3.
        excluded_names = {
            name for name, mt in self.mime_map.items() if
            name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
            name == 'mimetype'
        }

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc':get_toc(self).as_dict,
            'spine':[name for name, is_linear in self.spine_names],
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name)
            self.dirty(name)
        self.virtualized_names = set()
        self.virtualize_resources()

        def manifest_data(name):
            # Per-file manifest entry: size on disk, virtualization flag, mime type.
            return {
                'size':os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype':self.mime_map.get(name),
            }
        data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
        self.commit()
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
            f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
Ejemplo n.º 3
0
 def __init__(self, path_to_ebook, tdir, log=None):
     """Extract a book into ``tdir`` and write its render manifest.

     The book is extracted with ``extract_book``, every spine document is
     parsed and marked dirty, resources are virtualized, and a
     ``calibre-book-manifest.json`` file is written into ``self.root``.

     :param path_to_ebook: Path of the book, passed to ``extract_book``.
     :param tdir: Directory the book is extracted into.
     :param log: Logger to use; ``default_log`` when falsy.
     """
     log = log or default_log
     book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
     ContainerBase.__init__(self, tdir, opfpath, log)
     # Files left out of the manifest and deleted from disk below: the OPF,
     # NCX-typed files and everything under META-INF/.
     # .items() is used instead of the Python 2-only .iteritems() so that
     # this also runs on Python 3.
     excluded_names = {
         name for name, mt in self.mime_map.items() if
         name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/')
     }
     self.book_render_data = data = {
         'version': self.RENDER_VERSION,
         'toc':get_toc(self).as_dict,
         'spine':[name for name, is_linear in self.spine_names],
         'link_uid': uuid4(),
         'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
         'manifest': list(set(self.name_path_map) - excluded_names),
     }
     # Mark the spine as dirty since we have to ensure it is normalized
     for name in data['spine']:
         self.parsed(name)
         self.dirty(name)
     self.virtualize_resources()
     self.commit()
     for name in excluded_names:
         os.remove(self.name_path_map[name])
     with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
         f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
Ejemplo n.º 4
0
    def test_toc_detection(self):
        """Exercise ToC detection (NCX and EPUB 3 nav) and ToC creation from XPaths."""
        epub_path = os.path.join(self.tdir, 'book.epub')
        create_book(Metadata('Test ToC'), epub_path)
        container_dir = os.path.join(self.tdir, 'container')
        container = get_container(epub_path, tdir=container_dir, tweak_mode=True)
        self.assertEqual(2, container.opf_version_parsed.major)
        self.assertTrue(len(get_toc(container)))
        container.opf.set('version', '3.0')
        self.assertEqual(3, container.opf_version_parsed.major)
        # detect NCX toc even in epub 3 files
        self.assertTrue(len(get_toc(container)))

        nav_markup = (
            b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
            b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>'
        )

        def mark_as_nav(item):
            return item.set('properties', 'nav')

        container.add_file('nav.html', nav_markup, process_manifest_item=mark_as_nav)
        nav_toc = get_toc(container)
        self.assertTrue(len(nav_toc))
        self.assertEqual(nav_toc.as_dict['children'][0]['title'], 'EPUB 3 nav')

        def outline(node):
            # Serialize a ToC subtree as nested "[title...]" groups.
            kids = node.children
            if not kids:
                return ''
            return '[' + ''.join(kid.title + outline(kid) for kid in kids) + ']'

        def tfx(linear, expected):
            # Write one <tN> element per character of `linear` into nav.html,
            # build a ToC from the matching XPaths and compare its outline.
            tags = []
            for level in linear:
                tags.append('<t{0}>{0}</t{0}>'.format(level))
            html = (
                '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
                + '<body>%s</body></html>' % '\n'.join(tags)
            )
            with container.open('nav.html', 'wb') as stream:
                stream.write(html.encode('utf-8'))
            xpaths = ['//h:t' + level for level in sorted(set(linear))]
            generated = toc_from_xpaths(container, xpaths)
            self.assertEqual('[%s]' % expected, outline(generated))

        cases = (
            ('121333', '1[2]1[333]'),
            ('1223424', '1[22[3[4]]2[4]]'),
            ('32123', '321[2[3]]'),
            ('123123', '1[2[3]]1[2[3]]'),
        )
        for linear, expected in cases:
            tfx(linear, expected)
Ejemplo n.º 5
0
 def test_toc_detection(self):
     """Check ToC detection for an EPUB 2 book, then for the same book as EPUB 3.

     A book is created with ``create_book`` and must yield a non-empty ToC.
     The OPF version is then set to 3.0 and the ToC must still be found.
     Finally a nav document is added and the first ToC entry must carry the
     title from that nav document.
     """
     ep = os.path.join(self.tdir, 'book.epub')
     create_book(Metadata('Test ToC'), ep)
     c = get_container(ep,
                       tdir=os.path.join(self.tdir, 'container'),
                       tweak_mode=True)
     self.assertEqual(2, c.opf_version_parsed.major)
     self.assertTrue(len(get_toc(c)))
     c.opf.set('version', '3.0')
     self.assertEqual(3, c.opf_version_parsed.major)
     self.assertTrue(len(get_toc(c)))  # detect NCX toc even in epub 3 files
     # Both fragments must be bytes literals: Python 3 refuses to implicitly
     # concatenate a bytes literal with a str literal.
     c.add_file(
         'nav.html',
         b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
         b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>',
         process_manifest_item=lambda item: item.set('properties', 'nav'))
     toc = get_toc(c)
     self.assertTrue(len(toc))
     self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')
Ejemplo n.º 6
0
    def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
        """Extract a book into ``tdir`` and write its render manifest.

        The book is extracted with ``extract_book``, a cover page is created,
        every spine document is parsed and marked dirty, CSS is transformed,
        resources are virtualized, and a ``calibre-book-manifest.json`` file
        is written into ``self.root``.

        :param path_to_ebook: Path of the book, passed to ``extract_book``.
        :param tdir: Directory the book is extracted into.
        :param log: Logger to use; ``default_log`` when falsy.
        :param book_hash: Stored verbatim under ``'book_hash'`` in the manifest.
        """
        log = log or default_log
        book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
        ContainerBase.__init__(self, tdir, opfpath, log)
        # Files that are left out of the manifest and deleted from disk below:
        # the OPF, NCX-typed files, everything under META-INF/ and 'mimetype'.
        # .items() is used instead of the Python 2-only .iteritems() so that
        # this also runs on Python 3.
        excluded_names = {
            name for name, mt in self.mime_map.items() if
            name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
            name == 'mimetype'
        }
        raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc':get_toc(self).as_dict,
            'spine':[name for name, is_linear in self.spine_names],
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            # The next three are accumulated by manifest_data() below.
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name)
            self.dirty(name)
        self.transform_css()
        self.virtualized_names = set()
        self.virtualize_resources()

        def manifest_data(name):
            # Build the manifest entry for one file; for HTML files also
            # update the book-wide length totals and the has_maths flag.
            mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
            ans = {
                'size':os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype':mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = l = get_length(root)
                self.book_render_data['total_length'] += l
                if name in data['spine']:
                    self.book_render_data['spine_length'] += l
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
            return ans
        data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
        self.commit()
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
            f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
Ejemplo n.º 7
0
    def test_toc_detection(self):
        """Check ToC detection (NCX and EPUB 3 nav) and ToC creation from XPaths.

        The first part creates a book, checks that a ToC is found both as
        EPUB 2 and after the OPF version is set to 3.0, then adds a nav
        document and checks its title is picked up. The second part rewrites
        ``nav.html`` with synthetic ``<tN>`` elements and compares the nesting
        produced by ``toc_from_xpaths`` against an expected outline.
        """
        ep = os.path.join(self.tdir, 'book.epub')
        create_book(Metadata('Test ToC'), ep)
        c = get_container(ep, tdir=os.path.join(self.tdir, 'container'), tweak_mode=True)
        self.assertEqual(2, c.opf_version_parsed.major)
        self.assertTrue(len(get_toc(c)))
        c.opf.set('version', '3.0')
        self.assertEqual(3, c.opf_version_parsed.major)
        self.assertTrue(len(get_toc(c)))  # detect NCX toc even in epub 3 files
        # Both fragments must be bytes literals: Python 3 refuses to implicitly
        # concatenate a bytes literal with a str literal.
        c.add_file('nav.html', b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
                   b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>',
                   process_manifest_item=lambda item:item.set('properties', 'nav'))
        toc = get_toc(c)
        self.assertTrue(len(toc))
        self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')

        def tfx(linear, expected):
            # One <tN> element per character of `linear`, in document order.
            items = ['<t{0}>{0}</t{0}>'.format(x) for x in linear]
            html = '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
            html += '<body>%s</body></html>' % '\n'.join(items)
            with c.open('nav.html', 'wb') as f:
                f.write(html.encode('utf-8'))
            toc = toc_from_xpaths(c, ['//h:t'+x for x in sorted(set(linear))])

            def p(node):
                # Serialize a ToC subtree as nested "[title...]" groups.
                ans = ''
                if node.children:
                    ans += '['
                    for child in node.children:
                        ans += child.title + p(child)
                    ans += ']'
                return ans
            self.assertEqual('[%s]'%expected, p(toc))

        tfx('121333', '1[2]1[333]')
        tfx('1223424', '1[22[3[4]]2[4]]')
        tfx('32123', '321[2[3]]')
        tfx('123123', '1[2[3]]1[2[3]]')
def epub_2_to_3(container, report, previous_nav=None):
    """Upgrade ``container`` in place from EPUB 2 to EPUB 3.

    The existing ToC and landmarks are read first, the NCX item (if one is
    found) and all ``<guide>`` elements are removed from the OPF, a nav
    document is created from the collected data, and the OPF version is set
    to ``3.0``.

    :param container: The book container to upgrade.
    :param report: Accepted but not used by this function.
    :param previous_nav: Forwarded unchanged to ``create_nav``.
    """
    upgrade_metadata(container.opf)
    collect_properties(container)

    # Read the ToC before the NCX item is removed from the book.
    existing_toc = get_toc(container)
    ncx_name = find_existing_ncx_toc(container)
    if ncx_name:
        container.remove_item(ncx_name)
    spine = container.opf_xpath('./opf:spine')[0]
    spine.attrib.pop('toc', None)

    # Read the landmarks before the <guide> elements are dropped.
    landmarks = get_landmarks(container)
    for guide in container.opf_xpath('./opf:guide'):
        parent = guide.getparent()
        parent.remove(guide)

    create_nav(container, existing_toc, landmarks, previous_nav)
    container.opf.set('version', '3.0')
    fix_font_mime_types(container)
    container.dirty(container.opf_name)
Ejemplo n.º 9
0
def epub_2_to_3(container, report, previous_nav=None):
    """Upgrade ``container`` in place from EPUB 2 to EPUB 3.

    The existing ToC and landmarks are read first, the NCX item (if one is
    found) and all ``<guide>`` elements are removed from the OPF, a nav
    document is created from the collected data, and the OPF version is set
    to ``3.0``.

    :param container: The book container to upgrade.
    :param report: Accepted but not used by this function.
    :param previous_nav: Forwarded unchanged to ``create_nav``.
    """
    upgrade_metadata(container.opf)
    collect_properties(container)

    # Read the ToC before the NCX item is removed from the book.
    existing_toc = get_toc(container)
    ncx_name = find_existing_ncx_toc(container)
    if ncx_name:
        container.remove_item(ncx_name)
    spine = container.opf_xpath('./opf:spine')[0]
    spine.attrib.pop('toc', None)

    # Read the landmarks before the <guide> elements are dropped.
    landmarks = get_landmarks(container)
    for guide in container.opf_xpath('./opf:guide'):
        parent = guide.getparent()
        parent.remove(guide)

    create_nav(container, existing_toc, landmarks, previous_nav)
    container.opf.set('version', '3.0')
    fix_font_mime_types(container)
    container.dirty(container.opf_name)
Ejemplo n.º 10
0
    def build(self):
        """Repopulate ``self.view`` from the current container's ToC.

        Does nothing when ``current_container()`` returns ``None``.
        """
        container = current_container()
        if container is None:
            return
        toc = get_toc(container, verify_destinations=False)

        def populate(toc_node, parent_item):
            # Create one tree item per ToC child, then recurse into it.
            for entry in toc_node:
                item = QTreeWidgetItem(parent_item)
                item.setText(0, entry.title or '')
                item.setData(0, DEST_ROLE, entry.dest or '')
                item.setData(0, FRAG_ROLE, entry.frag or '')
                template = _('File: {0}\nAnchor: {1}')
                anchor = entry.frag or _('Top of file')
                item.setData(0, Qt.ToolTipRole, template.format(entry.dest or '', anchor))
                populate(entry, item)

        self.view.clear()
        populate(toc, self.view.invisibleRootItem())
Ejemplo n.º 11
0
    def build(self):
        """Repopulate ``self.view`` from the current container's ToC.

        Does nothing when ``current_container()`` returns ``None``.
        """
        container = current_container()
        if container is None:
            return
        toc = get_toc(container, verify_destinations=False)

        def populate(toc_node, parent_item):
            # Create one tree item per ToC child, then recurse into it.
            for entry in toc_node:
                item = QTreeWidgetItem(parent_item)
                item.setText(0, entry.title or '')
                item.setData(0, DEST_ROLE, entry.dest or '')
                item.setData(0, FRAG_ROLE, entry.frag or '')
                template = _('File: {0}\nAnchor: {1}')
                anchor = entry.frag or _('Top of file')
                item.setData(0, Qt.ToolTipRole, template.format(entry.dest or '', anchor))
                populate(entry, item)

        self.view.clear()
        populate(toc, self.view.invisibleRootItem())
Ejemplo n.º 12
0
    def __call__(self, ebook):
        """Load the ToC of ``ebook`` into the ``self.tocw`` tree widget.

        Stores the book and its ToC on ``self``, creates the status icons,
        builds one tree item per ToC node, and connects the widget's change
        signals.
        """
        self.ebook = ebook
        if not isinstance(ebook, AZW3Container):
            self.item_view.hide_azw3_warning()
        self.toc = get_toc(self.ebook)
        self.toc_lang, self.toc_uid = self.toc.lang, self.toc.uid

        self.blank = QIcon(I('blank.png'))
        self.ok = QIcon(I('ok.png'))
        self.err = QIcon(I('dot_red.png'))
        # Keyed by None / True / False.
        self.icon_map = {None: self.blank, True: self.ok, False: self.err}

        def add_children(toc_node, parent_item):
            # Create a widget item for each ToC child, then recurse into it.
            for entry in toc_node:
                add_children(entry, self.create_item(parent_item, entry))

        self.root = self.tocw.invisibleRootItem()
        root_item = self.root
        root_item.setData(0, Qt.UserRole, self.toc)
        add_children(self.toc, root_item)

        self.tocw.model().dataChanged.connect(self.data_changed)
        self.tocw.currentItemChanged.connect(self.current_item_changed)
        self.tocw.setCurrentItem(None)
Ejemplo n.º 13
0
    def __call__(self, ebook):
        """Load the ToC of ``ebook`` into the ``self.tocw`` tree widget.

        Stores the book and its ToC on ``self``, creates the status icons,
        builds one tree item per ToC node, and connects the widget's change
        signals.
        """
        self.ebook = ebook
        if not isinstance(ebook, AZW3Container):
            self.item_view.hide_azw3_warning()
        self.toc = get_toc(self.ebook)
        self.toc_lang, self.toc_uid = self.toc.lang, self.toc.uid

        self.blank = QIcon(I('blank.png'))
        self.ok = QIcon(I('ok.png'))
        self.err = QIcon(I('dot_red.png'))
        # Keyed by None / True / False.
        self.icon_map = {None: self.blank, True: self.ok, False: self.err}

        def add_children(toc_node, parent_item):
            # Create a widget item for each ToC child, then recurse into it.
            for entry in toc_node:
                add_children(entry, self.create_item(parent_item, entry))

        self.root = self.tocw.invisibleRootItem()
        root_item = self.root
        root_item.setData(0, Qt.UserRole, self.toc)
        add_children(self.toc, root_item)

        self.tocw.model().dataChanged.connect(self.data_changed)
        self.tocw.currentItemChanged.connect(self.current_item_changed)
        self.tocw.setCurrentItem(None)
Ejemplo n.º 14
0
def convert(opf_path,
            opts,
            metadata=None,
            output_path=None,
            log=default_log,
            cover_data=None,
            report_progress=lambda x, y: None):
    """Render the book described by ``opf_path`` to a single PDF.

    The HTML content is transformed, each margin file is rendered to its own
    PDF document, the documents are appended into one, and the result is
    post-processed (ToC, links, headers/footers, fonts, images, cover,
    metadata) before being serialized.

    :param opf_path: Path used to construct the ``Container``.
    :param opts: Options object; this function reads ``pdf_hyphenate``,
        ``pdf_add_toc``, ``pdf_mark_links``, ``pdf_odd_even_offset`` and
        ``uncompressed_pdf`` from it and forwards it to several helpers.
    :param metadata: Wrapped in ``PDFMetadata``; the PDF metadata is only
        updated when this is not ``None``.
    :param output_path: File to write the PDF to, or ``None`` to return the data.
    :param log: Logger, called with message fragments as positional arguments.
    :param cover_data: When truthy, passed to ``add_cover``.
    :param report_progress: Callable invoked as ``report_progress(fraction, message)``.
    :returns: The PDF data when ``output_path`` is ``None``; otherwise
        nothing is returned and the data is written to ``output_path``.
    :raises SystemExit: If the render result for a margin file is not ``bytes``.
    """
    container = Container(opf_path, log)
    fix_markup(container)
    report_progress(0.05, _('Parsed all content for markup transformation'))
    if opts.pdf_hyphenate:
        # Imported lazily, only when hyphenation was requested.
        from calibre.ebooks.oeb.polish.hyphenation import add_soft_hyphens
        add_soft_hyphens(container)
    has_maths = add_maths_script(container)
    fix_fullscreen_images(container)

    name_anchor_map = make_anchors_unique(container, log)
    margin_files = tuple(create_margin_files(container))
    toc = get_toc(container, verify_destinations=False)
    has_toc = toc and len(toc)
    links_page_uuid = add_all_links(container, margin_files)
    container.commit()
    report_progress(0.1, _('Completed markup transformation'))

    manager = RenderManager(opts, log, container.root)
    page_layout = get_page_layout(opts)
    pdf_doc = None
    anchor_locations = {}
    jobs = []
    for margin_file in margin_files:
        jobs.append(
            job_for_name(container, margin_file.name, margin_file.margins,
                         page_layout))
    results = manager.convert_html_files(jobs,
                                         settle_time=1,
                                         has_maths=has_maths)
    num_pages = 0
    # One resolved-margins entry per output page, in page order.
    page_margins_map = []
    for margin_file in margin_files:
        name = margin_file.name
        data = results[name]
        # A non-bytes result is treated as a failure and used as the exit value.
        if not isinstance(data, bytes):
            raise SystemExit(data)
        doc = data_as_pdf_doc(data)
        anchor_locations.update(
            get_anchor_locations(name, doc, num_pages + 1, links_page_uuid,
                                 log))
        doc_pages = doc.page_count()
        page_margins_map.extend(
            repeat(resolve_margins(margin_file.margins, page_layout),
                   doc_pages))
        num_pages += doc_pages

        # The first rendered document becomes the base; later ones are appended.
        if pdf_doc is None:
            pdf_doc = doc
        else:
            pdf_doc.append(doc)

    page_number_display_map = get_page_number_display_map(
        manager, opts, num_pages, log)

    if has_toc:
        annotate_toc(toc, anchor_locations, name_anchor_map, log)
        if opts.pdf_add_toc:
            # Render a ToC page with page numbers and append it to the document.
            tocname = create_skeleton(container)
            root = container.parsed(tocname)
            add_pagenum_toc(root, toc, opts, page_number_display_map)
            container.commit()
            jobs = [job_for_name(container, tocname, None, page_layout)]
            results = manager.convert_html_files(jobs, settle_time=1)
            tocdoc = data_as_pdf_doc(results[tocname])
            page_margins_map.extend(
                repeat(resolve_margins(None, page_layout),
                       tocdoc.page_count()))
            pdf_doc.append(tocdoc)

    report_progress(0.7, _('Rendered all HTML as PDF'))

    fix_links(pdf_doc, anchor_locations, name_anchor_map, opts.pdf_mark_links,
              log)
    # Same condition as has_toc above.
    if toc and len(toc):
        add_toc(PDFOutlineRoot(pdf_doc), toc)
    report_progress(0.75, _('Added links to PDF content'))

    pdf_metadata = PDFMetadata(metadata)
    add_header_footer(manager, opts, pdf_doc, container,
                      page_number_display_map, page_layout, page_margins_map,
                      pdf_metadata, report_progress, toc if has_toc else None)

    merge_fonts(pdf_doc, log)
    num_removed = dedup_type3_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'duplicated Type3 glyphs')

    num_removed = remove_unused_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'unused fonts')

    # Needed because of https://bugreports.qt.io/browse/QTBUG-88976
    subset_fonts(pdf_doc, log)

    num_removed = pdf_doc.dedup_images()
    if num_removed:
        log('Removed', num_removed, 'duplicate images')

    if opts.pdf_odd_even_offset:
        # Shift the CropBox of each page from index 1 onwards, with the sign
        # alternating on the parity of the index. A page is only shifted when
        # the offset is smaller than both its left and right margins.
        for i in range(1, pdf_doc.page_count()):
            margins = page_margins_map[i]
            mult = -1 if i % 2 else 1
            val = opts.pdf_odd_even_offset
            if abs(val) < min(margins.left, margins.right):
                box = list(pdf_doc.get_page_box("CropBox", i))
                box[0] += val * mult
                pdf_doc.set_page_box("CropBox", i, *box)

    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)

    if metadata is not None:
        update_metadata(pdf_doc, pdf_metadata)
    report_progress(1, _('Updated metadata in PDF'))

    if opts.uncompressed_pdf:
        pdf_doc.uncompress()

    pdf_data = pdf_doc.write()
    if output_path is None:
        return pdf_data
    with open(output_path, 'wb') as f:
        f.write(pdf_data)
Ejemplo n.º 15
0
def process_exploded_book(book_fmt,
                          opfpath,
                          input_fmt,
                          tdir,
                          render_manager,
                          log=None,
                          book_hash=None,
                          save_bookmark_data=False,
                          book_metadata=None,
                          virtualize_resources=True):
    """Process an already-extracted book in ``tdir`` and write its render manifest.

    Stylesheets, HTML documents and SVG images are handed to
    ``render_manager`` for processing; the per-file results are merged into a
    ``calibre-book-manifest.json`` file written into the container root.

    :param book_fmt: Stored under ``'book_format'`` in the manifest.
    :param opfpath: OPF path used to construct the ``SimpleContainer``.
    :param input_fmt: Input format name; used to look up the input plugin and,
        lower-cased, passed to ``create_cover_page``.
    :param tdir: Directory containing the extracted book.
    :param render_manager: Object providing ``launch_workers(names, container)``
        and callable as ``render_manager(names, args, container)``, yielding
        ``(link_to_map, html_data, virtualized_names)`` triples.
    :param log: Logger to use; ``default_log`` when falsy.
    :param book_hash: Stored verbatim under ``'book_hash'`` in the manifest.
    :param save_bookmark_data: When true, the contents of
        ``META-INF/calibre_bookmarks.txt`` are read (if the file exists) and
        returned.
    :param book_metadata: Forwarded to ``create_cover_page``.
    :param virtualize_resources: Forwarded to the render workers.
    :returns: ``(container, bookmark_data)``; ``bookmark_data`` is ``None``
        unless bookmark data was requested and found.
    """
    log = log or default_log
    container = SimpleContainer(tdir, opfpath, log)
    input_plugin = plugin_for_input_format(input_fmt)
    is_comic = bool(getattr(input_plugin, 'is_image_collection', False))

    def needs_work(mt):
        return mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'

    def work_priority(name):
        # ensure workers with large files or stylesheets
        # have the less names
        # NOTE: the original had a stray trailing comma here that made `size`
        # a one-element tuple; the resulting sort order is the same.
        size = os.path.getsize(container.name_path_map[name])
        is_html = container.mime_map.get(name) in OEB_DOCS
        return (0 if is_html else 1), size

    if not is_comic:
        render_manager.launch_workers(
            tuple(n for n, mt in iteritems(container.mime_map)
                  if needs_work(mt)), container)

    bookmark_data = None
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        if container.exists(bm_file):
            with container.open(bm_file, 'rb') as f:
                bookmark_data = f.read()

    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name
        for name, mt in iteritems(container.mime_map)
        if name == container.opf_name or mt == guess_type('a.ncx')
        or name.startswith('META-INF/') or name == 'mimetype'
        or not container.has_name_and_is_not_empty(name)
    }
    raster_cover_name, titlepage_name = create_cover_page(
        container, input_fmt.lower(), is_comic, book_metadata)

    toc = get_toc(container, verify_destinations=False).to_dict(count())
    # Fall back to a ToC built from h1/h2/h3 headings when the book's own
    # ToC is empty.
    if not toc or not toc.get('children'):
        toc = from_xpaths(container,
                          ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in container.spine_names]
    spineq = frozenset(spine)
    landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]

    book_render_data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        # The next three are accumulated by manifest_data() below.
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }

    names = sorted(
        (n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
        key=work_priority)

    results = render_manager(
        names,
        (tdir, opfpath, virtualize_resources, book_render_data['link_uid'],
         container.data_for_clone()), container)
    ltm = book_render_data['link_to_map']
    html_data = {}
    virtualized_names = set()

    def merge_ltm(dest, src):
        # Union the values of src into dest, key by key.
        for k, v in iteritems(src):
            if k in dest:
                dest[k] |= v
            else:
                dest[k] = v

    for link_to_map, hdata, vnames in results:
        html_data.update(hdata)
        virtualized_names |= vnames
        for k, v in iteritems(link_to_map):
            if k in ltm:
                merge_ltm(ltm[k], v)
            else:
                ltm[k] = v

    def manifest_data(name):
        # Build the manifest entry for one file; for HTML files also update
        # the book-wide length totals and the has_maths flag.
        mt = (container.mime_map.get(name)
              or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(container.name_path_map[name]),
            'is_virtualized': name in virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            data = html_data[name]
            ans['length'] = l = data['length']
            book_render_data['total_length'] += l
            if name in book_render_data['spine']:
                book_render_data['spine_length'] += l
            ans['has_maths'] = hm = data['has_maths']
            if hm:
                book_render_data['has_maths'] = True
            ans['anchor_map'] = data['anchor_map']
        return ans

    book_render_data['files'] = {
        name: manifest_data(name)
        for name in set(container.name_path_map) - excluded_names
    }
    container.commit()

    for name in excluded_names:
        os.remove(container.name_path_map[name])

    # `ltm` is still bound to book_render_data['link_to_map'] from above.
    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization

    data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
    with lopen(os.path.join(container.root, 'calibre-book-manifest.json'),
               'wb') as f:
        f.write(data)

    return container, bookmark_data
Ejemplo n.º 16
0
def set_metadata_toc(container, language, criteria, changed_files, converter):
    """Convert the OPF metadata and ToC titles of ``container`` with ``converter``.

    :param container: Book container providing ``opf_xpath``, ``book_type``,
        ``opf_name`` and ``dirty``.
    :param language: Language code written into ``dc:language`` elements whose
        current text contains ``zh`` (EPUB books only).
    :param criteria: Accepted but not used by this function.
    :param changed_files: List that the names of modified files are appended to.
    :param converter: Object whose ``convert(text)`` returns the converted text.
    :returns: True if either the metadata or the ToC changed.
    """
    opfChanged = False
    tocChanged = False
    # List of dc items in OPF file that get a simple text replacement
    # Add more items to this list if needed
    dc_list = [
        '//opf:metadata/dc:title',
        '//opf:metadata/dc:description',
        '//opf:metadata/dc:publisher',
        # A missing comma after this entry used to fuse it with the next one
        # into a single XPath, so subjects and contributors were skipped.
        '//opf:metadata/dc:subject',
        '//opf:metadata/dc:contributor',
        '//opf:metadata/dc:coverage',
        '//opf:metadata/dc:rights',
    ]
    # Update the OPF metadata
    # The language and creator fields are special
    # Only update the dc language if the original language was a Chinese type and epub format
    if container.book_type == u'epub':
        for item in container.opf_xpath('//opf:metadata/dc:language'):
            old_text = item.text
            # An empty <dc:language/> has text None, which re.search rejects.
            if old_text is None:
                continue
            if re.search(r'zh-\w+|zh', old_text, flags=re.IGNORECASE) is not None:
                item.text = language
            if item.text != old_text:
                opfChanged = True

    # Update the creator text and its attributes (e.g. file-as)
    for item in container.opf_xpath('//opf:metadata/dc:creator'):
        old_text = item.text
        if old_text is not None:
            item.text = converter.convert(old_text)
            if item.text != old_text:
                opfChanged = True
        # Every attribute value is converted, not only file-as. A change here
        # must also mark the OPF as changed or it would never be flagged dirty.
        for attribute in list(item.attrib):
            old_value = item.attrib[attribute]
            new_value = converter.convert(old_value)
            item.attrib[attribute] = new_value
            if new_value != old_value:
                opfChanged = True

    # Update the remaining dc items using a loop
    for dc_item in dc_list:
        for item in container.opf_xpath(dc_item):
            old_text = item.text
            if old_text is not None:
                item.text = converter.convert(old_text)
                if item.text != old_text:
                    opfChanged = True

    # Update the TOC - Do this after modifying the OPF data
    # Just grab all <text> fields (AKA "title" attribute in a TOC object)
    # and convert to the desired Chinese. Let Calibre set the title and
    # language automatically from the OPF file modified earlier
    book_toc = get_toc(container)
    for item in book_toc.iterdescendants():
        if item.title is not None:
            old_title = item.title
            item.title = converter.convert(item.title)
            if old_title != item.title:
                tocChanged = True

    # Update the files with the changes
    if tocChanged:
        commit_toc(container, book_toc)
        container.dirty(book_toc.toc_file_name)
        changed_files.append(book_toc.toc_file_name)
    if opfChanged:
        container.dirty(container.opf_name)
        changed_files.append(container.opf_name)
    return tocChanged or opfChanged
Ejemplo n.º 17
0
    def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
        """Extract a book into ``tdir`` and write its render manifest.

        The book is extracted with ``extract_book``, a cover page is created,
        the ToC and landmarks are collected, every spine document is parsed
        and marked dirty, CSS is transformed, resources are virtualized, and a
        ``calibre-book-manifest.json`` file is written into ``self.root``.

        :param path_to_ebook: Path of the book, passed to ``extract_book``.
        :param tdir: Directory the book is extracted into.
        :param log: Logger to use; ``default_log`` when falsy.
        :param book_hash: Stored verbatim under ``'book_hash'`` in the manifest.
        """
        log = log or default_log
        book_fmt, opfpath, input_fmt = extract_book(path_to_ebook,
                                                    tdir,
                                                    log=log)
        ContainerBase.__init__(self, tdir, opfpath, log)
        # Files that are left out of the manifest and deleted from disk below:
        # the OPF, NCX-typed files, everything under META-INF/ and 'mimetype'.
        # .items() is used instead of the Python 2-only .iteritems() so that
        # this also runs on Python 3.
        excluded_names = {
            name
            for name, mt in self.mime_map.items()
            if name == self.opf_name or mt == guess_type('a.ncx')
            or name.startswith('META-INF/') or name == 'mimetype'
        }
        raster_cover_name, titlepage_name = self.create_cover_page(
            input_fmt.lower())
        toc = get_toc(self).to_dict(count())
        spine = [name for name, is_linear in self.spine_names]
        spineq = frozenset(spine)
        # Keep only landmarks whose destination is a spine document.
        landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc': toc,
            'spine': spine,
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            # The next three are accumulated by manifest_data() below.
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
            'toc_anchor_map': toc_anchor_map(toc),
            'landmarks': landmarks,
            'link_to_map': {},
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name)
            self.dirty(name)
        self.transform_css()
        self.virtualized_names = set()
        self.virtualize_resources()

        def manifest_data(name):
            # Build the manifest entry for one file; for HTML files also
            # update the book-wide length totals and the has_maths flag.
            mt = (self.mime_map.get(name)
                  or 'application/octet-stream').lower()
            ans = {
                'size': os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype': mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = l = get_length(root)
                self.book_render_data['total_length'] += l
                if name in data['spine']:
                    self.book_render_data['spine_length'] += l
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
                ans['anchor_map'] = anchor_map(root)
            return ans

        data['files'] = {
            name: manifest_data(name)
            for name in set(self.name_path_map) - excluded_names
        }
        self.commit()
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'),
                   'wb') as f:
            f.write(
                json.dumps(self.book_render_data,
                           ensure_ascii=False).encode('utf-8'))
Ejemplo n.º 18
0
def convert(
    opf_path, opts, metadata=None, output_path=None, log=default_log,
    cover_data=None, report_progress=lambda x, y: None
):
    """Render the book described by ``opf_path`` into a single PDF.

    Every margin file of the container is rendered through a
    ``RenderManager`` and the resulting documents are appended into one PDF
    in order. An optional page-numbered table of contents, link fix-ups, an
    outline, headers/footers, font and image de-duplication, a cover and
    metadata are then applied to that document.

    :param opf_path: Path handed to ``Container``.
    :param opts: Options object; ``pdf_add_toc``, ``pdf_mark_links`` and
        ``uncompressed_pdf`` are read here, the rest is forwarded to helpers.
    :param metadata: Wrapped in ``PDFMetadata``; the PDF metadata is only
        updated when this is not ``None``.
    :param output_path: When ``None`` the serialized PDF is returned,
        otherwise it is written to this path and ``None`` is returned.
    :param log: Callable logger, also forwarded to the helpers.
    :param cover_data: When truthy, passed to ``add_cover``.
    :param report_progress: Called as ``report_progress(fraction, message)``.
    :raises SystemExit: If the renderer returns a non-``bytes`` result for a
        margin file; that result is used as the exit payload.
    """
    # --- Markup transformation -------------------------------------------
    container = Container(opf_path, log)
    report_progress(0.05, _('Parsed all content for markup transformation'))
    has_maths = add_maths_script(container)
    fix_fullscreen_images(container)

    name_anchor_map = make_anchors_unique(container, log)
    margin_files = tuple(create_margin_files(container))
    toc = get_toc(container, verify_destinations=False)
    has_toc = toc and len(toc)
    links_page_uuid = add_all_links(container, margin_files)
    container.commit()
    report_progress(0.1, _('Completed markup transformation'))

    # --- Render every margin file ----------------------------------------
    manager = RenderManager(opts, log, container.root)
    page_layout = get_page_layout(opts)
    render_jobs = [
        job_for_name(container, mf.name, mf.margins, page_layout)
        for mf in margin_files
    ]
    rendered = manager.convert_html_files(render_jobs, settle_time=1, has_maths=has_maths)

    pdf_doc = None
    anchor_locations = {}
    page_margins_map = []
    num_pages = 0
    for mf in margin_files:
        payload = rendered[mf.name]
        if not isinstance(payload, bytes):
            # Anything other than bytes is treated as a failure report.
            raise SystemExit(payload)
        doc = data_as_pdf_doc(payload)
        # Anchors are located relative to the first page this document will
        # occupy in the merged PDF (page numbers passed here are 1-based).
        first_page = num_pages + 1
        anchor_locations.update(get_anchor_locations(doc, first_page, links_page_uuid))
        doc_pages = doc.page_count()
        margins = resolve_margins(mf.margins, page_layout)
        page_margins_map.extend(repeat(margins, doc_pages))
        num_pages += doc_pages

        if pdf_doc is not None:
            pdf_doc.append(doc)
        else:
            pdf_doc = doc

    page_number_display_map = get_page_number_display_map(manager, opts, num_pages, log)

    # --- Optional page-numbered table of contents ------------------------
    if has_toc:
        annotate_toc(toc, anchor_locations, name_anchor_map, log)
        if opts.pdf_add_toc:
            tocname = create_skeleton(container)
            toc_root = container.parsed(tocname)
            add_pagenum_toc(toc_root, toc, opts, page_number_display_map)
            container.commit()
            toc_jobs = [job_for_name(container, tocname, None, page_layout)]
            toc_rendered = manager.convert_html_files(toc_jobs, settle_time=1)
            tocdoc = data_as_pdf_doc(toc_rendered[tocname])
            toc_margins = resolve_margins(None, page_layout)
            toc_pages = tocdoc.page_count()
            page_margins_map.extend(repeat(toc_margins, toc_pages))
            pdf_doc.append(tocdoc)

    report_progress(0.7, _('Rendered all HTML as PDF'))

    # --- Links and outline ------------------------------------------------
    fix_links(pdf_doc, anchor_locations, name_anchor_map, opts.pdf_mark_links, log)
    if toc and len(toc):
        add_toc(PDFOutlineRoot(pdf_doc), toc)
    report_progress(0.75, _('Added links to PDF content'))

    # --- Headers and footers ----------------------------------------------
    pdf_metadata = PDFMetadata(metadata)
    if has_toc:
        header_footer_toc = toc
    else:
        header_footer_toc = None
    add_header_footer(
        manager, opts, pdf_doc, container,
        page_number_display_map, page_layout, page_margins_map,
        pdf_metadata, report_progress, header_footer_toc)

    # --- Size reduction passes, run in this fixed order -------------------
    merge_fonts(pdf_doc)
    cleanup_passes = (
        (dedup_type3_fonts, 'duplicated Type3 glyphs'),
        (remove_unused_fonts, 'unused fonts'),
        (lambda document: document.dedup_images(), 'duplicate images'),
    )
    for run_pass, description in cleanup_passes:
        removed = run_pass(pdf_doc)
        if removed:
            log('Removed', removed, description)

    # --- Cover, metadata and output ---------------------------------------
    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)

    if metadata is not None:
        update_metadata(pdf_doc, pdf_metadata)
    report_progress(1, _('Updated metadata in PDF'))

    if opts.uncompressed_pdf:
        pdf_doc.uncompress()

    pdf_data = pdf_doc.write()
    if output_path is not None:
        with open(output_path, 'wb') as out:
            out.write(pdf_data)
        return None
    return pdf_data
Ejemplo n.º 19
0
    def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
        """Extract an e-book into ``tdir`` and write its render manifest.

        The book is unpacked with ``extract_book`` and the base container is
        initialised on the result. The spine documents are parsed and marked
        dirty, ``transform_css`` and ``virtualize_resources`` are run, and
        per-file data is collected for every non-excluded file. The container
        is then committed, the excluded files are deleted from disk, and the
        collected data is written as UTF-8 JSON to
        ``calibre-book-manifest.json`` inside ``self.root``.

        :param path_to_ebook: Path of the book, handed to ``extract_book``.
        :param tdir: Directory handed to ``extract_book`` and to
            ``ContainerBase.__init__``.
        :param log: Logger; ``default_log`` is used when this is falsy.
        :param book_hash: Stored unchanged under the ``book_hash`` key.
        """
        log = log or default_log
        book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
        ContainerBase.__init__(self, tdir, opfpath, log)
        # The NCX mime type is the same for every entry, so look it up once.
        ncx_mime = guess_type('a.ncx')
        # We do not add zero byte sized files as the IndexedDB API in the
        # browser has no good way to distinguish between zero byte files and
        # load failures.
        excluded_names = {
            name for name, mt in iteritems(self.mime_map) if
            name == self.opf_name or mt == ncx_mime or name.startswith('META-INF/') or
            name == 'mimetype' or not self.has_name_and_is_not_empty(name)}
        raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
        toc = get_toc(self).to_dict(count())
        spine = [name for name, is_linear in self.spine_names]
        spineq = frozenset(spine)
        # Only landmarks whose destination is a spine item are kept.
        landmarks = [
            landmark for landmark in get_landmarks(self)
            if landmark['dest'] in spineq
        ]

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc':toc,
            'spine':spine,
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
            'toc_anchor_map': toc_anchor_map(toc),
            'landmarks': landmarks,
            'link_to_map': {},
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name), self.dirty(name)
        self.transform_css()
        self.virtualized_names = set()
        self.virtualize_resources()

        # Set snapshot of the spine so that each membership test below is a
        # hash lookup rather than a scan of the spine list per HTML file.
        spine_lookup = frozenset(data['spine'])

        def manifest_data(name):
            # Build the manifest entry for one file and, for HTML files, fold
            # its length / maths flag into the book-wide totals.
            mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
            ans = {
                'size':os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype':mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = length = get_length(root)
                self.book_render_data['total_length'] += length
                if name in spine_lookup:
                    self.book_render_data['spine_length'] += length
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
                ans['anchor_map'] = anchor_map(root)
            return ans
        data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
        self.commit()
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        # json.dumps may return either text or bytes here, so encode only
        # when needed before writing in binary mode.
        raw = json.dumps(self.book_render_data, ensure_ascii=False)
        if not isinstance(raw, bytes):
            raw = raw.encode('utf-8')
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
            f.write(raw)
Ejemplo n.º 20
0
    def __init__(self,
                 path_to_ebook,
                 tdir,
                 log=None,
                 book_hash=None,
                 save_bookmark_data=False,
                 book_metadata=None):
        """Extract an e-book into ``tdir`` and write its render manifest.

        The book is unpacked with ``extract_book`` and the base container is
        initialised on the result. The spine documents are parsed and marked
        dirty, ``transform_css`` and ``virtualize_resources`` are run, and
        per-file data is collected for every non-excluded file. The container
        is then committed, the excluded files are deleted from disk, and the
        collected data is written as UTF-8 JSON to
        ``calibre-book-manifest.json`` inside ``self.root``.

        :param path_to_ebook: Path of the book, handed to ``extract_book``.
        :param tdir: Directory handed to ``extract_book`` and to
            ``ContainerBase.__init__``.
        :param log: Logger; ``default_log`` is used when this is falsy.
        :param book_hash: Stored unchanged under the ``book_hash`` key.
        :param save_bookmark_data: When true, the contents of
            ``META-INF/calibre_bookmarks.txt`` (or ``None`` if it does not
            exist) are kept in ``self.bookmark_data``. The attribute is not
            assigned at all when this flag is false.
        :param book_metadata: Stored unchanged as ``self.book_metadata``.
        """
        log = log or default_log
        book_fmt, opfpath, input_fmt = extract_book(path_to_ebook,
                                                    tdir,
                                                    log=log)
        ContainerBase.__init__(self, tdir, opfpath, log)
        self.book_metadata = book_metadata
        if save_bookmark_data:
            # Read the bookmarks now: everything under META-INF/ is excluded
            # below and deleted from disk at the end of this method.
            bm_file = 'META-INF/calibre_bookmarks.txt'
            self.bookmark_data = None
            if self.exists(bm_file):
                with self.open(bm_file, 'rb') as f:
                    self.bookmark_data = f.read()
        # The NCX mime type is the same for every entry, so look it up once.
        ncx_mime = guess_type('a.ncx')
        # We do not add zero byte sized files as the IndexedDB API in the
        # browser has no good way to distinguish between zero byte files and
        # load failures.
        excluded_names = {
            name
            for name, mt in iteritems(self.mime_map) if name == self.opf_name
            or mt == ncx_mime or name.startswith('META-INF/')
            or name == 'mimetype' or not self.has_name_and_is_not_empty(name)
        }
        raster_cover_name, titlepage_name = self.create_cover_page(
            input_fmt.lower())

        toc = get_toc(self).to_dict(count())
        if not toc or not toc.get('children'):
            # No usable ToC entries: build one from the h1/h2/h3 headings.
            toc = from_xpaths(self,
                              ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
        spine = [name for name, is_linear in self.spine_names]
        spineq = frozenset(spine)
        # Only landmarks whose destination is a spine item are kept.
        landmarks = [
            landmark for landmark in get_landmarks(self)
            if landmark['dest'] in spineq
        ]

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc': toc,
            'book_format': book_fmt,
            'spine': spine,
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
            'toc_anchor_map': toc_anchor_map(toc),
            'landmarks': landmarks,
            'link_to_map': {},
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name), self.dirty(name)
        self.transform_css()
        self.virtualized_names = set()
        self.virtualize_resources()

        # Set snapshot of the spine so that each membership test below is a
        # hash lookup rather than a scan of the spine list per HTML file.
        spine_lookup = frozenset(data['spine'])

        def manifest_data(name):
            # Build the manifest entry for one file and, for HTML files, fold
            # its length / maths flag into the book-wide totals.
            mt = (self.mime_map.get(name)
                  or 'application/octet-stream').lower()
            ans = {
                'size': os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype': mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = length = get_length(root)
                self.book_render_data['total_length'] += length
                if name in spine_lookup:
                    self.book_render_data['spine_length'] += length
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
                ans['anchor_map'] = anchor_map(root)
            return ans

        data['files'] = {
            name: manifest_data(name)
            for name in set(self.name_path_map) - excluded_names
        }
        self.commit()
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        # json.dumps may return either text or bytes here, so encode only
        # when needed before writing in binary mode.
        raw = json.dumps(self.book_render_data, ensure_ascii=False)
        if not isinstance(raw, bytes):
            raw = raw.encode('utf-8')
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'),
                   'wb') as f:
            f.write(raw)