Example #1
0
    def __init__(self, oeb, opts, resources):
        '''Run the full KF8 generation pipeline over the OEB book.

        :param oeb: The OEB book object to convert (its log is reused here)
        :param opts: Conversion options (dont_compress, etc. are read)
        :param resources: Pre-built resource records (images/fonts) for the
            MOBI container
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None]  # First flow item is reserved for the text
        self.records = [None]  # Placeholder for zeroth record

        # The order of these calls matters: markup must be duplicated and
        # rewritten before it is chunked, and the chunk/skeleton tables must
        # exist before text records and indices are built.
        self.log('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()
Example #2
0
    def __init__(self, oeb, opts, resources):
        '''Run the full KF8 generation pipeline over the OEB book.

        :param oeb: The OEB book object to convert (its log is reused here)
        :param opts: Conversion options (dont_compress, etc. are read)
        :param resources: Pre-built resource records (images/fonts) for the
            MOBI container
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None] # First flow item is reserved for the text
        self.records = [None] # Placeholder for zeroth record

        # The order of these calls matters: markup must be duplicated and
        # rewritten before it is chunked, and the chunk/skeleton tables must
        # exist before text records and indices are built.
        self.log.info('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log.info('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()
Example #3
0
class KF8Writer(object):

    '''Generate the KF8 (AZW3) sections of a book from its OEB
    representation: the text records, the FDST record, the skeleton/chunk
    indices, the NCX index and the guide. The entire conversion pipeline
    runs from __init__; afterwards the various self.*_records attributes
    hold the generated data.'''

    def __init__(self, oeb, opts, resources):
        '''Run the full KF8 generation pipeline over *oeb*.

        :param oeb: The OEB book object to convert (its log is reused here)
        :param opts: Conversion options (dont_compress, mobi_periodical,
            expand_css, ... are read)
        :param resources: Pre-built resource records (images/fonts) for the
            MOBI container
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None]  # First flow item is reserved for the text
        self.records = [None]  # Placeholder for zeroth record

        # The order of these calls matters: markup must be duplicated and
        # rewritten before it is chunked, and the chunk/skeleton tables must
        # exist before text records and indices are built.
        self.log('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
        affect KF8 output and not MOBI 6 output '''
        self._data_cache = {}
        # Suppress cssutils logging output as it is duplicated anyway earlier
        # in the pipeline
        cssutils.log.setLevel(logging.CRITICAL)
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                self._data_cache[item.href] = copy.deepcopy(item.data)
            elif item.media_type in OEB_STYLES:
                # I can't figure out how to make an efficient copy of the
                # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
                # exception)
                self._data_cache[item.href] = cssutils.parseString(
                    item.data.cssText, validate=False)

    def data(self, item):
        '''Return the KF8-private copy of *item*'s data made by dup_data(),
        falling back to the item's own data if no copy was made.'''
        return self._data_cache.get(item.href, item.data)

    def cleanup_markup(self):
        '''Strip markup from the spine that is useless or would conflict
        with the aid/cid attributes this writer adds later.'''
        for item in self.oeb.spine:
            root = self.data(item)

            # Remove empty script tags as they are pointless
            for tag in XPath('//h:script')(root):
                if not tag.text and not tag.get('src', False):
                    tag.getparent().remove(tag)

            # Remove [ac]id attributes as they are used by this code for anchor
            # to offset mapping
            for tag in XPath('//*[@aid or @cid]')(root):
                tag.attrib.pop('aid', None), tag.attrib.pop('cid', None)

    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''
        def pointer(item, oref):
            # Map a resource href to a kindle:embed URL; unknown hrefs are
            # passed through unchanged.
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                is_image = self.resources.records[idx - 1][:4] not in {b'FONT'}
                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
                    return 'kindle:embed:%s?mime=%s' % (
                        idx, self.resources.mime_map[ref])
                else:
                    return 'kindle:embed:%s' % idx
            return oref

        for item in self.oeb.manifest:

            if item.media_type in XML_DOCS:
                root = self.data(item)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in tag.attrib.iteritems():
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)

                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = cssutils.parseString(tag.text, validate=False)
                        replacer = partial(pointer, item)
                        cssutils.replaceUrls(sheet,
                                             replacer,
                                             ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n' + repl + '\n'

            elif item.media_type in OEB_STYLES:
                sheet = self.data(item)
                replacer = partial(pointer, item)
                cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)

    def extract_css_into_flows(self):
        '''Move all CSS (stylesheet files and inline <style> tags) out of the
        markup into separate KF8 flows, rewriting links and @import rules to
        kindle:flow URLs.'''
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated
        sheets = {}
        passthrough = getattr(self.opts, 'mobi_passthrough', False)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not passthrough and not self.opts.expand_css and hasattr(
                        item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)

        def fix_import_rules(sheet):
            # NOTE(review): this closure reads the loop variable `item` from
            # the enclosing scope at call time, so hrefs are resolved relative
            # to whichever item is current when it is called — confirm this is
            # intended for the manifest loop below.
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css' % idx
                        changed = True
            return changed

        for item in self.oeb.spine:
            root = self.data(item)

            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = cssutils.parseString(raw, validate=False)
                if fix_import_rules(sheet):
                    raw = force_unicode(sheet.cssText, 'utf-8')

                # Replace the inline <style> with a <link> to the flow that
                # will hold its CSS
                repl = etree.Element(XHTML('link'),
                                     type='text/css',
                                     rel='stylesheet')
                repl.tail = '\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        for raw, elems in inlines.iteritems():
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(sheet)

        # Serialize any remaining stylesheet objects in the flow list
        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')

    def extract_svg_into_flows(self):
        '''Move SVG (standalone files and inline <svg> elements) into
        separate flows, replacing inline <svg> with <img> tags that point at
        the flow.'''
        images = {}

        for item in self.oeb.manifest:
            if item.media_type == SVG_MIME:
                data = self.data(item)
                images[item.href] = len(self.flows)
                self.flows.append(
                    etree.tostring(data,
                                   encoding='UTF-8',
                                   with_tail=True,
                                   xml_declaration=True))

        for item in self.oeb.spine:
            root = self.data(item)

            for svg in XPath('//svg:svg')(root):
                raw = etree.tostring(svg, encoding=unicode, with_tail=False)
                idx = len(self.flows)
                self.flows.append(raw)
                p = svg.getparent()
                pos = p.index(svg)
                img = etree.Element(XHTML('img'),
                                    src="kindle:flow:%s?mime=image/svg+xml" %
                                    to_ref(idx))
                p.insert(pos, img)
                extract(svg)

            for img in XPath('//h:img[@src]')(root):
                src = img.get('src')
                abshref = item.abshref(src)
                idx = images.get(abshref, None)
                if idx is not None:
                    img.set('src',
                            'kindle:flow:%s?mime=image/svg+xml' % to_ref(idx))

    def replace_internal_links_with_placeholders(self):
        '''Replace hrefs that point into the book's own spine with
        kindle:pos placeholder URLs, recording them in self.link_map
        (placeholder -> (href, fragment)) for later resolution.'''
        self.link_map = {}
        count = 0
        hrefs = {item.href for item in self.oeb.spine}
        for item in self.oeb.spine:
            root = self.data(item)

            for a in XPath('//h:a[@href]')(root):
                count += 1
                ref = item.abshref(a.get('href'))
                href, _, frag = ref.partition('#')
                try:
                    href = urlnormalize(href)
                except ValueError:
                    # a non utf-8 quoted url? Since we cannot interpret it, pass it through.
                    pass
                if href in hrefs:
                    placeholder = 'kindle:pos:fid:0000:off:%s' % to_href(count)
                    self.link_map[placeholder] = (href, frag)
                    a.set('href', placeholder)

    def insert_aid_attributes(self):
        '''Tag elements in the spine with aid (or, inside tables, cid)
        attributes, and build self.id_map mapping (href, anchor-id) pairs to
        those attribute values so links and the ToC can be resolved to
        positions.'''
        self.id_map = {}
        cid = 0
        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            aidbase = i * int(1e6)
            j = 0

            def in_table(elem):
                # True if elem is (transitively) inside a <table>
                p = elem.getparent()
                if p is None:
                    return False
                if barename(p.tag).lower() == 'table':
                    return True
                return in_table(p)

            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                tagname = barename(tag.tag).lower()
                if id_ is not None or tagname in aid_able_tags:
                    if tagname == 'table' or in_table(tag):
                        # The Kindle renderer barfs on large tables that have
                        # aid on any of their tags. See
                        # https://bugs.launchpad.net/bugs/1489495
                        if id_:
                            cid += 1
                            val = 'c%d' % cid
                            self.id_map[(item.href, id_)] = val
                            tag.set('cid', val)
                    else:
                        aid = to_base(aidbase + j, base=32)
                        tag.set('aid', aid)
                        if tag.tag == XHTML('body'):
                            self.id_map[(item.href, '')] = aid
                        if id_ is not None:
                            self.id_map[(item.href, id_)] = aid

                        j += 1

    def chunk_it_up(self):
        '''Run the Chunker over the spine, resolving link placeholders to
        aids, and copy its skeleton/chunk tables and the final text onto
        self.'''
        placeholder_map = {}
        for placeholder, x in self.link_map.iteritems():
            href, frag = x
            aid = self.id_map.get(x, None)
            if aid is None:
                # Fall back to the document-start entry for the same href
                aid = self.id_map.get((href, ''))
            placeholder_map[placeholder] = aid
        chunker = Chunker(self.oeb, self.data, placeholder_map)

        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
            setattr(self, x, getattr(chunker, x))

        self.flows[0] = chunker.text

    def create_text_records(self):
        '''Serialize all flows into text records (optionally PalmDoc
        compressed), appending them to self.records and recording the
        uncompressed lengths for trailing byte sequence generation.'''
        self.flows = [
            x.encode('utf-8') if isinstance(x, unicode) else x
            for x in self.flows
        ]
        text = b''.join(self.flows)
        self.text_length = len(text)
        text = BytesIO(text)
        nrecords = 0
        records_size = 0
        self.uncompressed_record_lengths = []

        if self.compress:
            self.oeb.logger.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
            self.uncompressed_record_lengths.append(len(data))
            if self.compress:
                data = compress_doc(data)

            # Multibyte overlap bytes plus a trailing byte giving their count
            data += overlap
            data += pack(b'>B', len(overlap))

            self.records.append(data)
            records_size += len(data)
            nrecords += 1

        self.last_text_record_idx = nrecords
        self.first_non_text_record_idx = nrecords + 1
        # Pad so that the next records starts at a 4 byte boundary
        # NOTE(review): appending (records_size % 4) bytes only lands on a 4
        # byte boundary when the remainder is even; 4 - (records_size % 4)
        # would always do so — confirm against the format spec before
        # changing, as readers may tolerate this.
        if records_size % 4 != 0:
            self.records.append(b'\x00' * (records_size % 4))
            self.first_non_text_record_idx += 1

    def create_fdst_records(self):
        '''Build the FDST record mapping each flow to its (start, end) byte
        range within the concatenated text.'''
        FDST = namedtuple('Flow', 'start end')
        entries = []
        self.fdst_table = []
        for i, flow in enumerate(self.flows):
            start = 0 if i == 0 else self.fdst_table[-1].end
            self.fdst_table.append(FDST(start, start + len(flow)))
            entries.extend(self.fdst_table[-1])
        rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
               pack(b'>%dL' % len(entries), *entries))
        self.fdst_records = [rec]
        self.fdst_count = len(self.fdst_table)

    def create_indices(self):
        '''Build the skeleton, chunk and NCX index records. The NCX is built
        from the (flattened, linearized) ToC; if the book has no ToC, no NCX
        is generated.'''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        if toc.count() < 1:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return

        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {
                'id': id(item),
                'index': i,
                'label': (item.title or _('Unknown')),
                'children': []
            }
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            for child in item:
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                # Destination unknown: point at the start of the first chunk
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]

            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset

        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        # NOTE(review): is_non_linear is unconditionally overridden to False
        # below, so the `original` comparison is currently dead code kept for
        # reference.
        original = sorted(entries,
                          key=lambda entry: (entry['depth'], entry['index']))
        linearized = sorted(entries,
                            key=lambda entry:
                            (entry['depth'], entry['offset']))
        is_non_linear = original != linearized
        entries = linearized
        is_non_linear = False  # False as we are using the linearized entries

        if is_non_linear:
            for entry in entries:
                entry['kind'] = 'chapter'

        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']: entry['index'] for entry in entries}

        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]

        # Write the lengths
        def get_next_start(entry):
            # Offset of the next sibling-or-shallower entry, or end of text
            enders = [
                e['offset'] for e in entries if e['depth'] <= entry['depth']
                and e['offset'] > entry['offset']
            ]
            if enders:
                return min(enders)
            return len(self.flows[0])

        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']

        self.has_tbs = apply_trailing_byte_sequences(
            entries, self.records, self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()

    def create_guide(self):
        '''Build the guide index records from the OEB guide, resolving each
        reference to a (pos, fid) pair; also records self.start_offset for
        the book's start-reading-at location.'''
        self.start_offset = None
        self.guide_table = []
        self.guide_records = []
        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
        for ref in self.oeb.guide.values():
            href, frag = ref.href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''))
            if aid is None:
                # Reference points outside the text, skip it
                continue
            pos, fid, offset = self.aid_offset_map[aid]
            if is_guide_ref_start(ref):
                self.start_offset = offset
            self.guide_table.append(
                GuideRef(ref.title or _('Unknown'), ref.type, (pos, fid)))

        if self.guide_table:
            self.guide_table.sort(key=lambda x: x.type)  # Needed by the Kindle
            self.guide_records = GuideIndex(self.guide_table)()
Example #4
0
class KF8Writer(object):

    def __init__(self, oeb, opts, resources):
        '''Run the full KF8 generation pipeline over the OEB book.

        :param oeb: The OEB book object to convert (its log is reused here)
        :param opts: Conversion options (dont_compress, etc. are read)
        :param resources: Pre-built resource records (images/fonts) for the
            MOBI container
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None] # First flow item is reserved for the text
        self.records = [None] # Placeholder for zeroth record

        # The order of these calls matters: markup must be duplicated and
        # rewritten before it is chunked, and the chunk/skeleton tables must
        # exist before text records and indices are built.
        self.log.info('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log.info('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
        affect KF8 output and not MOBI 6 output '''
        # href -> cloned data; consulted by self.data()
        self._data_cache = {}
        # Suppress cssutils logging output as it is duplicated anyway earlier
        # in the pipeline
        cssutils.log.setLevel(logging.CRITICAL)
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                self._data_cache[item.href] = copy.deepcopy(item.data)
            elif item.media_type in OEB_STYLES:
                # I can't figure out how to make an efficient copy of the
                # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
                # exception)
                self._data_cache[item.href] = cssutils.parseString(
                        item.data.cssText, validate=False)

    def data(self, item):
        return self._data_cache.get(item.href, item.data)

    def cleanup_markup(self):
        '''Strip pointless markup from every spine document.

        Removes <script> tags that have neither inline text nor a src
        attribute, as they can do nothing.'''
        for spine_item in self.oeb.spine:
            doc_root = self.data(spine_item)

            for script in XPath('//h:script')(doc_root):
                if script.text or script.get('src', False):
                    continue
                script.getparent().remove(script)

    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''

        def pointer(item, oref):
            # Map a resource href to a kindle:embed URL; unknown hrefs are
            # passed through unchanged.
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
                    return 'kindle:embed:%s?mime=%s'%(idx,
                            self.resources.mime_map[ref])
                else:
                    return 'kindle:embed:%s'%idx
            return oref

        for item in self.oeb.manifest:

            if item.media_type in XML_DOCS:
                root = self.data(item)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in tag.attrib.iteritems():
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)

                # Rewrite url() references inside inline <style> tags as well
                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = cssutils.parseString(tag.text, validate=False)
                        replacer = partial(pointer, item)
                        cssutils.replaceUrls(sheet, replacer,
                                ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n'+ repl + '\n'

            elif item.media_type in OEB_STYLES:
                sheet = self.data(item)
                replacer = partial(pointer, item)
                cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)

    def extract_css_into_flows(self):
        '''Move all CSS (stylesheet files and inline <style> tags) out of the
        markup into separate KF8 flows, rewriting <link> hrefs to
        kindle:flow URLs.'''
        inlines = defaultdict(list) # Ensure identical <style>s not repeated
        sheets = {}

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                data = self.data(item).cssText
                sheets[item.href] = len(self.flows)
                self.flows.append(force_unicode(data, 'utf-8'))

        for item in self.oeb.spine:
            root = self.data(item)

            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                # Replace the inline <style> with a <link> to the flow that
                # will hold its CSS
                repl = etree.Element(XHTML('link'), type='text/css',
                        rel='stylesheet')
                repl.tail='\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        for raw, elems in inlines.iteritems():
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

    def extract_svg_into_flows(self):
        '''Move SVG (standalone files and inline <svg> elements) into
        separate flows, replacing inline <svg> with <img> tags that point at
        the flow.'''
        images = {}

        for item in self.oeb.manifest:
            if item.media_type == SVG_MIME:
                data = self.data(item)
                images[item.href] = len(self.flows)
                self.flows.append(etree.tostring(data, encoding='UTF-8',
                    with_tail=True, xml_declaration=True))

        for item in self.oeb.spine:
            root = self.data(item)

            for svg in XPath('//svg:svg')(root):
                raw = etree.tostring(svg, encoding=unicode, with_tail=False)
                idx = len(self.flows)
                self.flows.append(raw)
                p = svg.getparent()
                pos = p.index(svg)
                img = etree.Element(XHTML('img'),
                        src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
                p.insert(pos, img)
                extract(svg)

            # Point <img> tags at standalone SVG files to their flow
            for img in XPath('//h:img[@src]')(root):
                src = img.get('src')
                abshref = item.abshref(src)
                idx = images.get(abshref, None)
                if idx is not None:
                    img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
                            to_ref(idx))

    def replace_internal_links_with_placeholders(self):
        '''Replace hrefs that point into the book's own spine with
        kindle:pos placeholder URLs, recording them in self.link_map
        (placeholder -> (href, fragment)) for later resolution to real
        positions after chunking.'''
        self.link_map = {}
        count = 0
        hrefs = {item.href for item in self.oeb.spine}
        for item in self.oeb.spine:
            root = self.data(item)

            for a in XPath('//h:a[@href]')(root):
                count += 1
                ref = item.abshref(a.get('href'))
                href, _, frag = ref.partition('#')
                try:
                    href = urlnormalize(href)
                except ValueError:
                    # a non utf-8 quoted url? Since we cannot interpret it,
                    # pass it through unchanged instead of aborting the
                    # conversion.
                    pass
                if href in hrefs:
                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                    self.link_map[placeholder] = (href, frag)
                    a.set('href', placeholder)

    def insert_aid_attributes(self):
        '''Tag elements in the spine with aid attributes and build
        self.id_map mapping (href, anchor-id) pairs to those aid values so
        links and the ToC can be resolved to positions.'''
        self.id_map = {}
        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            # Each spine item gets its own aid number space
            aidbase = i * int(1e6)
            j = 0
            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                if id_ is not None or barename(tag.tag).lower() in aid_able_tags:
                    aid = aidbase + j
                    tag.attrib['aid'] = to_base(aid, base=32)
                    if tag.tag == XHTML('body'):
                        # Empty fragment maps to the document body
                        self.id_map[(item.href, '')] = tag.attrib['aid']
                    if id_ is not None:
                        self.id_map[(item.href, id_)] = tag.attrib['aid']

                    j += 1

    def chunk_it_up(self):
        '''Run the Chunker over the spine, resolving the link placeholders to
        aid values, and copy the resulting skeleton/chunk tables and the
        final text onto self.'''
        def resolve(target):
            # Exact (href, fragment) match first, then fall back to the
            # document-start entry for the same href.
            exact = self.id_map.get(target, None)
            if exact is not None:
                return exact
            return self.id_map.get((target[0], ''))

        placeholder_map = {}
        for placeholder, target in self.link_map.iteritems():
            placeholder_map[placeholder] = resolve(target)

        chunker = Chunker(self.oeb, self.data, placeholder_map)

        self.skel_table = chunker.skel_table
        self.chunk_table = chunker.chunk_table
        self.aid_offset_map = chunker.aid_offset_map

        self.flows[0] = chunker.text

    def create_text_records(self):
        '''Serialize all flows into text records (optionally PalmDoc
        compressed), appending them to self.records and recording the
        uncompressed lengths for trailing byte sequence generation.'''
        self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
                in self.flows]
        text = b''.join(self.flows)
        self.text_length = len(text)
        text = BytesIO(text)
        nrecords = 0
        records_size = 0
        self.uncompressed_record_lengths = []

        if self.compress:
            self.oeb.logger.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
            self.uncompressed_record_lengths.append(len(data))
            if self.compress:
                data = compress_doc(data)

            # Multibyte overlap bytes plus a trailing byte giving their count
            data += overlap
            data += pack(b'>B', len(overlap))

            self.records.append(data)
            records_size += len(data)
            nrecords += 1

        self.last_text_record_idx = nrecords
        self.first_non_text_record_idx = nrecords + 1
        # Pad so that the next records starts at a 4 byte boundary
        # NOTE(review): appending (records_size % 4) bytes only lands on a 4
        # byte boundary when the remainder is even; 4 - (records_size % 4)
        # would always do so — confirm against the format spec before changing.
        if records_size % 4 != 0:
            self.records.append(b'\x00'*(records_size % 4))
            self.first_non_text_record_idx += 1

    def create_fdst_records(self):
        ''' Build the FDST record, which maps every flow to its (start, end)
        byte offsets within the concatenated text. Offsets are cumulative:
        each flow starts where the previous one ended. '''
        FDST = namedtuple('Flow', 'start end')
        self.fdst_table = []
        pos = 0
        for flow in self.flows:
            entry = FDST(pos, pos + len(flow))
            self.fdst_table.append(entry)
            pos = entry.end
        offsets = []
        for entry in self.fdst_table:
            offsets.extend(entry)
        # Record layout: magic, header length (12), entry count, then the
        # flattened big-endian (start, end) pairs
        header = b'FDST' + pack(b'>LL', 12, len(self.fdst_table))
        self.fdst_records = [header + pack(b'>%dL' % len(offsets), *offsets)]
        self.fdst_count = len(self.fdst_table)

    def create_indices(self):
        '''Build the skeleton, chunk and NCX index records.

        The skeleton and chunk indices are generated from the tables
        produced by chunk_it_up(). The NCX index is built by flattening
        the ToC depth-first, resolving each entry's href to an aid-based
        (pos, fid) location and byte offset, linearizing the entries by
        (depth, offset), wiring up parent/child links, computing entry
        lengths, and finally applying trailing byte sequences to the text
        records. If the ToC has fewer than two entries, no NCX is
        produced.
        '''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        if toc.count() < 2:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return

        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {'id': id(item), 'index': i, 'label':(item.title or
                _('Unknown')), 'children':[]}
            # ncx_hlvl/ncx_parent are stamped onto child nodes below as the
            # depth-first walk proceeds, so they are set by the time each
            # child is visited
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            for child in item:
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            # Resolve the entry's target location: exact fragment first,
            # then the file start, then chunk 0 as a last resort
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]

            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset

        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        # NOTE(review): `original`/`is_non_linear` below are computed but then
        # overridden — the linearized ordering is always used, so the
        # NonLinearNCXIndex branch further down is currently dead code.
        original = sorted(entries,
                key=lambda entry: (entry['depth'], entry['index']))
        linearized = sorted(entries,
                key=lambda entry: (entry['depth'], entry['offset']))
        is_non_linear = original != linearized
        entries = linearized
        is_non_linear = False # False as we are using the linearized entries

        if is_non_linear:
            for entry in entries:
                entry['kind'] = 'chapter'

        # Re-number the entries to match their (linearized) order
        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']:entry['index'] for entry in entries}

        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]

        # Write the lengths: each entry runs until the next entry at the same
        # or a shallower depth, or to the end of the main text flow
        def get_next_start(entry):
            enders = [e['offset'] for e in entries if e['depth'] <=
                    entry['depth'] and e['offset'] > entry['offset']]
            if enders:
                return min(enders)
            return len(self.flows[0])
        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']

        self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
                self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()

    def create_guide(self):
        ''' Build the guide index records from the OEB guide, resolving each
        reference's href to an aid-based (pos, fid) location. References
        whose target cannot be resolved are dropped. Also records the
        offset of the book's start reference, if any. '''
        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
        self.start_offset = None
        self.guide_table = []
        self.guide_records = []
        for ref in self.oeb.guide.values():
            href, frag = ref.href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                # Fragment unknown, fall back to the start of the file
                aid = self.id_map.get((href, ''))
            if aid is None:
                # Target not present in the book at all, skip this reference
                continue
            pos, fid, offset = self.aid_offset_map[aid]
            if is_guide_ref_start(ref):
                self.start_offset = offset
            title = ref.title or _('Unknown')
            self.guide_table.append(GuideRef(title, ref.type, (pos, fid)))

        if self.guide_table:
            # The Kindle requires guide entries to be sorted by type
            self.guide_table.sort(key=lambda gref: gref.type)
            self.guide_records = GuideIndex(self.guide_table)()
    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''Convert an OEB book into an EPUB file written to output_path.

        Runs the EPUB transform pipeline: optional inline ToC insertion,
        filename flattening or uniquification, ADE/WebKit/Sony quirk
        workarounds, markup upshifting, image rescaling, flow splitting,
        and cover management. Ensures a UUID identifier exists (required
        for font obfuscation), renders the book to a temporary directory
        via the OEB output plugin, optionally generates font encryption
        and Sony periodical metadata, and packages everything into the
        EPUB container. If opts.extract_to is set, the finished EPUB is
        also unzipped to that directory for inspection.
        '''
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from calibre.ebooks.mobi.writer8.toc import TOCAdder
            # Reuse the MOBI inline-ToC machinery to inject a ToC page
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb,
                     opts,
                     replace_previous_inline_toc=True,
                     ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from calibre.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

        from calibre.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                      max_flow_size=self.opts.flow_size * 1024)
        split(self.oeb, self.opts)

        from calibre.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
            no_default_cover=self.opts.no_default_epub_cover,
            no_svg_cover=self.opts.no_svg_cover,
            preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        # Some readers misbehave with an empty NCX, so synthesize a
        # one-entry ToC pointing at the first spine item
        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                          'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add(_('Start'), first.href)

        from calibre.ebooks.oeb.base import OPF
        identifiers = oeb.metadata['identifier']
        uuid = None
        # NOTE(review): x.get(OPF('scheme'), None) can return None, in which
        # case .lower() raises AttributeError before the urn:uuid: fallback
        # is ever checked — confirm whether scheme-less identifiers occur
        for x in identifiers:
            if x.get(OPF('scheme'),
                     None).lower() == 'uuid' or str(x).startswith('urn:uuid:'):
                uuid = str(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if uuid is None:
            self.log.warn('No UUID identifier found')
            from uuid import uuid4
            uuid = str(uuid4())
            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            # NOTE(review): uuid was built via split(':')[-1] above so it can
            # never start with 'urn:uuid:'; this condition looks always true
            # when encrypted_fonts is non-empty — verify intent
            for x in identifiers:
                if str(x) == uuid:
                    x.content = 'urn:uuid:' + uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
            if self.is_periodical:
                # Sony readers want periodical metadata in atom.xml /
                # META-INF/metadata.xml
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from calibre.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml',
                                      atom_xml)]
            # Render the book to an OPF + resources tree in tdir
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([
                os.path.join(tdir, x) for x in os.listdir(tdir)
                if x.endswith('.ncx')
            ][0])
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

            from calibre.ebooks.epub import initialize_container
            with initialize_container(output_path,
                                      os.path.basename(opf),
                                      extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml', encryption)
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                                  metadata_xml.encode('utf-8'))
            # Optionally unpack the finished EPUB for debugging/inspection
            if opts.extract_to is not None:
                from calibre.utils.zipfile import ZipFile
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)