Exemple #1
0
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    imgnames = []
    for mh in mhlst:

        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."

        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, 'wb').write(mh.header)

        # process each mobi header
        if mh.isEncrypted():
            raise unpackException('file is encrypted')

        # build up the metadata
        metadata = mh.getMetaData()
        metadata['Language'] = mh.Language()
        metadata['Title'] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata['Codec'] = [mh.codec]
        metadata['UniqueID'] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata

        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = '.rawml'
            if mh.isK8():
                outraw = os.path.join(files.k8dir,
                                      files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = '.rawpr'
                    outraw = os.path.join(files.outdir,
                                          files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir,
                                          files.getInputFileBasename() + ext)
            file(outraw, 'wb').write(rawML)

        # process additional sections that represent images, resources, fonts, and etc
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                    fname = "%05d" % (1 + i - beg)
                    fname = type + fname
                    if mh.isK8():
                        fname += "_K8"
                    fname += '.dat'
                    outname = os.path.join(files.outdir, fname)
                    file(outname, 'wb').write(data)
                    print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print "    Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, 'wb').write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3:  'FONT'
                # bytes  4 -  7:  uncompressed size
                # bytes  8 - 11:  flags
                #                     bit 0x0001 - zlib compression
                #                     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15:  offset to start of compressed font data
                # bytes 16 - 19:  length of xor string stored before the start of the comnpress font data
                # bytes 19 - 23:  start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(
                    '>LLLLL', data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start:xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from('>I', font_data,
                                                      len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits,
                                                    usize)
                        if len(font_data) != usize:
                            print 'Font Decompression Error: Uncompressed font size mismatch'
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = (zlib.adler32(font_data) & 0xffffffff)
                            if sig != adler32:
                                print 'Font Decompression Error'
                                print 'Adler checksum did not match. Stored: %d Calculated: %d' % (
                                    adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                hdr = font_data[0:4]
                if hdr == '\0\1\0\0' or hdr == 'true' or hdr == 'ttcf':
                    ext = '.ttf'
                elif hdr == 'OTTO':
                    ext = '.otf'
                else:
                    print "Warning: unknown font header %s" % hdr.encode('hex')
                    ext = '.dat'
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print "    extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, 'wb').write(font_data)
                imgnames.append(fontname)
                continue

            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print "    extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, 'wb').write(data)
                imgnames.append(None)
                continue

            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue

            # if reach here should be an image but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, 'wb').write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print "    extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, 'wb').write(data)
                imgnames.append(imgname)

        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print 'Error processing Print Replica: ' + str(e)
            filenames.append(['', files.getInputFileBasename() + '.pdf'])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = 'used'
            opf = OPFProcessor(files, metadata, filenames, imgnames, False, mh,
                               usedmap)
            opf.writeOPF()
            continue

        if mh.isK8():
            # K8 mobi
            # require other indexes which contain parsing information and the FDST info
            # to process the rawml back into the xhtml files, css files, svg image files, etc
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)

            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if 'StartOffset' in metadata.keys():
                starts = metadata['StartOffset']
                last_start = starts.pop()
                if int(last_start) == 0xffffffff:
                    last_start = '0'
                filename, partnum, beg, end = k8proc.getFileInfo(
                    int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != '':
                    linktgt += '#' + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt

            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()

            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4,
                 off] = ncxmap['pos_fid'].split(':')
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap['filename'] = filename
                ncxmap['idtag'] = idtag
                ncx_data[i] = ncxmap

            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)

            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()

            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end,
                 aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, 'wb').write(part)
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == 'file':
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, 'wb').write(flowpart)

            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX,
                               mh, usedmap, guidetext)

            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()

            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)

        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)

            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata['DictInLanguage'] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata['DictOutLanguage'] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()

            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []

            # write the proper mobi html
            fname = files.getInputFileBasename() + '.html'
            filenames.append(['', fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, 'wb').write(srctext)

            # create an OPF
            # extract guidetext from srctext
            guidetext = ''
            guidematch = re.search(r'''<guide>(.*)</guide>''', srctext,
                                   re.IGNORECASE + re.DOTALL)
            if guidematch:
                replacetext = r'''href="''' + filenames[0][
                    1] + r'''#filepos\1"'''
                guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''',
                                   replacetext, guidematch.group(1))
                guidetext += '\n'
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX,
                               mh, usedmap, guidetext)
            opf.writeOPF()
def processMobi7(mh, metadata, sect, files, rscnames):
    """Unpack an original Mobi book into ``book.html``, a toc.ncx and an OPF.

    Args:
        mh: mobi header object; supplies the raw markup, codec and
            dictionary information.
        metadata: metadata mapping. For dictionaries the keys
            ``DictInLanguage`` / ``DictOutLanguage`` are added in place.
        sect: section reader, passed on to ``dictSupport``.
        files: output helper; ``mobi7dir`` and ``getInputFileBasename()``
            are used here.
        rscnames: resource names passed on to ``HTMLProcessor`` and
            ``OPFProcessor``.
    """
    global DUMP
    global WRITE_RAW_DATA
    # An original Mobi
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.mobi7dir,
                              files.getInputFileBasename() + '.rawml')
        with open(pathof(outraw), 'wb') as rawfile:
            rawfile.write(rawML)

    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    ncx.writeNCX(metadata)

    positionMap = {}

    # if Dictionary build up the positionMap
    if mh.isDictionary():
        if mh.DictInLanguage():
            metadata['DictInLanguage'] = [mh.DictInLanguage()]
        if mh.DictOutLanguage():
            metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
        positionMap = dictSupport(mh, sect).getPositionMap()

    # convert the rawml back to Mobi ml
    proc = HTMLProcessor(files, metadata, rscnames)
    # findAnchors must run before insertHREFS; its return value is
    # superseded by the text insertHREFS returns
    proc.findAnchors(rawML, ncx_data, positionMap)
    srctext, usedmap = proc.insertHREFS()

    # write the proper mobi html
    fileinfo = []
    fname = 'book.html'
    fileinfo.append([None, '', fname])
    outhtml = os.path.join(files.mobi7dir, fname)
    with open(pathof(outhtml), 'wb') as htmlfile:
        htmlfile.write(srctext)

    # extract guidetext from srctext
    guidetext = ''
    pagemapxml = ''
    guidematch = re.search(r'''<guide>(.*)</guide>''', srctext,
                           re.IGNORECASE | re.DOTALL)
    if guidematch:
        guidetext = guidematch.group(1)
        # sometimes old mobi guide from srctext horribly written so need to clean up
        guidetext = guidetext.replace("\r", "")
        guidetext = guidetext.replace('<REFERENCE', '<reference')
        guidetext = guidetext.replace(' HREF=', ' href=')
        guidetext = guidetext.replace(' TITLE=', ' title=')
        guidetext = guidetext.replace(' TYPE=', ' type=')
        # reference must be a self-closing tag
        # and any href must be replaced with filepos information
        ref_tag_pattern = re.compile(r'''(<reference [^>]*>)''', re.IGNORECASE)
        # split() with a capturing group puts the matched tags at odd indexes
        guidepieces = ref_tag_pattern.split(guidetext)
        for i in range(1, len(guidepieces), 2):
            reftag = guidepieces[i]
            # remove any href there now to replace with filepos
            reftag = re.sub(r'''href\s*=[^'"]*['"][^'"]*['"]''', '', reftag)
            # make sure the reference tag ends properly
            if not reftag.endswith("/>"):
                reftag = reftag[0:-1] + "/>"
            # always store the cleaned tag, so the old href is also dropped
            # from tags that were already self-closing
            guidepieces[i] = reftag
        guidetext = "".join(guidepieces)
        replacetext = r'''href="''' + fileinfo[0][2] + r'''#filepos\1"'''
        guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''',
                           replacetext, guidetext)
        guidetext += '\n'
        if isinstance(guidetext, unicode):
            # already decoded: calling decode() here would first force an
            # implicit ASCII encode and fail on non-ASCII text
            guidetext = guidetext.encode("utf-8")
        else:
            guidetext = unicode(guidetext, mh.codec).encode("utf-8")

    if 'StartOffset' in metadata:
        # the last listed offset wins; an empty list adds no guide entry
        starting_offset = None
        for value in metadata['StartOffset']:
            if int(value) == 0xffffffff:
                value = '0'
            starting_offset = value
        if starting_offset is not None:
            # get guide items from metadata
            metaguidetext = '<reference type="text" href="' + fileinfo[0][
                2] + '#filepos' + starting_offset + '" />\n'
            guidetext += metaguidetext

    # create an OPF
    opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh,
                       usedmap, pagemapxml, guidetext)
    opf.writeOPF()
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    imgnames = []
    for mh in mhlst:

        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."

        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, "wb").write(mh.header)

        # process each mobi header
        if mh.isEncrypted():
            raise unpackException("file is encrypted")

        # build up the metadata
        metadata = mh.getMetaData()
        metadata["Language"] = mh.Language()
        metadata["Title"] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata["Codec"] = [mh.codec]
        metadata["UniqueID"] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata

        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = ".rawml"
            if mh.isK8():
                outraw = os.path.join(files.k8dir, files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = ".rawpr"
                    outraw = os.path.join(files.outdir, files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + ext)
            file(outraw, "wb").write(rawML)

        # process additional sections that represent images, resources, fonts, and etc
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                    fname = "%05d" % (1 + i - beg)
                    fname = type + fname
                    if mh.isK8():
                        fname += "_K8"
                    fname += ".dat"
                    outname = os.path.join(files.outdir, fname)
                    file(outname, "wb").write(data)
                    print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print "    Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, "wb").write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3:  'FONT'
                # bytes  4 -  7:  uncompressed size
                # bytes  8 - 11:  flags
                #                     bit 0x0001 - zlib compression
                #                     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15:  offset to start of compressed font data
                # bytes 16 - 19:  length of xor string stored before the start of the comnpress font data
                # bytes 19 - 23:  start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(">LLLLL", data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start : xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from(">I", font_data, len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
                        if len(font_data) != usize:
                            print "Font Decompression Error: Uncompressed font size mismatch"
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = zlib.adler32(font_data) & 0xFFFFFFFF
                            if sig != adler32:
                                print "Font Decompression Error"
                                print "Adler checksum did not match. Stored: %d Calculated: %d" % (adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                hdr = font_data[0:4]
                if hdr == "\0\1\0\0" or hdr == "true" or hdr == "ttcf":
                    ext = ".ttf"
                elif hdr == "OTTO":
                    ext = ".otf"
                else:
                    print "Warning: unknown font header %s" % hdr.encode("hex")
                    ext = ".dat"
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                if (ext == ".ttf" or ext == ".otf") and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print "    extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, "wb").write(font_data)
                imgnames.append(fontname)
                continue

            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print "    extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, "wb").write(data)
                imgnames.append(None)
                continue

            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue

            # if reach here should be an image but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, "wb").write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print "    extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, "wb").write(data)
                imgnames.append(imgname)

        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print "Error processing Print Replica: " + str(e)
            filenames.append(["", files.getInputFileBasename() + ".pdf"])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = "used"
            opf = OPFProcessor(files, metadata, filenames, imgnames, False, mh, usedmap)
            opf.writeOPF()
            continue

        if mh.isK8():
            # K8 mobi
            # require other indexes which contain parsing information and the FDST info
            # to process the rawml back into the xhtml files, css files, svg image files, etc
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)

            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if "StartOffset" in metadata.keys():
                starts = metadata["StartOffset"]
                last_start = starts.pop()
                if int(last_start) == 0xFFFFFFFF:
                    last_start = "0"
                filename, partnum, beg, end = k8proc.getFileInfo(int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != "":
                    linktgt += "#" + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt

            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()

            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4, off] = ncxmap["pos_fid"].split(":")
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap["filename"] = filename
                ncxmap["idtag"] = idtag
                ncx_data[i] = ncxmap

            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)

            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()

            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, "wb").write(part)
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == "file":
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, "wb").write(flowpart)

            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)

            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()

            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)

        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)

            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata["DictInLanguage"] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata["DictOutLanguage"] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()

            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []

            # write the proper mobi html
            fname = files.getInputFileBasename() + ".html"
            filenames.append(["", fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, "wb").write(srctext)

            # create an OPF
            # extract guidetext from srctext
            guidetext = ""
            guidematch = re.search(r"""<guide>(.*)</guide>""", srctext, re.IGNORECASE + re.DOTALL)
            if guidematch:
                replacetext = r'''href="''' + filenames[0][1] + r'''#filepos\1"'''
                guidetext = re.sub(r"""filepos=['"]{0,1}0*(\d+)['"]{0,1}""", replacetext, guidematch.group(1))
                guidetext += "\n"
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)
            opf.writeOPF()
def processMobi7(mh, metadata, sect, files, imgnames):
    """Unpack an original Mobi book into an html file, a toc.ncx and an OPF.

    The html file is named after the input file's basename.

    Args:
        mh: mobi header object; supplies the raw markup, codec and
            dictionary information.
        metadata: metadata mapping. For dictionaries the keys
            ``DictInLanguage`` / ``DictOutLanguage`` are added in place.
        sect: section reader, passed on to ``dictSupport``.
        files: output helper; ``mobi7dir`` and ``getInputFileBasename()``
            are used here.
        imgnames: image names passed on to ``HTMLProcessor`` and
            ``OPFProcessor``.
    """
    global DUMP
    global WRITE_RAW_DATA
    # An original Mobi
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + '.rawml')
        with open(pathof(outraw), 'wb') as rawfile:
            rawfile.write(rawML)

    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    ncx.writeNCX(metadata)

    positionMap = {}

    # if Dictionary build up the positionMap
    if mh.isDictionary():
        if mh.DictInLanguage():
            metadata['DictInLanguage'] = [mh.DictInLanguage()]
        if mh.DictOutLanguage():
            metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
        positionMap = dictSupport(mh, sect).getPositionMap()

    # convert the rawml back to Mobi ml
    proc = HTMLProcessor(files, metadata, imgnames)
    # findAnchors must run before insertHREFS; its return value is
    # superseded by the text insertHREFS returns
    proc.findAnchors(rawML, ncx_data, positionMap)
    srctext, usedmap = proc.insertHREFS()

    # write the proper mobi html
    fileinfo = []
    fname = files.getInputFileBasename() + '.html'
    fileinfo.append([None, '', fname])
    outhtml = os.path.join(files.mobi7dir, fname)
    with open(pathof(outhtml), 'wb') as htmlfile:
        htmlfile.write(srctext)

    # extract guidetext from srctext
    guidetext = ''
    pagemapxml = ''
    guidematch = re.search(r'''<guide>(.*)</guide>''', srctext, re.IGNORECASE | re.DOTALL)
    if guidematch:
        guidetext = guidematch.group(1)
        # sometimes old mobi guide from srctext horribly written so need to clean up
        guidetext = guidetext.replace("\r", "")
        guidetext = guidetext.replace('<REFERENCE', '<reference')
        guidetext = guidetext.replace(' HREF=', ' href=')
        guidetext = guidetext.replace(' TITLE=', ' title=')
        guidetext = guidetext.replace(' TYPE=', ' type=')
        # reference must be a self-closing tag
        # and any href must be replaced with filepos information
        ref_tag_pattern = re.compile(r'''(<reference [^>]*>)''', re.IGNORECASE)
        # split() with a capturing group puts the matched tags at odd indexes
        guidepieces = ref_tag_pattern.split(guidetext)
        for i in range(1, len(guidepieces), 2):
            reftag = guidepieces[i]
            # remove any href there now to replace with filepos
            reftag = re.sub(r'''href\s*=[^'"]*['"][^'"]*['"]''', '', reftag)
            # make sure the reference tag ends properly
            if not reftag.endswith("/>"):
                reftag = reftag[0:-1] + "/>"
            # always store the cleaned tag, so the old href is also dropped
            # from tags that were already self-closing
            guidepieces[i] = reftag
        guidetext = "".join(guidepieces)
        replacetext = r'''href="''' + fileinfo[0][2] + r'''#filepos\1"'''
        guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
        guidetext += '\n'
        if isinstance(guidetext, unicode):
            # already decoded: calling decode() here would first force an
            # implicit ASCII encode and fail on non-ASCII text
            guidetext = guidetext.encode("utf-8")
        else:
            guidetext = unicode(guidetext, mh.codec).encode("utf-8")

    if 'StartOffset' in metadata:
        # the last listed offset wins; an empty list adds no guide entry
        starting_offset = None
        for value in metadata['StartOffset']:
            if int(value) == 0xffffffff:
                value = '0'
            starting_offset = value
        if starting_offset is not None:
            # get guide items from metadata
            metaguidetext = '<reference type="text" href="' + fileinfo[0][2] + '#filepos' + starting_offset + '" />\n'
            guidetext += metaguidetext

    # create an OPF
    opf = OPFProcessor(files, metadata, fileinfo, imgnames, ncx.isNCX, mh, usedmap, pagemapxml, guidetext)
    opf.writeOPF()