Example #1
    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        '''
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        '''
        mode = -1
        byteArray = array.array("c", mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset]
            byte = ord(char)
            if byte >= 0x0a and byte <= 0x13:
                # Move cursor backwards
                offset = byte - 0x0a
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif byte > 0x13:
                if mode == -1:
                    print "Error: Unexpected first byte %i of inflection rule" % byte
                    return None
                elif position == -1:
                    print "Error: Unexpected first byte %i of inflection rule" % byte
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, char)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, char)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if deleted != char:
                            if DEBUG_DICT:
                                print "0x03: %s %s %s %s" % (
                                    mainEntry,
                                    toHex(inflectionRuleData[start:end]), char,
                                    deleted)
                            print "Error: Delete operation of inflection rule failed"
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if deleted != char:
                            if DEBUG_DICT:
                                print "0x03: %s %s %s %s" % (
                                    mainEntry,
                                    toHex(inflectionRuleData[start:end]), char,
                                    deleted)
                            print "Error: Delete operation of inflection rule failed"
                            return None
                    else:
                        print "Error: Inflection rule mode %x is not implemented" % mode
                        return None
            elif byte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = byte
            elif byte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = byte
            elif byte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = byte
            elif byte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = byte
            else:
                print "Error: Inflection rule mode %x is not implemented" % byte
                return None
        return byteArray.tostring()
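A minimal self-contained sketch (not part of the original source) of how the rule bytes drive the transformation above, covering only the 0x02 (insert at word end) and 0x03 (delete at word end) modes; the words and rule strings below are made-up test values:

def _demo_apply(word, rule):
    # Simplified re-implementation for illustration only: mirrors the
    # position handling of applyInflectionRule for modes 0x02 and 0x03.
    chars = list(word)
    mode = -1
    position = len(chars)
    for c in rule:
        byte = ord(c)
        if byte in (0x02, 0x03):
            if mode not in (0x02, 0x03):
                position = len(chars)
            mode = byte
        elif mode == 0x02:
            chars.insert(position, c)       # insert literal at the word end position
        elif mode == 0x03:
            position -= 1
            if chars.pop(position) != c:    # deleted char must match the literal
                return None
    return "".join(chars)

print _demo_apply("running", "\x03gnin")    # delete "ning" from the end -> "run"
print _demo_apply("run", "\x02s")           # append "s" -> "runs"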
Example #2
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    imgnames = []
    for mh in mhlst:

        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."

        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, 'wb').write(mh.header)

        # process each mobi header
        if mh.isEncrypted():
            raise unpackException('file is encrypted')

        # build up the metadata
        metadata = mh.getMetaData()
        metadata['Language'] = mh.Language()
        metadata['Title'] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata['Codec'] = [mh.codec]
        metadata['UniqueID'] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata

        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = '.rawml'
            if mh.isK8():
                outraw = os.path.join(files.k8dir,
                                      files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = '.rawpr'
                    outraw = os.path.join(files.outdir,
                                          files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir,
                                          files.getInputFileBasename() + ext)
            file(outraw, 'wb').write(rawML)

        # process additional sections that represent images, resources, fonts, etc.
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                    fname = "%05d" % (1 + i - beg)
                    fname = type + fname
                    if mh.isK8():
                        fname += "_K8"
                    fname += '.dat'
                    outname = os.path.join(files.outdir, fname)
                    file(outname, 'wb').write(data)
                    print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print "    Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, 'wb').write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3:  'FONT'
                # bytes  4 -  7:  uncompressed size
                # bytes  8 - 11:  flags
                #                     bit 0x0001 - zlib compression
                #                     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15:  offset to start of compressed font data
                # bytes 16 - 19:  length of xor string stored before the start of the compressed font data
                # bytes 20 - 23:  start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(
                    '>LLLLL', data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start:xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from('>I', font_data,
                                                      len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits,
                                                    usize)
                        if len(font_data) != usize:
                            print 'Font Decompression Error: Uncompressed font size mismatch'
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = (zlib.adler32(font_data) & 0xffffffff)
                            if sig != adler32:
                                print 'Font Decompression Error'
                                print 'Adler checksum did not match. Stored: %d Calculated: %d' % (
                                    adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                hdr = font_data[0:4]
                if hdr == '\0\1\0\0' or hdr == 'true' or hdr == 'ttcf':
                    ext = '.ttf'
                elif hdr == 'OTTO':
                    ext = '.otf'
                else:
                    print "Warning: unknown font header %s" % hdr.encode('hex')
                    ext = '.dat'
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print "    extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, 'wb').write(font_data)
                imgnames.append(fontname)
                continue

            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print "    extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, 'wb').write(data)
                imgnames.append(None)
                continue

            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue

            # if we reach here it should be an image, but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, 'wb').write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print "    extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, 'wb').write(data)
                imgnames.append(imgname)

        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print 'Error processing Print Replica: ' + str(e)
            filenames.append(['', files.getInputFileBasename() + '.pdf'])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = 'used'
            opf = OPFProcessor(files, metadata, filenames, imgnames, False, mh,
                               usedmap)
            opf.writeOPF()
            continue

        if mh.isK8():
            # K8 mobi
            # requires other indexes, which contain parsing information and the FDST info,
            # to process the rawml back into the xhtml files, css files, svg image files, etc.
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)

            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if 'StartOffset' in metadata.keys():
                starts = metadata['StartOffset']
                last_start = starts.pop()
                if int(last_start) == 0xffffffff:
                    last_start = '0'
                filename, partnum, beg, end = k8proc.getFileInfo(
                    int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != '':
                    linktgt += '#' + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt

            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()

            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4,
                 off] = ncxmap['pos_fid'].split(':')
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap['filename'] = filename
                ncxmap['idtag'] = idtag
                ncx_data[i] = ncxmap

            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)

            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()

            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end,
                 aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, 'wb').write(part)
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == 'file':
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, 'wb').write(flowpart)

            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX,
                               mh, usedmap, guidetext)

            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()

            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)

        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)

            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata['DictInLanguage'] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata['DictOutLanguage'] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()

            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []

            # write the proper mobi html
            fname = files.getInputFileBasename() + '.html'
            filenames.append(['', fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, 'wb').write(srctext)

            # create an OPF
            # extract guidetext from srctext
            guidetext = ''
            guidematch = re.search(r'''<guide>(.*)</guide>''', srctext,
                                   re.IGNORECASE + re.DOTALL)
            if guidematch:
                replacetext = r'''href="''' + filenames[0][
                    1] + r'''#filepos\1"'''
                guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''',
                                   replacetext, guidematch.group(1))
                guidetext += '\n'
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX,
                               mh, usedmap, guidetext)
            opf.writeOPF()
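As a worked illustration of the FONT record layout documented in the loop above, the following self-contained toy (not from the original source) builds a synthetic obfuscated record with made-up payload and xor-key values, then parses and de-obfuscates it the same way the FONT branch does:

import struct

payload = "OTTO" + "fake font bytes " * 8             # made-up font data
xor_key = "\x55\xaa\x33"                               # made-up xor string
xor_start = 4 + 20                                     # right after 'FONT' + five longs
dstart = xor_start + len(xor_key)                      # font data follows the xor string
obf = bytearray(payload)
for n in xrange(min(len(obf), 1040)):                  # obfuscate the first 1040 bytes
    obf[n] ^= ord(xor_key[n % len(xor_key)])
record = ("FONT"
          + struct.pack(">LLLLL", len(payload), 0x0002, dstart, len(xor_key), xor_start)
          + xor_key + str(obf))

# parse it back exactly as the FONT branch above does
usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(">LLLLL", record, 4)
font_data = record[dstart:]
if fflags & 0x0002:
    key = bytearray(record[xor_start:xor_start + xor_len])
    buf = bytearray(font_data)
    for n in xrange(min(len(buf), 1040)):
        buf[n] ^= key[n % xor_len]
    font_data = bytes(buf)
print font_data == payload, len(font_data) == usize    # -> True True
print "font header:", font_data[0:4]                   # -> 'OTTO'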
Example #3
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    imgnames = []
    for mh in mhlst:

        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."

        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, "wb").write(mh.header)

        # process each mobi header
        if mh.isEncrypted():
            raise unpackException("file is encrypted")

        # build up the metadata
        metadata = mh.getMetaData()
        metadata["Language"] = mh.Language()
        metadata["Title"] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata["Codec"] = [mh.codec]
        metadata["UniqueID"] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata

        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = ".rawml"
            if mh.isK8():
                outraw = os.path.join(files.k8dir, files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = ".rawpr"
                    outraw = os.path.join(files.outdir, files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + ext)
            file(outraw, "wb").write(rawML)

        # process additional sections that represent images, resources, fonts, etc.
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                    fname = "%05d" % (1 + i - beg)
                    fname = type + fname
                    if mh.isK8():
                        fname += "_K8"
                    fname += ".dat"
                    outname = os.path.join(files.outdir, fname)
                    file(outname, "wb").write(data)
                    print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print "    Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, "wb").write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3:  'FONT'
                # bytes  4 -  7:  uncompressed size
                # bytes  8 - 11:  flags
                #                     bit 0x0001 - zlib compression
                #                     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15:  offset to start of compressed font data
                # bytes 16 - 19:  length of xor string stored before the start of the compressed font data
                # bytes 20 - 23:  start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(">LLLLL", data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start : xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from(">I", font_data, len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
                        if len(font_data) != usize:
                            print "Font Decompression Error: Uncompressed font size mismatch"
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = zlib.adler32(font_data) & 0xFFFFFFFF
                            if sig != adler32:
                                print "Font Decompression Error"
                                print "Adler checksum did not match. Stored: %d Calculated: %d" % (adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                hdr = font_data[0:4]
                if hdr == "\0\1\0\0" or hdr == "true" or hdr == "ttcf":
                    ext = ".ttf"
                elif hdr == "OTTO":
                    ext = ".otf"
                else:
                    print "Warning: unknown font header %s" % hdr.encode("hex")
                    ext = ".dat"
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                if (ext == ".ttf" or ext == ".otf") and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print "    extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, "wb").write(font_data)
                imgnames.append(fontname)
                continue

            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print "    extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, "wb").write(data)
                imgnames.append(None)
                continue

            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue

            # if we reach here it should be an image, but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, "wb").write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print "    extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, "wb").write(data)
                imgnames.append(imgname)

        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print "Error processing Print Replica: " + str(e)
            filenames.append(["", files.getInputFileBasename() + ".pdf"])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = "used"
            opf = OPFProcessor(files, metadata, filenames, imgnames, False, mh, usedmap)
            opf.writeOPF()
            continue

        if mh.isK8():
            # K8 mobi
            # requires other indexes, which contain parsing information and the FDST info,
            # to process the rawml back into the xhtml files, css files, svg image files, etc.
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)

            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if "StartOffset" in metadata.keys():
                starts = metadata["StartOffset"]
                last_start = starts.pop()
                if int(last_start) == 0xFFFFFFFF:
                    last_start = "0"
                filename, partnum, beg, end = k8proc.getFileInfo(int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != "":
                    linktgt += "#" + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt

            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()

            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4, off] = ncxmap["pos_fid"].split(":")
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap["filename"] = filename
                ncxmap["idtag"] = idtag
                ncx_data[i] = ncxmap

            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)

            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()

            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, "wb").write(part)
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == "file":
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, "wb").write(flowpart)

            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)

            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()

            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)

        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)

            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata["DictInLanguage"] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata["DictOutLanguage"] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()

            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []

            # write the proper mobi html
            fname = files.getInputFileBasename() + ".html"
            filenames.append(["", fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, "wb").write(srctext)

            # create an OPF
            # extract guidetext from srctext
            guidetext = ""
            guidematch = re.search(r"""<guide>(.*)</guide>""", srctext, re.IGNORECASE + re.DOTALL)
            if guidematch:
                replacetext = r'''href="''' + filenames[0][1] + r'''#filepos\1"'''
                guidetext = re.sub(r"""filepos=['"]{0,1}0*(\d+)['"]{0,1}""", replacetext, guidematch.group(1))
                guidetext += "\n"
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)
            opf.writeOPF()
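The guide handling at the end of the non-K8 branch above rewrites Mobi filepos attributes into href anchors pointing at the generated html file. A small standalone illustration (not from the original source), using a made-up guide snippet and a hypothetical output file name:

import re

srctext = '<guide><reference type="toc" title="Table of Contents" filepos=0000012345 /></guide>'
htmlname = "book.html"                                 # stand-in for files.getInputFileBasename() + ".html"
guidematch = re.search(r"<guide>(.*)</guide>", srctext, re.IGNORECASE + re.DOTALL)
if guidematch:
    replacetext = r'href="' + htmlname + r'#filepos\1"'
    guidetext = re.sub(r"""filepos=['"]{0,1}0*(\d+)['"]{0,1}""", replacetext, guidematch.group(1))
    print guidetext
    # -> <reference type="toc" title="Table of Contents" href="book.html#filepos12345" />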
Example #4
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    tags = []
    tagHashMap = {}
    controlByteIndex = 0
    dataStart = startPos + controlByteCount

    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            controlByteIndex += 1
            continue
        cbyte = ord(entryData[startPos + controlByteIndex])
        if 0:
            print "Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte)

        value = ord(entryData[startPos + controlByteIndex]) & mask
        if value != 0:
            if value == mask:
                if countSetBits(mask) > 1:
                    # If all masked bits are set and the mask has more than one bit, a variable-width value
                    # follows the control bytes; it gives the length in bytes (NOT the value count!)
                    # of the data holding the corresponding variable-width values.
                    consumed, value = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    tags.append((tag, None, value, valuesPerEntry))
                else:
                    tags.append((tag, 1, None, valuesPerEntry))
            else:
                # Shift bits to get the masked value.
                while mask & 0x01 == 0:
                    mask = mask >> 1
                    value = value >> 1
                tags.append((tag, value, None, valuesPerEntry))
    for tag, valueCount, valueBytes, valuesPerEntry in tags:
        values = []
        if valueCount != None:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    consumed, data = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    values.append(data)
        else:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(data)
            if totalConsumed != valueBytes:
                print "Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed)
        tagHashMap[tag] = values
    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        for char in entryData[dataStart:endPos]:
            if char != chr(0x00):
                print "Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos])
                if 0:
                    print "controlByteCount: %s" % controlByteCount
                    print "tagTable: %s" % tagTable
                    print "data: %s" % toHex(entryData[startPos:endPos])
                    print "tagHashMap: %s" % tagHashMap
                break

    return tagHashMap
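getTagMap depends on two helpers that are not shown in this snippet, countSetBits and getVariableWidthValue. The stand-ins below assume the usual MOBI index encoding (7 data bits per byte, with the 0x80 bit set on the terminating byte); that encoding is an assumption about the real helpers, not taken from this source. With them defined, a toy entry with a single control byte decodes like this:

def countSetBits(value, bits=8):
    # stand-in helper (assumed behaviour): count the set bits in the mask
    count = 0
    for _ in range(bits):
        count += value & 0x01
        value >>= 1
    return count

def getVariableWidthValue(data, offset):
    # stand-in helper (assumed encoding): 7 data bits per byte, high bit ends the value
    value = 0
    consumed = 0
    while True:
        v = ord(data[offset + consumed])
        consumed += 1
        value = (value << 7) | (v & 0x7f)
        if v & 0x80:
            break
    return consumed, value

# tag 1 uses mask 0x01 of the single control byte; the (0, 0, 0, 1) row ends the table
tagTable = [(1, 1, 0x01, 0), (0, 0, 0, 1)]
entryData = "\x01\x85"        # control byte 0x01, then one variable width value (5)
print getTagMap(1, tagTable, entryData, 0, len(entryData))   # -> {1: [5]}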
Example #5
    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        '''
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        '''
        mode = -1
        byteArray = array.array("c", mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset]
            byte = ord(char)
            if byte >= 0x0a and byte <= 0x13:
                # Move cursor backwards
                offset = byte - 0x0a
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif byte > 0x13:
                if mode == -1:
                    print "Error: Unexpected first byte %i of inflection rule" % byte
                    return None
                elif position == -1:
                    print "Error: Unexpected first byte %i of inflection rule" % byte
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, char)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, char)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if deleted != char:
                            if DEBUG_DICT:
                                print "0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, deleted)
                            print "Error: Delete operation of inflection rule failed"
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if deleted != char:
                            if DEBUG_DICT:
                                print "0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, deleted)
                            print "Error: Delete operation of inflection rule failed"
                            return None
                    else:
                        print "Error: Inflection rule mode %x is not implemented" % mode
                        return None
            elif byte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = byte
            elif byte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = byte
            elif byte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = byte
            elif byte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = byte
            else:
                print "Error: Inflection rule mode %x is not implemented" % byte
                return None
        return byteArray.tostring()
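A hedged usage note (not from the original source): assuming ds is an instance of the enclosing dictionary-support class, a rule that chains the delete-at-end, insert-at-end, and insert-at-start modes could be applied as below; the German pair machen/gemacht is only an illustrative value:

# delete "en" from the word end (0x03 takes its literals in reverse order),
# append "t" (0x02), then prefix "ge" (0x01); ds is a hypothetical instance
rule = "\x03" + "ne" + "\x02" + "t" + "\x01" + "ge"
print ds.applyInflectionRule("machen", rule, 0, len(rule))   # -> "gemacht"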
Example #6
    def getTagMap(self, controlByteCount, tagTable, entryData, startPos,
                  endPos):
        '''
        Create a map of tags and values from the given byte section.

        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param entryData: The data to process.
        @param startPos: The starting position in entryData.
        @param endPos: The end position in entryData or None if it is unknown.
        @return: Hashmap of tag and list of values.
        '''
        tags = []
        tagHashMap = {}
        controlByteIndex = 0
        dataStart = startPos + controlByteCount

        for tag, valuesPerEntry, mask, endFlag in tagTable:
            if endFlag == 0x01:
                controlByteIndex += 1
                continue

            value = ord(entryData[startPos + controlByteIndex]) & mask

            if value != 0:
                if value == mask:
                    if self.countSetBits(mask) > 1:
                        # If all masked bits are set and the mask has more than one bit, a variable-width value
                        # follows the control bytes; it gives the length in bytes (NOT the value count!)
                        # of the data holding the corresponding variable-width values.
                        consumed, value = getVariableWidthValue(
                            entryData, dataStart)
                        dataStart += consumed
                        tags.append((tag, None, value, valuesPerEntry))
                    else:
                        tags.append((tag, 1, None, valuesPerEntry))
                else:
                    # Shift bits to get the masked value.
                    while mask & 0x01 == 0:
                        mask = mask >> 1
                        value = value >> 1
                    tags.append((tag, value, None, valuesPerEntry))

        for tag, valueCount, valueBytes, valuesPerEntry in tags:
            values = []
            if valueCount != None:
                # Read valueCount * valuesPerEntry variable width values.
                for _ in range(valueCount):
                    for _ in range(valuesPerEntry):
                        consumed, data = getVariableWidthValue(
                            entryData, dataStart)
                        dataStart += consumed
                        values.append(data)
            else:
                # Convert valueBytes to variable width values.
                totalConsumed = 0
                while totalConsumed < valueBytes:
                    # Does this work for valuesPerEntry != 1?
                    consumed, data = getVariableWidthValue(
                        entryData, dataStart)
                    dataStart += consumed
                    totalConsumed += consumed
                    values.append(data)
                if totalConsumed != valueBytes:
                    print "Error: Should consume %s bytes, but consumed %s" % (
                        valueBytes, totalConsumed)
            tagHashMap[tag] = values

        # Test that all bytes have been processed if endPos is given.
        if endPos is not None and dataStart != endPos:
            # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
            for char in entryData[dataStart:endPos]:
                if char != chr(0x00):
                    print "Warning: There are unprocessed index bytes left: %s" % toHex(
                        entryData[dataStart:endPos])
                    if DEBUG_DICT:
                        print "controlByteCount: %s" % controlByteCount
                        print "tagTable: %s" % tagTable
                        print "data: %s" % toHex(entryData[startPos:endPos])
                        print "tagHashMap: %s" % tagHashMap
                    break

        return tagHashMap