Beispiel #1
0
 def __init__(self, mh, files):
     """Store the header and file objects and create the index reader.

     mh    -- header object; its ``sect`` and ``ncxidx`` attributes are
              copied onto this instance
     files -- stored unchanged as ``self.files``
     """
     # plain references handed in by the caller
     self.mh = mh
     self.files = files
     self.sect = mh.sect
     self.isNCX = False
     # the index reader works on the section object taken from the header
     self.mi = MobiIndex(self.sect)
     # filled in later; nothing has been read yet
     self.indx_data = None
     self.ncxidx = mh.ncxidx
Beispiel #2
0
    def __init__(self, mh, sect, debug=False):
        """Load the FDST section map and the skeleton, div and guide tables.

        mh    -- header object supplying the skelidx, dividx, othidx and
                 fdst section numbers (0xffffffff means "not present")
        sect  -- section object; handed to MobiIndex and used to load the
                 FDST section
        debug -- when true, every table is printed as soon as it is built

        The results are stored in self.fdsttbl, self.skeltbl, self.divtbl
        and self.othtbl.
        """
        self.sect = sect
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.dividx = mh.dividx
        self.othidx = mh.othidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.fdsttbl = [0, 0xffffffff]
        self.DEBUG = debug

        # NOTE: print is always given one parenthesised argument; that
        # behaves exactly like the print statement on Python 2 and also
        # parses on Python 3.

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                num_sections, = struct.unpack_from('>L', header, 0x08)
                # two 32-bit values are stored per section starting at
                # offset 0x0c; only the first value of every pair is kept
                starts = struct.unpack_from('>%dL' % (num_sections * 2), header, 0x0c)[::2]
                # close the table with the same 0xffffffff marker the
                # default table above uses (this used to be 0xfffffff,
                # one 'f' short, so the end marker was never recognised)
                self.fdsttbl = starts + (0xffffffff, )
            else:
                print("Error: K8 Mobi with Missing FDST info")
        if self.DEBUG:
            print("\nFDST Section Map:  %d entries" % len(self.fdsttbl))
            for j, offset in enumerate(self.fdsttbl):
                print("  %d - %0x" % (j, offset))

        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx)
            for fileptr, (text, tagMap) in enumerate(outtbl):
                # file number, skeleton name, divtbl record count, start position, length
                skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
        self.skeltbl = skeltbl
        if self.DEBUG:
            print("\nSkel Table:  %d entries" % len(self.skeltbl))
            print("table: filenum, skeleton name, div tbl record count, start position, length")
            for entry in self.skeltbl:
                print(entry)

        # read/process the div index to create the <div> (and <p>) table
        divtbl = []
        if self.dividx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.dividx)
            for text, tagMap in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                divtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
        self.divtbl = divtbl
        if self.DEBUG:
            print("\nDiv (Fragment) Table: %d entries" % len(self.divtbl))
            print("table: file position, link id text, file num, sequence number, start position, length")
            for entry in self.divtbl:
                print(entry)

        # read / process other index <guide> element of opf
        othtbl = []
        if self.othidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.othidx)
            for text, tagMap in outtbl:
                # ref_type, ref_title, div/frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                # stays None when the entry carries neither tag 3 nor tag 6;
                # tag 6 wins when both are present
                fileno = None
                if 3 in tagMap:
                    fileno = tagMap[3][0]
                if 6 in tagMap:
                    fileno = tagMap[6][0]
                othtbl.append([ref_type, ref_title, fileno])
        self.othtbl = othtbl
        if self.DEBUG:
            print("\nOther (Guide) Table: %d entries" % len(self.othtbl))
            print("table: ref_type, ref_title, divtbl entry number")
            for entry in self.othtbl:
                print(entry)
Beispiel #3
0
class K8Processor:
    """Rebuild the xhtml parts and auxiliary flows of a K8 Mobi rawML stream.

    __init__ reads the FDST section map and the skeleton, div (fragment)
    and other (guide) index tables; buildParts() then uses those tables to
    cut a rawML string into flows and to reassemble the xhtml parts.

    NOTE: every print below is given one parenthesised argument; that
    behaves exactly like the print statement on Python 2 and also parses
    on Python 3.
    """

    def __init__(self, mh, sect, debug=False):
        """Load the FDST section map and the skeleton, div and guide tables.

        mh    -- header object supplying the skelidx, dividx, othidx and
                 fdst section numbers (0xffffffff means "not present")
        sect  -- section object; handed to MobiIndex and used to load the
                 FDST section
        debug -- when true, every table is printed as soon as it is built
        """
        self.sect = sect
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.dividx = mh.dividx
        self.othidx = mh.othidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.fdsttbl = [0, 0xffffffff]
        self.DEBUG = debug

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                num_sections, = struct.unpack_from('>L', header, 0x08)
                # two 32-bit values are stored per section starting at
                # offset 0x0c; only the first value of every pair is kept
                # (buildParts uses them as the flow start offsets)
                starts = struct.unpack_from('>%dL' % (num_sections * 2), header, 0x0c)[::2]
                # close the table with the 0xffffffff marker buildParts()
                # tests for (this used to be 0xfffffff, one 'f' short, so
                # the end-of-rawML branch there could never trigger)
                self.fdsttbl = starts + (0xffffffff, )
            else:
                print("Error: K8 Mobi with Missing FDST info")
        if self.DEBUG:
            print("\nFDST Section Map:  %d entries" % len(self.fdsttbl))
            for j, offset in enumerate(self.fdsttbl):
                print("  %d - %0x" % (j, offset))

        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx)
            for fileptr, (text, tagMap) in enumerate(outtbl):
                # file number, skeleton name, divtbl record count, start position, length
                skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
        self.skeltbl = skeltbl
        if self.DEBUG:
            print("\nSkel Table:  %d entries" % len(self.skeltbl))
            print("table: filenum, skeleton name, div tbl record count, start position, length")
            for entry in self.skeltbl:
                print(entry)

        # read/process the div index to create the <div> (and <p>) table
        divtbl = []
        if self.dividx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.dividx)
            for text, tagMap in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                divtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
        self.divtbl = divtbl
        if self.DEBUG:
            print("\nDiv (Fragment) Table: %d entries" % len(self.divtbl))
            print("table: file position, link id text, file num, sequence number, start position, length")
            for entry in self.divtbl:
                print(entry)

        # read / process other index <guide> element of opf
        othtbl = []
        if self.othidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.othidx)
            for text, tagMap in outtbl:
                # ref_type, ref_title, div/frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                # stays None when the entry carries neither tag 3 nor tag 6;
                # tag 6 wins when both are present
                fileno = None
                if 3 in tagMap:
                    fileno = tagMap[3][0]
                if 6 in tagMap:
                    fileno = tagMap[6][0]
                othtbl.append([ref_type, ref_title, fileno])
        self.othtbl = othtbl
        if self.DEBUG:
            print("\nOther (Guide) Table: %d entries" % len(self.othtbl))
            print("table: ref_type, ref_title, divtbl entry number")
            for entry in self.othtbl:
                print(entry)

    def buildParts(self, rawML):
        """Cut rawML into flows and rebuild the xhtml parts from flow 0.

        Fills self.flows / self.flowinfo (one entry per FDST piece, entry 0
        being a placeholder for the main text) and self.parts /
        self.partinfo (one entry per skeleton).  All four lists are rebuilt
        from scratch on every call.  Returns None.
        """
        # now split the rawML into its flow pieces
        self.flows = []
        for j in range(len(self.fdsttbl) - 1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j + 1]
            if end == 0xffffffff:
                # the end marker stands for "up to the end of rawML"
                end = len(rawML)
                if self.DEBUG:
                    print("splitting rawml starting at %d and ending at %d into flow piece %d" % (start, end, j))
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and <div> tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print("\nRebuilding flow piece 0: the main body of the ebook")
        self.parts = []
        self.partinfo = []
        divptr = 0
        for skelnum, skelname, divcnt, skelpos, skellen in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            # Defaults for a skeleton that owns no div records; without
            # them the names below were unbound (first skeleton) or silently
            # reused from the previous skeleton.
            # NOTE(review): the placeholder values are a choice made here,
            # the skeleton number is labelled "filenum" in the skel table -
            # confirm they suit the callers of partinfo.
            filename = 'part%04d.xhtml' % skelnum
            aidtext = '0'
            for i in range(divcnt):
                insertpos, idtext, filenum, seqnum, startpos, length = self.divtbl[divptr]
                if self.DEBUG:
                    print("    moving div/frag %d starting at %d of length %d" % (divptr, startpos, length))
                    print("        inside of skeleton number %d at postion %d" % (skelnum, insertpos))
                if i == 0:
                    # the first div record names the part
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.xhtml' % filenum
                # the div text sits directly behind what has been consumed so far
                fragment = text[baseptr:baseptr + length]
                # insert positions are file positions; make them skeleton relative
                insertpos = insertpos - skelpos
                skeleton = skeleton[:insertpos] + fragment + skeleton[insertpos:]
                baseptr += length
                divptr += 1
            self.parts.append(skeleton)
            self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those raster
        # images are in manifest but not in xhtml text - since they are only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell

        # start from a fresh list so a repeated call cannot leave flowinfo
        # longer than flows; entry 0 is the placeholder for the main text
        self.flowinfo = [[None, None, None, None]]
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in range(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            svg_match = svg_tag_pattern.search(flowpart)
            if svg_match is not None:
                # svg
                ftype = 'svg'
                if image_tag_pattern.search(flowpart) is not None:
                    fformat = 'inline'
                    fdir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[svg_match.start():]
                else:
                    fformat = 'file'
                    fdir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            elif flowpart.find('[CDATA[') >= 0:
                # CDATA found: inline it wrapped in a <style> element
                ftype = 'css'
                flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                fformat = 'inline'
                fdir = None
                fname = None
            else:
                # css - assume as standalone css file
                ftype = 'css'
                fformat = 'file'
                fdir = "Styles"
                fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([ftype, fformat, fdir, fname])

        if self.DEBUG:
            print("\nFlow Map:  %d entries" % len(self.flowinfo))
            for fi in self.flowinfo:
                print(fi)
            print("\n")

            print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
            for pi in self.partinfo:
                print(pi)

    # get information about the part (file) that exists at pos in original rawML
    def getFileInfo(self, pos):
        """Return (filename, partnum, start, end) of the part containing pos.

        A part contains pos when start <= pos < end.  Four Nones are
        returned when no entry of self.partinfo matches.
        """
        for partnum, _dir, filename, start, end, _aidtext in self.partinfo:
            if start <= pos < end:
                return filename, partnum, start, end
        return None, None, None, None

    # accessor functions to properly protect the internal structure
    def getNumberOfParts(self):
        """Return the number of entries in self.parts."""
        return len(self.parts)

    def getPart(self, i):
        """Return part i, or None when i is out of range."""
        if 0 <= i < len(self.parts):
            return self.parts[i]
        return None

    def getPartInfo(self, i):
        """Return the partinfo entry i, or None when i is out of range."""
        if 0 <= i < len(self.partinfo):
            return self.partinfo[i]
        return None

    def getNumberOfFlows(self):
        """Return the number of entries in self.flows."""
        return len(self.flows)

    def getFlow(self, i):
        """Return flow i (i >= 1), or None for 0 and out-of-range values."""
        # note flows[0] is empty - it was all of the original text
        if 0 < i < len(self.flows):
            return self.flows[i]
        return None

    def getFlowInfo(self, i):
        """Return the flowinfo entry i (i >= 1), or None otherwise."""
        # note flowinfo[0] is empty - it was all of the original text
        if 0 < i < len(self.flowinfo):
            return self.flowinfo[i]
        return None

    def getIDTagByPosFid(self, posfid, offset):
        """Map a kindle:pos:fid reference to (filename, id text).

        posfid and offset are passed through fromBase32; posfid selects a
        row of self.divtbl and offset is added to that row's insert
        position.
        """
        # first convert kindle:pos:fid and offset info to position in file
        row = fromBase32(posfid)
        off = fromBase32(offset)
        insertpos = self.divtbl[row][0]
        pos = insertpos + off
        fname = self.getFileInfo(pos)[0]
        # an existing "id=" must exist in original xhtml otherwise it would not have worked for linking.
        # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
        # some position information encoded into Base32 name.

        # so find the closest "id=" before position in the file by actually searching in that file
        idtext = self.getIDTag(pos)
        return fname, idtext

    def getIDTag(self, pos):
        """Return the value of the last id= attribute at or before pos.

        The part containing pos is searched up to pos (extended to the end
        of the tag when pos falls inside one).  '' is returned when no id
        is found or when pos does not fall inside any part.
        """
        # find the correct tag by actually searching in the destination textblock at position
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if pn is None:
            # pos is outside every part: there is no text to search
            # (this used to fail with a TypeError on self.parts[None])
            return ''
        textblock = self.parts[pn]
        npos = pos - skelpos
        pgt = textblock.find('>', npos)
        plt = textblock.find('<', npos)
        # if npos inside a tag then search all text before its end of tag marker
        # else not in a tag need to search the preceding tag
        # find() returns -1 for "not found", which must not be compared as
        # if it were a position: npos is inside a tag only when a '>'
        # follows and no '<' comes before that '>'
        if pgt != -1 and (plt == npos or plt == -1 or pgt < plt):
            npos = pgt + 1
        textblock = textblock[0:npos]
        # find id links only inside of tags
        #    inside any < > pair find all "id=' and return whatever is inside the quotes
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        id_pattern = re.compile(r'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
        # the text was cut at npos, so every match starts before npos and
        # the wanted id is simply the last one found
        last_id = None
        for m in id_pattern.finditer(textblock):
            last_id = m.group(1)
        if last_id is None:
            if self.DEBUG:
                print("Found no id in the textblock, link must be to top of file")
            return ''
        return last_id

    # do we need to do deep copying
    def setParts(self, parts):
        """Overwrite the stored parts one by one; lengths must match."""
        assert(len(parts) == len(self.parts))
        for i, part in enumerate(parts):
            self.parts[i] = part

    # do we need to do deep copying
    def setFlows(self, flows):
        """Overwrite the stored flows one by one; lengths must match."""
        assert(len(flows) == len(self.flows))
        for i, flow in enumerate(flows):
            self.flows[i] = flow

    # get information about the part (file) that exists at pos in original rawML
    def getSkelInfo(self, pos):
        """Return a copy of the partinfo entry whose range contains pos.

        The result is [partnum, dir, filename, start, end, aidtext]; a list
        of six Nones is returned when no entry matches.
        """
        for partnum, dirname, filename, start, end, aidtext in self.partinfo:
            if start <= pos < end:
                return [partnum, dirname, filename, start, end, aidtext]
        return [None, None, None, None, None, None]

    # fileno is actually a reference into divtbl (a fragment)
    def getGuideText(self):
        """Return the <reference .../> lines for the opf guide, utf-8 encoded.

        One line is produced per entry of self.othtbl.  Entries that cannot
        be resolved to a part are skipped instead of raising.
        """
        lines = []
        for ref_type, ref_title, fileno in self.othtbl:
            if fileno is None:
                # __init__ stores None when the entry carried neither
                # tag 3 nor tag 6; there is no fragment to link to
                if self.DEBUG:
                    print("Skipping guide entry without a fragment reference")
                continue
            pos = self.divtbl[fileno][0]
            pn, dirname, filename, skelpos, skelend, aidtext = self.getSkelInfo(pos)
            if filename is None:
                # the position is not inside any rebuilt part
                if self.DEBUG:
                    print("Skipping guide entry that points outside of all parts")
                continue
            idtext = self.getIDTag(pos)
            linktgt = filename
            if idtext != '':
                linktgt += '#' + idtext
            lines.append('<reference type="%s" title="%s" href="%s/%s" />\n' % (ref_type, ref_title, dirname, linktgt))
        guidetext = ''.join(lines)
        # opf is encoded utf-8 so must convert any titles properly
        guidetext = unicode(guidetext, self.mh.codec).encode("utf-8")
        return guidetext
Beispiel #4
0
    def __init__(self, mh, sect, debug=False):
        """Load the FDST section map and the skeleton, div and guide tables.

        mh    -- header object supplying the skelidx, dividx, othidx and
                 fdst section numbers (0xffffffff means "not present")
        sect  -- section object; handed to MobiIndex and used to load the
                 FDST section
        debug -- when true, every table is printed as soon as it is built

        The results are stored in self.fdsttbl, self.skeltbl, self.divtbl
        and self.othtbl.
        """
        self.sect = sect
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.dividx = mh.dividx
        self.othidx = mh.othidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.fdsttbl = [0, 0xffffffff]
        self.DEBUG = debug

        # NOTE: print is always given one parenthesised argument; that
        # behaves exactly like the print statement on Python 2 and also
        # parses on Python 3.

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                num_sections, = struct.unpack_from('>L', header, 0x08)
                # two 32-bit values are stored per section starting at
                # offset 0x0c; only the first value of every pair is kept
                starts = struct.unpack_from('>%dL' % (num_sections * 2),
                                            header, 0x0c)[::2]
                # close the table with the same 0xffffffff marker the
                # default table above uses (this used to be 0xfffffff,
                # one 'f' short, so the end marker was never recognised)
                self.fdsttbl = starts + (0xffffffff, )
            else:
                print("Error: K8 Mobi with Missing FDST info")
        if self.DEBUG:
            print("\nFDST Section Map:  %d entries" % len(self.fdsttbl))
            for j, offset in enumerate(self.fdsttbl):
                print("  %d - %0x" % (j, offset))

        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx)
            for fileptr, (text, tagMap) in enumerate(outtbl):
                # file number, skeleton name, divtbl record count, start position, length
                skeltbl.append(
                    [fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
        self.skeltbl = skeltbl
        if self.DEBUG:
            print("\nSkel Table:  %d entries" % len(self.skeltbl))
            print("table: filenum, skeleton name, div tbl record count, start position, length")
            for entry in self.skeltbl:
                print(entry)

        # read/process the div index to create the <div> (and <p>) table
        divtbl = []
        if self.dividx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.dividx)
            for text, tagMap in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                divtbl.append([
                    int(text), ctocdata, tagMap[3][0], tagMap[4][0],
                    tagMap[6][0], tagMap[6][1]
                ])
        self.divtbl = divtbl
        if self.DEBUG:
            print("\nDiv (Fragment) Table: %d entries" % len(self.divtbl))
            print("table: file position, link id text, file num, sequence number, start position, length")
            for entry in self.divtbl:
                print(entry)

        # read / process other index <guide> element of opf
        othtbl = []
        if self.othidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.othidx)
            for text, tagMap in outtbl:
                # ref_type, ref_title, div/frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                # stays None when the entry carries neither tag 3 nor tag 6;
                # tag 6 wins when both are present
                fileno = None
                if 3 in tagMap:
                    fileno = tagMap[3][0]
                if 6 in tagMap:
                    fileno = tagMap[6][0]
                othtbl.append([ref_type, ref_title, fileno])
        self.othtbl = othtbl
        if self.DEBUG:
            print("\nOther (Guide) Table: %d entries" % len(self.othtbl))
            print("table: ref_type, ref_title, divtbl entry number")
            for entry in self.othtbl:
                print(entry)
Beispiel #5
0
class K8Processor:
    def __init__(self, mh, sect, debug=False):
        """Load the FDST section map and the skeleton, div and guide tables.

        mh    -- header object supplying the skelidx, dividx, othidx and
                 fdst section numbers (0xffffffff means "not present")
        sect  -- section object; handed to MobiIndex and used to load the
                 FDST section
        debug -- when true, every table is printed as soon as it is built

        The results are stored in self.fdsttbl, self.skeltbl, self.divtbl
        and self.othtbl.
        """
        self.sect = sect
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.dividx = mh.dividx
        self.othidx = mh.othidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.fdsttbl = [0, 0xffffffff]
        self.DEBUG = debug

        # NOTE: print is always given one parenthesised argument; that
        # behaves exactly like the print statement on Python 2 and also
        # parses on Python 3.

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                num_sections, = struct.unpack_from('>L', header, 0x08)
                # two 32-bit values are stored per section starting at
                # offset 0x0c; only the first value of every pair is kept
                starts = struct.unpack_from('>%dL' % (num_sections * 2),
                                            header, 0x0c)[::2]
                # close the table with the same 0xffffffff marker the
                # default table above uses (this used to be 0xfffffff,
                # one 'f' short, so the end marker was never recognised)
                self.fdsttbl = starts + (0xffffffff, )
            else:
                print("Error: K8 Mobi with Missing FDST info")
        if self.DEBUG:
            print("\nFDST Section Map:  %d entries" % len(self.fdsttbl))
            for j, offset in enumerate(self.fdsttbl):
                print("  %d - %0x" % (j, offset))

        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx)
            for fileptr, (text, tagMap) in enumerate(outtbl):
                # file number, skeleton name, divtbl record count, start position, length
                skeltbl.append(
                    [fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
        self.skeltbl = skeltbl
        if self.DEBUG:
            print("\nSkel Table:  %d entries" % len(self.skeltbl))
            print("table: filenum, skeleton name, div tbl record count, start position, length")
            for entry in self.skeltbl:
                print(entry)

        # read/process the div index to create the <div> (and <p>) table
        divtbl = []
        if self.dividx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.dividx)
            for text, tagMap in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                divtbl.append([
                    int(text), ctocdata, tagMap[3][0], tagMap[4][0],
                    tagMap[6][0], tagMap[6][1]
                ])
        self.divtbl = divtbl
        if self.DEBUG:
            print("\nDiv (Fragment) Table: %d entries" % len(self.divtbl))
            print("table: file position, link id text, file num, sequence number, start position, length")
            for entry in self.divtbl:
                print(entry)

        # read / process other index <guide> element of opf
        othtbl = []
        if self.othidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.othidx)
            for text, tagMap in outtbl:
                # ref_type, ref_title, div/frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                # stays None when the entry carries neither tag 3 nor tag 6;
                # tag 6 wins when both are present
                fileno = None
                if 3 in tagMap:
                    fileno = tagMap[3][0]
                if 6 in tagMap:
                    fileno = tagMap[6][0]
                othtbl.append([ref_type, ref_title, fileno])
        self.othtbl = othtbl
        if self.DEBUG:
            print("\nOther (Guide) Table: %d entries" % len(self.othtbl))
            print("table: ref_type, ref_title, divtbl entry number")
            for entry in self.othtbl:
                print(entry)

    def buildParts(self, rawML):
        # now split the rawML into its flow pieces
        self.flows = []
        for j in xrange(0, len(self.fdsttbl) - 1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j + 1]
            if end == 0xffffffff:
                end = len(rawML)
                if self.DEBUG:
                    print "splitting rawml starting at %d and ending at %d into flow piece %d" % (
                        start, end, j)
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and <div> tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print "\nRebuilding flow piece 0: the main body of the ebook"
        self.parts = []
        self.partinfo = []
        divptr = 0
        baseptr = 0
        for [skelnum, skelname, divcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            for i in range(divcnt):
                [insertpos, idtext, filenum, seqnum, startpos,
                 length] = self.divtbl[divptr]
                if self.DEBUG:
                    print "    moving div/frag %d starting at %d of length %d" % (
                        divptr, startpos, length)
                    print "        inside of skeleton number %d at postion %d" % (
                        skelnum, insertpos)
                if i == 0:
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.xhtml' % filenum
                slice = text[baseptr:baseptr + length]
                insertpos = insertpos - skelpos
                skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
                baseptr = baseptr + length
                divptr += 1
            self.parts.append(skeleton)
            self.partinfo.append(
                [skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those  raster
        # images are in manifest but not in xhtml text - since they only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in xrange(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = re.search(svg_tag_pattern, flowpart)
            if m != None:
                # svg
                type = 'svg'
                start = m.start()
                m2 = re.search(image_tag_pattern, flowpart)
                if m2 != None:
                    format = 'inline'
                    dir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    format = 'file'
                    dir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                if flowpart.find('[CDATA[') >= 0:
                    type = 'css'
                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    type = 'css'
                    format = 'file'
                    dir = "Styles"
                    fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([type, format, dir, fname])

        if self.DEBUG:
            print "\nFlow Map:  %d entries" % len(self.flowinfo)
            for fi in self.flowinfo:
                print fi
            print "\n"

            print "\nXHTML File Part Position Information: %d entries" % len(
                self.partinfo)
            for pi in self.partinfo:
                print pi

        if False:  # self.DEBUG:
            # dump all of the locations of the aid tags used in TEXT
            # find id links only inside of tags
            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
            #    [^>]* means match any amount of chars except for  '>' char
            #    [^'"] match any amount of chars except for the quote character
            #    \s* means match any amount of whitespace
            print "\npositions of all aid= pieces"
            id_pattern = re.compile(
                r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
            for m in re.finditer(id_pattern, rawML):
                print "%0x %s %0x" % (m.start(), m.group(1),
                                      fromBase32(m.group(1)))
                [filename, partnum, start, end] = self.getFileInfo(m.start())
                print "   in  %d %0x %0x" % (partnum, start, end)

        return

    # get information about the part (file) that exists at pos in original rawML
    def getFileInfo(self, pos):
        """Return (filename, partnum, start, end) for the part holding pos.

        Each partinfo entry covers the half-open range [start, end); the
        first entry whose range contains pos wins.  Four Nones are returned
        when no entry matches.
        """
        for entry in self.partinfo:
            part_no, _folder, part_name, first, limit, _aid = entry
            if first <= pos < limit:
                return part_name, part_no, first, limit
        return None, None, None, None

    # accessor functions to properly protect the internal structure
    def getNumberOfParts(self):
        """Return how many entries self.parts currently holds."""
        part_total = len(self.parts)
        return part_total

    def getPart(self, i):
        """Return self.parts[i], or None when i is outside 0..len-1."""
        if i < 0 or i >= len(self.parts):
            return None
        return self.parts[i]

    def getPartInfo(self, i):
        """Return self.partinfo[i], or None when i is outside 0..len-1."""
        if i < 0 or i >= len(self.partinfo):
            return None
        return self.partinfo[i]

    def getNumberOfFlows(self):
        """Return how many entries self.flows holds (slot 0 included)."""
        flow_total = len(self.flows)
        return flow_total

    def getFlow(self, i):
        """Return self.flows[i] for 0 < i < len(self.flows), else None.

        Index 0 is deliberately refused: flows[0] is empty because it was
        all of the original text.
        """
        if i <= 0 or i >= len(self.flows):
            return None
        return self.flows[i]

    def getFlowInfo(self, i):
        """Return self.flowinfo[i] for 0 < i < len(self.flowinfo), else None.

        Index 0 is deliberately refused: flowinfo[0] is the empty
        placeholder for the original text.
        """
        if i <= 0 or i >= len(self.flowinfo):
            return None
        return self.flowinfo[i]

    def getIDTagByPosFid(self, posfid, offset):
        """Turn a kindle:pos:fid / offset pair into (filename, id text).

        posfid -- Base32 text; decoded, it is the row of self.divtbl to use
        offset -- Base32 text; decoded, it is added to that row's insert
                  position to give a position in the original rawML

        The filename comes from getFileInfo() and the id text from
        getIDTag(), both evaluated at the computed position.
        """
        row = fromBase32(posfid)
        off = fromBase32(offset)
        # only the insert position of the row is needed here
        insertpos, _idtext, _filenum, _seqnm, _startpos, _length = self.divtbl[row]
        pos = insertpos + off
        target_file, _pn, _skelpos, _skelend = self.getFileInfo(pos)
        # Amazon's own "aid=" attributes are not usable as link targets, so
        # look for the closest real "id=" before pos in the target file
        return target_file, self.getIDTag(pos)

    def getIDTag(self, pos):
        """Return the value of the last id attribute found before pos.

        pos is a position in the original rawML.  The part containing it is
        located with getFileInfo() and only that part's text up to pos is
        searched; when pos sits on or inside a tag the search is extended to
        just past that tag's closing '>'.

        Returns '' when no tag carrying an id attribute is found, which
        means the link must point to the top of the file.
        """
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        textblock = self.parts[pn]
        npos = pos - skelpos
        pgt = textblock.find('>', npos)
        plt = textblock.find('<', npos)
        # if npos inside a tag then search all text before its end of tag marker
        # else not in a tag, so only the preceding tags need searching
        if plt == npos or pgt < plt:
            npos = pgt + 1
        textblock = textblock[0:npos]
        # find id links only inside of tags
        #    inside any < > pair find all "id=' and return whatever is inside the quotes
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        id_pattern = re.compile(r'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''',
                                re.IGNORECASE)
        # every match lies inside the truncated text, i.e. before npos, so
        # the closest id is simply the last one matched
        last_id = None
        for m in id_pattern.finditer(textblock):
            last_id = m.group(1)
        if last_id is None:
            if self.DEBUG:
                print("Found no id in the textblock, link must be to top of file")
            return ''
        return last_id

    # do we need to do deep copying
    def setParts(self, parts):
        """Overwrite each stored part, in place, with the matching entry of
        parts.  Both lists must have the same length; entries are stored by
        reference, not copied."""
        assert (len(self.parts) == len(parts))
        for idx, replacement in enumerate(parts):
            self.parts[idx] = replacement

    # do we need to do deep copying
    def setFlows(self, flows):
        """Overwrite each stored flow, in place, with the matching entry of
        flows.  Both lists must have the same length; entries are stored by
        reference, not copied."""
        assert (len(self.flows) == len(flows))
        for idx, replacement in enumerate(flows):
            self.flows[idx] = replacement

    # get information about the part (file) that exists at pos in original rawML
    def getSkelInfo(self, pos):
        """Return the full partinfo record of the part holding pos.

        The result is a new list [partnum, dir, filename, start, end,
        aidtext] for the first entry whose [start, end) range contains pos,
        or a list of six Nones when no entry matches.
        """
        for record in self.partinfo:
            partnum, folder, filename, first, limit, aidtext = record
            if first <= pos < limit:
                return [partnum, folder, filename, first, limit, aidtext]
        return [None] * 6

    # fileno is actually a reference into divtbl (a fragment)
    def getGuideText(self):
        guidetext = ''
        for [ref_type, ref_title, fileno] in self.othtbl:
            [pos, idtext, filenum, seqnm, startpos,
             length] = self.divtbl[fileno]
            [pn, dir, filename, skelpos, skelend,
             aidtext] = self.getSkelInfo(pos)
            idtext = self.getIDTag(pos)
            linktgt = filename
            if idtext != '':
                linktgt += '#' + idtext
            guidetext += '<reference type="%s" title="%s" href="%s/%s" />\n' % (
                ref_type, ref_title, dir, linktgt)
        # opf is encoded utf-8 so must convert any titles properly
        guidetext = unicode(guidetext, self.mh.codec).encode("utf-8")
        return guidetext
Beispiel #6
0
    def __init__(self, mh, sect, files, debug=False):
        self.sect = sect
        self.files = files
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.fragidx = mh.fragidx
        self.guideidx = mh.guideidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.fdsttbl = [0, 0xffffffff]
        self.DEBUG = debug

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                num_sections, = struct.unpack_from('>L', header, 0x08)
                self.fdsttbl = struct.unpack_from('>%dL' %
                                                  (num_sections * 2), header,
                                                  12)[::2] + (mh.rawSize, )
                sect.setsectiondescription(self.fdst, "KF8 FDST INDX")
                if self.DEBUG:
                    print "\nFDST Section Map:  %d sections" % num_sections
                    for j in xrange(num_sections):
                        print "Section %d: 0x%08X - 0x%08X" % (
                            j, self.fdsttbl[j], self.fdsttbl[j + 1])
            else:
                print "\nError: K8 Mobi with Missing FDST info"

        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            # for i in xrange(2):
            #     fname = 'skel%04d.dat' % i
            #     data = self.sect.loadSection(self.skelidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx,
                                                     "KF8 Skeleton")
            fileptr = 0
            for [text, tagMap] in outtbl:
                # file number, skeleton name, fragtbl record count, start position, length
                skeltbl.append(
                    [fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
                fileptr += 1
        self.skeltbl = skeltbl
        if self.DEBUG:
            print "\nSkel Table:  %d entries" % len(self.skeltbl)
            print "table: filenum, skeleton name, frag tbl record count, start position, length"
            for j in xrange(len(self.skeltbl)):
                print self.skeltbl[j]

        # read/process the fragment index to create the fragment table
        fragtbl = []
        if self.fragidx != 0xffffffff:
            # for i in xrange(3):
            #     fname = 'frag%04d.dat' % i
            #     data = self.sect.loadSection(self.fragidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.fragidx,
                                                     "KF8 Fragment")
            for [text, tagMap] in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                fragtbl.append([
                    int(text), ctocdata, tagMap[3][0], tagMap[4][0],
                    tagMap[6][0], tagMap[6][1]
                ])
        self.fragtbl = fragtbl
        if self.DEBUG:
            print "\nFragment Table: %d entries" % len(self.fragtbl)
            print "table: file position, link id text, file num, sequence number, start position, length"
            for j in xrange(len(self.fragtbl)):
                print self.fragtbl[j]

        # read / process guide index for guide elements of opf
        guidetbl = []
        if self.guideidx != 0xffffffff:
            # for i in xrange(3):
            #     fname = 'guide%04d.dat' % i
            #     data = self.sect.loadSection(self.guideidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.guideidx,
                                                     "KF8 Guide elements)")
            for [text, tagMap] in outtbl:
                # ref_type, ref_title, frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                fileno = None
                if 3 in tagMap.keys():
                    fileno = tagMap[3][0]
                if 6 in tagMap.keys():
                    fileno = tagMap[6][0]
                guidetbl.append([ref_type, ref_title, fileno])
        self.guidetbl = guidetbl
        if self.DEBUG:
            print "\nGuide Table: %d entries" % len(self.guidetbl)
            print "table: ref_type, ref_title, fragtbl entry number"
            for j in xrange(len(self.guidetbl)):
                print self.guidetbl[j]
Beispiel #7
0
class K8Processor:
    def __init__(self, mh, sect, files, debug=False):
        self.sect = sect
        self.files = files
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.fragidx = mh.fragidx
        self.guideidx = mh.guideidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.fdsttbl = [0, 0xffffffff]
        self.DEBUG = debug

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                num_sections, = struct.unpack_from('>L', header, 0x08)
                self.fdsttbl = struct.unpack_from('>%dL' %
                                                  (num_sections * 2), header,
                                                  12)[::2] + (mh.rawSize, )
                sect.setsectiondescription(self.fdst, "KF8 FDST INDX")
                if self.DEBUG:
                    print "\nFDST Section Map:  %d sections" % num_sections
                    for j in xrange(num_sections):
                        print "Section %d: 0x%08X - 0x%08X" % (
                            j, self.fdsttbl[j], self.fdsttbl[j + 1])
            else:
                print "\nError: K8 Mobi with Missing FDST info"

        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            # for i in xrange(2):
            #     fname = 'skel%04d.dat' % i
            #     data = self.sect.loadSection(self.skelidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx,
                                                     "KF8 Skeleton")
            fileptr = 0
            for [text, tagMap] in outtbl:
                # file number, skeleton name, fragtbl record count, start position, length
                skeltbl.append(
                    [fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
                fileptr += 1
        self.skeltbl = skeltbl
        if self.DEBUG:
            print "\nSkel Table:  %d entries" % len(self.skeltbl)
            print "table: filenum, skeleton name, frag tbl record count, start position, length"
            for j in xrange(len(self.skeltbl)):
                print self.skeltbl[j]

        # read/process the fragment index to create the fragment table
        fragtbl = []
        if self.fragidx != 0xffffffff:
            # for i in xrange(3):
            #     fname = 'frag%04d.dat' % i
            #     data = self.sect.loadSection(self.fragidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.fragidx,
                                                     "KF8 Fragment")
            for [text, tagMap] in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                fragtbl.append([
                    int(text), ctocdata, tagMap[3][0], tagMap[4][0],
                    tagMap[6][0], tagMap[6][1]
                ])
        self.fragtbl = fragtbl
        if self.DEBUG:
            print "\nFragment Table: %d entries" % len(self.fragtbl)
            print "table: file position, link id text, file num, sequence number, start position, length"
            for j in xrange(len(self.fragtbl)):
                print self.fragtbl[j]

        # read / process guide index for guide elements of opf
        guidetbl = []
        if self.guideidx != 0xffffffff:
            # for i in xrange(3):
            #     fname = 'guide%04d.dat' % i
            #     data = self.sect.loadSection(self.guideidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.guideidx,
                                                     "KF8 Guide elements)")
            for [text, tagMap] in outtbl:
                # ref_type, ref_title, frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                fileno = None
                if 3 in tagMap.keys():
                    fileno = tagMap[3][0]
                if 6 in tagMap.keys():
                    fileno = tagMap[6][0]
                guidetbl.append([ref_type, ref_title, fileno])
        self.guidetbl = guidetbl
        if self.DEBUG:
            print "\nGuide Table: %d entries" % len(self.guidetbl)
            print "table: ref_type, ref_title, fragtbl entry number"
            for j in xrange(len(self.guidetbl)):
                print self.guidetbl[j]

    def buildParts(self, rawML):
        # now split the rawML into its flow pieces
        self.flows = []
        for j in xrange(0, len(self.fdsttbl) - 1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j + 1]
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and fragment tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print "\nRebuilding flow piece 0: the main body of the ebook"
        self.parts = []
        self.partinfo = []
        fragptr = 0
        baseptr = 0
        cnt = 0
        for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            for i in range(fragcnt):
                [insertpos, idtext, filenum, seqnum, startpos,
                 length] = self.fragtbl[fragptr]
                aidtext = idtext[12:-2]
                if i == 0:
                    filename = 'part%04d.xhtml' % filenum
                slice = text[baseptr:baseptr + length]
                insertpos = insertpos - skelpos
                head = skeleton[:insertpos]
                tail = skeleton[insertpos:]
                actual_inspos = insertpos
                if (tail.find(b'>') < tail.find(b'<')
                        or head.rfind(b'>') < head.rfind(b'<')):
                    # There is an incomplete tag in either the head or tail.
                    # This can happen for some badly formed KF8 files
                    print 'The fragment table for %s has incorrect insert position. Calculating manually.' % skelname
                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
                    if bp != ep:
                        actual_inspos = ep + 1 + startpos
                if insertpos != actual_inspos:
                    print "fixed corrupt fragment table insert position", insertpos + skelpos, actual_inspos + skelpos
                    insertpos = actual_inspos
                    self.fragtbl[fragptr][0] = actual_inspos + skelpos
                skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
                baseptr = baseptr + length
                fragptr += 1
            cnt += 1
            self.parts.append(skeleton)
            self.partinfo.append(
                [skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        assembled_text = "".join(self.parts)
        if self.DEBUG:
            outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
            open(pathof(outassembled), 'wb').write(assembled_text)

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those  raster
        # images are in manifest but not in xhtml text - since they only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in xrange(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = re.search(svg_tag_pattern, flowpart)
            if m is not None:
                # svg
                type = 'svg'
                start = m.start()
                m2 = re.search(image_tag_pattern, flowpart)
                if m2 is not None:
                    format = 'inline'
                    dir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    format = 'file'
                    dir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                if flowpart.find('[CDATA[') >= 0:
                    type = 'css'
                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    type = 'css'
                    format = 'file'
                    dir = "Styles"
                    fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([type, format, dir, fname])

        if self.DEBUG:
            print "\nFlow Map:  %d entries" % len(self.flowinfo)
            for fi in self.flowinfo:
                print fi
            print "\n"

            print "\nXHTML File Part Position Information: %d entries" % len(
                self.partinfo)
            for pi in self.partinfo:
                print pi

        if False:  # self.Debug:
            # dump all of the locations of the aid tags used in TEXT
            # find id links only inside of tags
            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
            #    [^>]* means match any amount of chars except for  '>' char
            #    [^'"] match any amount of chars except for the quote character
            #    \s* means match any amount of whitespace
            print "\npositions of all aid= pieces"
            id_pattern = re.compile(
                r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
            for m in re.finditer(id_pattern, rawML):
                [filename, partnum, start, end] = self.getFileInfo(m.start())
                [seqnum, idtext] = self.getFragTblInfo(m.start())
                value = fromBase32(m.group(1))
                print "  aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (
                    m.group(1), value, m.start(), partnum, start, end)
                print "       %s  fragtbl entry %d" % (idtext, seqnum)

        return

    # get information fragment table entry by pos
    def getFragTblInfo(self, pos):
        for j in xrange(len(self.fragtbl)):
            [insertpos, idtext, filenum, seqnum, startpos,
             length] = self.fragtbl[j]
            if pos >= insertpos and pos < (insertpos + length):
                return seqnum, 'in: ' + idtext
            if pos < insertpos:
                return seqnum, 'before: ' + idtext
        return None, None

    # get information about the part (file) that exists at pos in original rawML
    def getFileInfo(self, pos):
        for [partnum, dir, filename, start, end, aidtext] in self.partinfo:
            if pos >= start and pos < end:
                return filename, partnum, start, end
        return None, None, None, None

    # accessor functions to properly protect the internal structure
    def getNumberOfParts(self):
        return len(self.parts)

    def getPart(self, i):
        if i >= 0 and i < len(self.parts):
            return self.parts[i]
        return None

    def getPartInfo(self, i):
        if i >= 0 and i < len(self.partinfo):
            return self.partinfo[i]
        return None

    def getNumberOfFlows(self):
        return len(self.flows)

    def getFlow(self, i):
        # note flows[0] is empty - it was all of the original text
        if i > 0 and i < len(self.flows):
            return self.flows[i]
        return None

    def getFlowInfo(self, i):
        # note flowinfo[0] is empty - it was all of the original text
        if i > 0 and i < len(self.flowinfo):
            return self.flowinfo[i]
        return None

    def getIDTagByPosFid(self, posfid, offset):
        # first convert kindle:pos:fid and offset info to position in file
        row = fromBase32(posfid)
        off = fromBase32(offset)
        [insertpos, idtext, filenum, seqnm, startpos,
         length] = self.fragtbl[row]
        pos = insertpos + off
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if fname is None:
            # pos does not exist
            # default to skeleton pos instead
            print "Link To Position", pos, "does not exist, retargeting to top of target"
            pos = self.skeltbl[filenum][3]
            fname, pn, skelpos, skelend = self.getFileInfo(pos)
        # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
        # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
        # some position information encoded into Base32 name.
        # so find the closest "id=" before position the file  by actually searching in that file
        idtext = self.getIDTag(pos)
        return fname, idtext

    def getIDTag(self, pos):
        """Return the first named anchor (id or name attribute) before pos.

        pos is a position in the original rawML.  The part containing it is
        located with getFileInfo(); that part's text up to pos is then
        walked tag by tag with reverse_tag_iter() (presumably last tag
        first - confirm against that helper) and the first id or name
        attribute value found is returned.  <meta> tags are ignored.

        Returns '' (meaning "top of file") when
          - no part contains pos (an error line is printed first),
          - a <body ...> tag is reached before any anchor, or
          - no anchor is found at all.
        """
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if pn is None and skelpos is None:
            print "Error: getIDTag - no file contains ", pos
            # there is no text to search; carrying on would only fail on
            # self.parts[None], so answer like every other "no anchor" case
            return ''
        textblock = self.parts[pn]
        npos = pos - skelpos
        # if npos inside a tag then search all text before its end of tag marker
        pgt = textblock.find('>', npos)
        plt = textblock.find('<', npos)
        if plt == npos or pgt < plt:
            npos = pgt + 1
        # find id and name attributes only inside of tags
        # use a reverse tag search since that is faster
        #    inside any < > pair find "id=" and "name=" attributes return it
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        textblock = textblock[0:npos]
        id_pattern = re.compile(r'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',
                                re.IGNORECASE)
        name_pattern = re.compile(r'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',
                                  re.IGNORECASE)
        for tag in reverse_tag_iter(textblock):
            # any ids in the body should default to top of file
            if tag[0:6] == '<body ':
                return ''
            if tag[0:6] != '<meta ':
                m = id_pattern.match(tag) or name_pattern.match(tag)
                if m is not None:
                    return m.group(1)
        return ''

    # do we need to do deep copying
    def setParts(self, parts):
        assert (len(parts) == len(self.parts))
        for i in range(len(parts)):
            self.parts[i] = parts[i]

    # do we need to do deep copying
    def setFlows(self, flows):
        assert (len(flows) == len(self.flows))
        for i in xrange(len(flows)):
            self.flows[i] = flows[i]

    # get information about the part (file) that exists at pos in original rawML
    def getSkelInfo(self, pos):
        for [partnum, dir, filename, start, end, aidtext] in self.partinfo:
            if pos >= start and pos < end:
                return [partnum, dir, filename, start, end, aidtext]
        return [None, None, None, None, None, None]

    # fileno is actually a reference into fragtbl (a fragment)
    def getGuideText(self):
        guidetext = ''
        for [ref_type, ref_title, fileno] in self.guidetbl:
            if ref_type == 'thumbimagestandard':
                continue
            if ref_type not in _guide_types and not ref_type.startswith(
                    'other.'):
                if ref_type == 'start':
                    ref_type = 'text'
                else:
                    ref_type = 'other.' + ref_type
            [pos, idtext, filenum, seqnm, startpos,
             length] = self.fragtbl[fileno]
            [pn, dir, filename, skelpos, skelend,
             aidtext] = self.getSkelInfo(pos)
            idtext = self.getIDTag(pos)
            linktgt = filename
            if idtext != '':
                linktgt += '#' + idtext
            guidetext += '<reference type="%s" title="%s" href="%s/%s" />\n' % (
                ref_type, ref_title, dir, linktgt)
        # opf is encoded utf-8 so must convert any titles properly
        guidetext = unicode(guidetext, self.mh.codec).encode("utf-8")
        return guidetext

    def getPageIDTag(self, pos):
        # find the first tag with a named anchor (name or id attribute) before pos
        # but page map offsets need to little more leeway so if the offset points
        # into a tag look for the next ending tag "/>" or "</" and start your search from there.
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if pn is None and skelpos is None:
            print "Error: getIDTag - no file contains ", pos
        textblock = self.parts[pn]
        npos = pos - skelpos
        # if npos inside a tag then search all text before next ending tag
        pgt = textblock.find('>', npos)
        plt = textblock.find('<', npos)
        if plt == npos or pgt < plt:
            # we are in a tag
            # so find first ending tag
            pend1 = textblock.find('/>', npos)
            pend2 = textblock.find('</', npos)
            if pend1 != -1 and pend2 != -1:
                pend = min(pend1, pend2)
            else:
                pend = max(pend1, pend2)
            if pend != -1:
                npos = pend
            else:
                npos = pgt + 1
        # find id and name attributes only inside of tags
        # use a reverse tag search since that is faster
        #    inside any < > pair find "id=" and "name=" attributes return it
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        textblock = textblock[0:npos]
        id_pattern = re.compile(r'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',
                                re.IGNORECASE)
        name_pattern = re.compile(r'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',
                                  re.IGNORECASE)
        for tag in reverse_tag_iter(textblock):
            # any ids in the body should default to top of file
            if tag[0:6] == '<body ':
                return ''
            if tag[0:6] != '<meta ':
                m = id_pattern.match(tag) or name_pattern.match(tag)
                if m is not None:
                    return m.group(1)
        return ''
Beispiel #8
0
class ncxExtract:
    def __init__(self, mh, files):
        self.mh = mh
        self.sect = self.mh.sect
        self.files = files
        self.isNCX = False
        self.mi = MobiIndex(self.sect)
        self.ncxidx = self.mh.ncxidx
        self.indx_data = None

    def parseNCX(self):
        indx_data = []
        tag_fieldname_map = {
                1: ['pos',0],
                2: ['len',0],
                3: ['noffs',0],
                4: ['hlvl',0],
                5: ['koffs',0],
                6: ['pos_fid',0],
                21: ['parent',0],
                22: ['child1',0],
                23: ['childn',0]
        }
        if self.ncxidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
            if DEBUG_NCX:
                print ctoc_text
                print outtbl
            num = 0
            for [text, tagMap] in outtbl:
                tmp = {
                        'name': text,
                        'pos':  -1,
                        'len':  0,
                        'noffs': -1,
                        'text' : "Unknown Text",
                        'hlvl' : -1,
                        'kind' : "Unknown Kind",
                        'pos_fid' : None,
                        'parent' : -1,
                        'child1' : -1,
                        'childn' : -1,
                        'num'  : num
                        }
                for tag in tag_fieldname_map.keys():
                    [fieldname, i] = tag_fieldname_map[tag]
                    if tag in tagMap:
                        fieldvalue = tagMap[tag][i]
                        if tag == 6:
                            pos_fid = toBase32(fieldvalue,4)
                            fieldvalue2 = tagMap[tag][i+1]
                            pos_off = toBase32(fieldvalue2,10)
                            fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
                        tmp[fieldname] = fieldvalue
                        if tag == 3:
                            toctext = ctoc_text.get(fieldvalue, 'Unknown Text')
                            if self.mh.codec != 'utf-8':
                                toctext = unicode(toctext, self.mh.codec).encode('utf-8')
                            tmp['text'] = toctext
                        if tag == 5:
                            kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind')
                            if self.mh.codec != 'utf-8':
                                kindtext = unicode(kindtext, self.mh.codec).encode('utf-8')
                            tmp['kind'] = kindtext
                indx_data.append(tmp)
                if DEBUG_NCX:
                    print "record number: ", num
                    print "name: ", tmp['name'],
                    print "position", tmp['pos']," length: ", tmp['len']
                    print "text: ", tmp['text']
                    print "kind: ", tmp['kind']
                    print "heading level: ", tmp['hlvl']
                    print "parent:", tmp['parent']
                    print "first child: ",tmp['child1']," last child: ", tmp['childn']
                    print "pos_fid is ", tmp['pos_fid']
                    print "\n\n"
                num += 1
        self.indx_data = indx_data
        return indx_data


    def buildNCX(self, htmlfile, title, ident, lang):
        indx_data = self.indx_data

        ncx_header = \
'''<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
<head>
<meta content="%s" name="dtb:uid"/>
<meta content="%d" name="dtb:depth"/>
<meta content="mobiunpack.py" name="dtb:generator"/>
<meta content="0" name="dtb:totalPageCount"/>
<meta content="0" name="dtb:maxPageNumber"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''

        ncx_footer = \
'''  </navMap>
</ncx>
'''

        ncx_entry = \
'''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>'''

        #recursive part
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print "Warning: missing INDX child entries", start, end, len(indx_data)
                return ''
            if DEBUG_NCX:
                print "recursINDX lvl %d from %d to %d" % (lvl, start, end)
            xml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl
            indent = '  ' * (2 + lvl)

            for i in range(start, end):
                e = indx_data[i]
                if not e['hlvl'] == lvl:
                    continue
                #open entry
                num += 1
                link = '%s#filepos%d' % (htmlfile, e['pos'])
                tagid = 'np_%d' % num
                entry = ncx_entry % (tagid, num, e['text'], link)
                entry = re.sub(re.compile('^', re.M), indent, entry, 0)
                xml += entry + '\n'
                #recurs
                if e['child1']>=0:
                    xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,\
                            e['child1'], e['childn'] + 1)
                    xml += xmlrec
                #close entry
                xml += indent + '</navPoint>\n'
            return xml, max_lvl, num

        body, max_lvl, num = recursINDX()
        header = ncx_header % (lang, ident, max_lvl + 1, title)
        ncx =  header + body + ncx_footer
        if not len(indx_data) == num:
            print "Warning: different number of entries in NCX", len(indx_data), num
        return ncx

    def writeNCX(self, metadata):
        # build the xml
        self.isNCX = True
        print "Write ncx"
        htmlname = os.path.basename(self.files.outbase)
        htmlname += '.html'
        xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
        #write the ncx file
        ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
        open(pathof(ncxname), 'wb').write(xml)


    def buildK8NCX(self, indx_data, title, ident, lang):
        ncx_header = \
'''<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
<head>
<meta content="%s" name="dtb:uid"/>
<meta content="%d" name="dtb:depth"/>
<meta content="mobiunpack.py" name="dtb:generator"/>
<meta content="0" name="dtb:totalPageCount"/>
<meta content="0" name="dtb:maxPageNumber"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''

        ncx_footer = \
'''  </navMap>
</ncx>
'''

        ncx_entry = \
'''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>'''

        #recursive part
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print "Warning: missing INDX child entries", start, end, len(indx_data)
                return ''
            if DEBUG_NCX:
                print "recursINDX lvl %d from %d to %d" % (lvl, start, end)
            xml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl
            indent = '  ' * (2 + lvl)

            for i in range(start, end):
                e = indx_data[i]
                htmlfile = e['filename']
                desttag = e['idtag']
                if not e['hlvl'] == lvl:
                    continue
                #open entry
                num += 1
                if desttag == '':
                    link = 'Text/%s' % htmlfile
                else:
                    link = 'Text/%s#%s' % (htmlfile, desttag)
                tagid = 'np_%d' % num
                entry = ncx_entry % (tagid, num, e['text'], link)
                entry = re.sub(re.compile('^', re.M), indent, entry, 0)
                xml += entry + '\n'
                #recurs
                if e['child1']>=0:
                    xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,\
                            e['child1'], e['childn'] + 1)
                    xml += xmlrec
                #close entry
                xml += indent + '</navPoint>\n'
            return xml, max_lvl, num

        body, max_lvl, num = recursINDX()
        header = ncx_header % (lang, ident, max_lvl + 1, title)
        ncx =  header + body + ncx_footer
        if not len(indx_data) == num:
            print "Warning: different number of entries in NCX", len(indx_data), num
        return ncx

    def writeK8NCX(self, ncx_data, metadata):
        # build the xml
        self.isNCX = True
        print "Write K8 ncx"
        xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
        bname = 'toc.ncx'
        ncxname = os.path.join(self.files.k8oebps,bname)
        open(pathof(ncxname), 'wb').write(xml)
class K8Processor:
    def __init__(self, mh, sect, debug=False):
        self.sect = sect
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.dividx = mh.dividx
        self.othidx = mh.othidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.fdsttbl= [0,0xffffffff]
        self.DEBUG = debug

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                num_sections, = struct.unpack_from('>L', header, 0x08)
                self.fdsttbl = struct.unpack_from('>%dL' % (num_sections*2), header, 12)[::2] + (mh.rawSize, )
                sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
                if self.DEBUG:
                    print "\nFDST Section Map:  %d sections" % num_sections
                    for j in xrange(num_sections):
                         print "Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1])
            else:
                print "\nError: K8 Mobi with Missing FDST info"


        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            # for i in xrange(2):
            #     fname = 'skel%04d.dat' % i
            #     data = self.sect.loadSection(self.skelidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
            fileptr = 0
            for [text, tagMap] in outtbl:
                # file number, skeleton name, divtbl record count, start position, length
                skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
                fileptr += 1
        self.skeltbl = skeltbl
        if self.DEBUG:
            print "\nSkel Table:  %d entries" % len(self.skeltbl)
            print "table: filenum, skeleton name, div tbl record count, start position, length"
            for j in xrange(len(self.skeltbl)):
                print self.skeltbl[j]

        # read/process the div index to create to <div> (and <p>) table
        divtbl = []
        if self.dividx != 0xffffffff:
            # for i in xrange(3):
            #     fname = 'div%04d.dat' % i
            #     data = self.sect.loadSection(self.dividx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.dividx, "KF8 Division/Fragment")
            for [text, tagMap] in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                divtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
        self.divtbl = divtbl
        if self.DEBUG:
            print "\nDiv (Fragment) Table: %d entries" % len(self.divtbl)
            print "table: file position, link id text, file num, sequence number, start position, length"
            for j in xrange(len(self.divtbl)):
                print self.divtbl[j]

        # read / process other index <guide> element of opf
        othtbl = []
        if self.othidx != 0xffffffff:
            # for i in xrange(3):
            #     fname = 'oth%04d.dat' % i
            #     data = self.sect.loadSection(self.othidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.othidx, "KF8 Other (<guide> elements)")
            for [text, tagMap] in outtbl:
                # ref_type, ref_title, div/frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                fileno = None
                if 3 in tagMap.keys():
                    fileno  = tagMap[3][0]
                if 6 in tagMap.keys():
                    fileno = tagMap[6][0]
                othtbl.append([ref_type, ref_title, fileno])
        self.othtbl = othtbl
        if self.DEBUG:
            print "\nOther (Guide) Table: %d entries" % len(self.othtbl)
            print "table: ref_type, ref_title, divtbl entry number"
            for j in xrange(len(self.othtbl)):
                print self.othtbl[j]


    def buildParts(self, rawML):
        # now split the rawML into its flow pieces
        self.flows = []
        for j in xrange(0, len(self.fdsttbl)-1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j+1]
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and <div> tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print "\nRebuilding flow piece 0: the main body of the ebook"
        self.parts = []
        self.partinfo = []
        divptr = 0
        baseptr = 0
        cnt = 0
        for [skelnum, skelname, divcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos: baseptr]
            for i in range(divcnt):
                [insertpos, idtext, filenum, seqnum, startpos, length] = self.divtbl[divptr]
                aidtext = idtext[12:-2]
                if i == 0:
                    filename = 'part%04d.xhtml' % filenum
                slice = text[baseptr: baseptr + length]
                insertpos = insertpos - skelpos
                head = skeleton[:insertpos]
                tail = skeleton[insertpos:]
                actual_inspos = insertpos
                if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
                    # There is an incomplete tag in either the head or tail.
                    # This can happen for some badly formed KF8 files
                    print 'The div table for %s has incorrect insert position. Calculating manually.' % skelname
                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
                    if bp != ep:
                        actual_inspos = ep + 1 + startpos
                if insertpos != actual_inspos:
                    print "fixed corrupt div/frag table insert position", insertpos+skelpos, actual_inspos+skelpos
                    insertpos = actual_inspos
                    self.divtbl[divptr][0] = actual_inspos + skelpos
                skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
                baseptr = baseptr + length
                divptr += 1
            cnt += 1
            self.parts.append(skeleton)
            self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        # assembled_text = "".join(self.parts)
        # open(pathof('assembled_text.dat'),'wb').write(assembled_text)

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those  raster
        # images are in manifest but not in xhtml text - since they only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in xrange(1,len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = re.search(svg_tag_pattern, flowpart)
            if m != None:
                # svg
                type = 'svg'
                start = m.start()
                m2 = re.search(image_tag_pattern, flowpart)
                if m2 != None:
                    format = 'inline'
                    dir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    format = 'file'
                    dir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                if flowpart.find('[CDATA[') >= 0:
                    type = 'css'
                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    type = 'css'
                    format = 'file'
                    dir = "Styles"
                    fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([type, format, dir, fname])


        if self.DEBUG:
            print "\nFlow Map:  %d entries" % len(self.flowinfo)
            for fi in self.flowinfo:
                print fi
            print "\n"

            print "\nXHTML File Part Position Information: %d entries" % len(self.partinfo)
            for pi in self.partinfo:
                print pi

        if False: #self.Debug:
            # dump all of the locations of the aid tags used in TEXT
            # find id links only inside of tags
            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
            #    [^>]* means match any amount of chars except for  '>' char
            #    [^'"] match any amount of chars except for the quote character
            #    \s* means match any amount of whitespace
            print "\npositions of all aid= pieces"
            id_pattern = re.compile(r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE)
            for m in re.finditer(id_pattern, rawML):
                [filename, partnum, start, end] = self.getFileInfo(m.start())
                [seqnum, idtext] = self.getDivTblInfo(m.start())
                value = fromBase32(m.group(1))
                print "  aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end)
                print "       %s  divtbl entry %d" % (idtext, seqnum)

        return


    # get information div table entry by pos
    def getDivTblInfo(self, pos):
        baseptr = 0
        for j in xrange(len(self.divtbl)):
            [insertpos, idtext, filenum, seqnum, startpos, length] = self.divtbl[j]
            if pos >= insertpos and pos < (insertpos + length):
                return seqnum, 'in: ' + idtext
            if pos < insertpos:
                return seqnum, 'before: ' + idtext
        return None, None


    # get information about the part (file) that exists at pos in original rawML
    def getFileInfo(self, pos):
        for [partnum, dir, filename, start, end, aidtext] in self.partinfo:
            if pos >= start and pos < end:
                return filename, partnum, start, end
        return None, None, None, None


    # accessor functions to properly protect the internal structure
    def getNumberOfParts(self):
        return len(self.parts)

    def getPart(self,i):
        if i >= 0 and i < len(self.parts):
            return self.parts[i]
        return None

    def getPartInfo(self, i):
        if i >= 0 and i < len(self.partinfo):
            return self.partinfo[i]
        return None

    def getNumberOfFlows(self):
        return len(self.flows)

    def getFlow(self,i):
        # note flows[0] is empty - it was all of the original text
        if i > 0 and i < len(self.flows):
            return self.flows[i]
        return None

    def getFlowInfo(self,i):
        # note flowinfo[0] is empty - it was all of the original text
        if i > 0 and i < len(self.flowinfo):
            return self.flowinfo[i]
        return None


    def getIDTagByPosFid(self, posfid, offset):
        # first convert kindle:pos:fid and offset info to position in file
        row = fromBase32(posfid)
        off = fromBase32(offset)
        [insertpos, idtext, filenum, seqnm, startpos, length] = self.divtbl[row]
        pos = insertpos + off
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if fname is None:
            # pos does not exist
            # default to skeleton pos instead
            print "Link To Position", pos, "does not exist, retargeting to top of target"
            pos = self.skeltbl[filenum][3]
            fname, pn, skelpos, skelend = self.getFileInfo(pos)
        # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
        # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
        # some position information encoded into Base32 name.
        # so find the closest "id=" before position the file  by actually searching in that file
        idtext = self.getIDTag(pos)
        return fname, idtext

    def getIDTag(self, pos):
        """Return the id (or name) attribute of the nearest anchor tag before pos.

        pos is a position in the original rawML.  The part containing pos is
        searched backwards, tag by tag, for the first tag that carries an
        id= or name= attribute.  '' is returned when no such tag precedes pos
        and also when pos lies outside every part; callers treat '' as a link
        to the top of the file.
        """
        # find the first tag with a named anchor (name or id attribute) before pos
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if pn is None and skelpos is None:
            print "Error: getIDTag - no file contains ", pos
            # there is no text to search; going on would index self.parts with None
            return ''
        textblock = self.parts[pn]
        npos = pos - skelpos
        # if npos inside a tag then search all text before the its end of tag marker
        pgt = textblock.find('>',npos)
        plt = textblock.find('<',npos)
        if plt == npos or pgt < plt:
            npos = pgt + 1
        # find id and name attributes only inside of tags
        # use a reverse tag search since that is faster
        #    inside any < > pair find "id=" and "name=" attributes return it
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        textblock = textblock[0:npos]
        id_pattern = re.compile(r'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
        name_pattern = re.compile(r'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
        for tag in reverse_tag_iter(textblock):
            m = id_pattern.match(tag) or name_pattern.match(tag)
            if m is not None:
                return m.group(1)
        if self.DEBUG:
            print "Found no id in the textblock, link must be to top of file"
        return ''


    # do we need to do deep copying
    def setParts(self, parts):
        assert(len(parts) == len(self.parts))
        for i in range(len(parts)):
            self.parts[i] = parts[i]

    # do we need to do deep copying
    def setFlows(self, flows):
        assert(len(flows) == len(self.flows))
        for i in xrange(len(flows)):
            self.flows[i] = flows[i]


    # get information about the part (file) that exists at pos in original rawML
    def getSkelInfo(self, pos):
        for [partnum, dir, filename, start, end, aidtext] in self.partinfo:
            if pos >= start and pos < end:
                return [partnum, dir, filename, start, end, aidtext]
        return [None, None, None, None, None, None]

    # fileno is actually a reference into divtbl (a fragment)
    def getGuideText(self):
        guidetext = ''
        for [ref_type, ref_title, fileno] in self.othtbl:
            if ref_type == 'thumbimagestandard':
                continue
            if ref_type not in _guide_types and not ref_type.startswith('other.'):
                if ref_type == 'start':
                    ref_type = 'text'
                else:
                    ref_type = 'other.' + ref_type
            [pos, idtext, filenum, seqnm, startpos, length] = self.divtbl[fileno]
            [pn, dir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
            idtext = self.getIDTag(pos)
            linktgt = filename
            if idtext != '':
                linktgt += '#' + idtext
            guidetext += '<reference type="%s" title="%s" href="%s/%s" />\n' % (ref_type, ref_title, dir, linktgt)
        # opf is encoded utf-8 so must convert any titles properly
        guidetext = unicode(guidetext, self.mh.codec).encode("utf-8")
        return guidetext
    def __init__(self, mh, sect, files, debug=False):
        """Read the KF8 FDST, skeleton, fragment and guide tables.

        mh    -- header object; skelidx, fragidx, guideidx, fdst and rawSize are read from it
        sect  -- section store; used via loadSection() and setsectiondescription()
        files -- stored as self.files; not used any further in this method
        debug -- when true every table is printed once it has been built

        An index number of 0xffffffff means the corresponding table is absent;
        that table is then left empty (fdsttbl keeps its default).
        """
        self.sect = sect
        self.files = files
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.fragidx = mh.fragidx
        self.guideidx = mh.guideidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        # default used when no valid FDST record is found
        self.fdsttbl= [0,0xffffffff]
        self.DEBUG = debug

        # read in and parse the FDST info which is very similar in format to the Palm DB section
        # parsing except it provides offsets into rawML file and not the Palm DB file
        # this is needed to split up the final css, svg, etc flow section
        # that can exist at the end of the rawML file
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] == "FDST":
                # big-endian section count at offset 0x08
                num_sections, = struct.unpack_from('>L', header, 0x08)
                # two values per section start at offset 12; only the first of
                # each pair is kept and mh.rawSize is appended as the final bound
                self.fdsttbl = struct.unpack_from('>%dL' % (num_sections*2), header, 12)[::2] + (mh.rawSize, )
                sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
                if self.DEBUG:
                    print "\nFDST Section Map:  %d sections" % num_sections
                    for j in xrange(num_sections):
                         print "Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1])
            else:
                print "\nError: K8 Mobi with Missing FDST info"


        # read/process skeleton index info to create the skeleton table
        skeltbl = []
        if self.skelidx != 0xffffffff:
            # disabled debugging aid: dump the raw skeleton index sections
            # for i in xrange(2):
            #     fname = 'skel%04d.dat' % i
            #     data = self.sect.loadSection(self.skelidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
            # fileptr numbers the skeletons in the order the index lists them
            fileptr = 0
            for [text, tagMap] in outtbl:
                # file number, skeleton name, fragtbl record count, start position, length
                skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
                fileptr += 1
        self.skeltbl = skeltbl
        if self.DEBUG:
            print "\nSkel Table:  %d entries" % len(self.skeltbl)
            print "table: filenum, skeleton name, frag tbl record count, start position, length"
            for j in xrange(len(self.skeltbl)):
                print self.skeltbl[j]

        # read/process the fragment index to create the fragment table
        fragtbl = []
        if self.fragidx != 0xffffffff:
            # disabled debugging aid: dump the raw fragment index sections
            # for i in xrange(3):
            #     fname = 'frag%04d.dat' % i
            #     data = self.sect.loadSection(self.fragidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
            for [text, tagMap] in outtbl:
                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
                ctocoffset = tagMap[2][0]
                ctocdata = ctoc_text[ctocoffset]
                # the entry text is the insert position written as a decimal number
                fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
        self.fragtbl = fragtbl
        if self.DEBUG:
            print "\nFragment Table: %d entries" % len(self.fragtbl)
            print "table: file position, link id text, file num, sequence number, start position, length"
            for j in xrange(len(self.fragtbl)):
                print self.fragtbl[j]

        # read / process guide index for guide elements of opf
        guidetbl = []
        if self.guideidx != 0xffffffff:
            # disabled debugging aid: dump the raw guide index sections
            # for i in xrange(3):
            #     fname = 'guide%04d.dat' % i
            #     data = self.sect.loadSection(self.guideidx + i)
            #     open(pathof(fname), 'wb').write(data)
            outtbl, ctoc_text = self.mi.getIndexData(self.guideidx, "KF8 Guide elements)")
            for [text, tagMap] in outtbl:
                # ref_type, ref_title, frag number
                ctocoffset = tagMap[1][0]
                ref_title = ctoc_text[ctocoffset]
                ref_type = text
                # fileno stays None when neither tag 3 nor tag 6 is present;
                # tag 6 overrides tag 3 when a record has both
                fileno = None
                if 3 in tagMap.keys():
                    fileno  = tagMap[3][0]
                if 6 in tagMap.keys():
                    fileno = tagMap[6][0]
                guidetbl.append([ref_type, ref_title, fileno])
        self.guidetbl = guidetbl
        if self.DEBUG:
            print "\nGuide Table: %d entries" % len(self.guidetbl)
            print "table: ref_type, ref_title, fragtbl entry number"
            for j in xrange(len(self.guidetbl)):
                print self.guidetbl[j]