Esempio n. 1
0
    def getIDTagByPosFid(self, posfid, offset):
        """Resolve a kindle:pos:fid link into a (filename, id text) pair.

        posfid -- base32 text selecting a row of self.divtbl
        offset -- base32 text added to that row's insert position

        Returns the file name reported by getFileInfo() for the computed
        position together with whatever getIDTag() returns for it.
        """
        # decode both base32 fields before touching the table
        row_index = fromBase32(posfid)
        delta = fromBase32(offset)

        # only the insert position of the div table row is needed here; the
        # remaining fields are unpacked so the row shape is still enforced
        insertpos, _idtext, _filenum, _seqnm, _startpos, _length = self.divtbl[row_index]
        target = insertpos + delta

        # map the absolute position onto the file that contains it
        fname, _pn, _skelpos, _skelend = self.getFileInfo(target)

        # The original xhtml must already carry an "id=" for the link to have
        # worked; Amazon's additional "aid=" attributes are not usable as
        # anchors, so getIDTag() searches for the closest "id=" before target.
        return fname, self.getIDTag(target)
Esempio n. 2
0
    def getIDTagByPosFid(self, posfid, offset):
        """Translate kindle:pos:fid link data into (filename, id text).

        posfid -- base32 text giving the row number inside self.divtbl
        offset -- base32 text giving the distance from that row's insertpos

        The position is handed to getFileInfo() for the file name and to
        getIDTag() for the id text; both results are returned as a tuple.
        """
        # convert the base32 encoded link pieces into integers first
        table_row = fromBase32(posfid)
        shift = fromBase32(offset)

        # unpack the full row (keeps the 6-field shape check) but use only
        # the insert position
        (insertpos, _idtext, _filenum, _seqnm,
         _startpos, _length) = self.divtbl[table_row]
        filepos = insertpos + shift

        fname, _pn, _skelpos, _skelend = self.getFileInfo(filepos)

        # An "id=" must already exist in the original xhtml or the link would
        # never have worked.  Amazon's own "aid=" attributes only encode
        # position info, so look up the closest "id=" before filepos instead.
        idtext = self.getIDTag(filepos)
        return fname, idtext
Esempio n. 3
0
 def getIDTagByPosFid(self, posfid, offset):
     # first convert kindle:pos:fid and offset info to position in file
     row = fromBase32(posfid)
     off = fromBase32(offset)
     [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
     pos = insertpos + off
     fname, pn, skelpos, skelend = self.getFileInfo(pos)
     if fname is None:
         # pos does not exist
         # default to skeleton pos instead
         print "Link To Position", pos, "does not exist, retargeting to top of target"
         pos = self.skeltbl[filenum][3]
         fname, pn, skelpos, skelend = self.getFileInfo(pos)
     # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
     # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
     # some position information encoded into Base32 name.
     # so find the closest "id=" before position the file  by actually searching in that file
     idtext = self.getIDTag(pos)
     return fname, idtext
Esempio n. 4
0
 def getIDTagByPosFid(self, posfid, offset):
     # first convert kindle:pos:fid and offset info to position in file
     row = fromBase32(posfid)
     off = fromBase32(offset)
     [insertpos, idtext, filenum, seqnm, startpos,
      length] = self.fragtbl[row]
     pos = insertpos + off
     fname, pn, skelpos, skelend = self.getFileInfo(pos)
     if fname is None:
         # pos does not exist
         # default to skeleton pos instead
         print "Link To Position", pos, "does not exist, retargeting to top of target"
         pos = self.skeltbl[filenum][3]
         fname, pn, skelpos, skelend = self.getFileInfo(pos)
     # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
     # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
     # some position information encoded into Base32 name.
     # so find the closest "id=" before position the file  by actually searching in that file
     idtext = self.getIDTag(pos)
     return fname, idtext
Esempio n. 5
0
 def __init__(self, data, debug=False):
     """Split a raw RESC section into header and payload, then parse it.

     data  -- raw bytes of the RESC section
     debug -- stored on the instance as self._debug

     Everything before the first b'<' is kept as self.resc_header.  The
     declared payload size is the base32 value between the first b'=' and
     the following b'&' of that header (0 when no b'&' is found).
     """
     self._debug = debug

     # parser state and results, all reset to their empty defaults
     self.resc = None
     self.opos = 0
     self.extrameta = []
     self.cover_name = None
     self.spine_idrefs = {}
     self.spine_order = []
     self.spine_pageattributes = {}
     self.spine_ppd = None
     # need3 indicates the book has fields which require epub3, but
     # estimating the source epub version from the fields is difficult.
     self.need3 = False
     self.package_ver = None
     self.extra_metadata = []
     self.refines_metadata = []
     self.extra_attributes = []

     # header: everything in front of the first '<'
     body_start = data.find(b'<')
     header = data[:body_start]
     self.resc_header = header

     # declared size: base32 text between '=' and the next '&'
     size_from = header.find(b'=') + 1
     size_to = header.find(b'&', size_from)
     declared_size = fromBase32(header[size_from:size_to]) if size_to > 0 else 0

     # actual size: trust the declaration when it matches the bytes on hand,
     # otherwise stop at the first nul byte (most RESC sections end with a
     # nul string, but some do not) or take everything that is left
     available = len(data) - body_start
     if available == declared_size:
         actual_size = declared_size
     else:
         nul_at = data.find(b'\x00', body_start)
         actual_size = available if nul_at < 0 else nul_at - body_start
     self.resc_length = actual_size

     if actual_size != declared_size:
         print(
             "Warning: RESC section length({:d}bytes) does not match its size({:d}bytes)."
             .format(self.resc_length, declared_size))

     # now parse RESC after converting it from utf-8 to unicode
     payload = data[body_start:body_start + self.resc_length]
     self.resc = unicode_str(payload)
     self.parseData()
Esempio n. 6
0
 def __init__(self, data, debug=False):
     self._debug = debug
     self.resc = None
     self.opos = 0
     self.extrameta = []
     self.cover_name = None
     self.spine_idrefs = {}
     self.spine_order = []
     self.spine_pageattributes = {}
     self.spine_ppd = None
     # need3 indicate the book has fields which require epub3.
     # but the estimation of the source epub version from the fields is difficult.
     self.need3 = False
     self.package_ver = None
     self.extra_metadata = []
     self.refines_metadata = []
     self.extra_attributes = []
     # get header
     start_pos = data.find('<')
     self.resc_header = data[:start_pos]
     # get resc data length
     start = self.resc_header.find('=') + 1
     end = self.resc_header.find('&', start)
     resc_size = 0
     if end > 0:
         resc_size = fromBase32(self.resc_header[start:end])
     resc_rawbytes = len(data) - start_pos
     if resc_rawbytes == resc_size:
         self.resc_length = resc_size
     else:
         # Most RESC has a nul string at its tail but some do not.
         end_pos = data.find('\x00', start_pos)
         if end_pos < 0:
             self.resc_length = resc_rawbytes
         else:
             self.resc_length = end_pos - start_pos
     if self.resc_length != resc_size:
         print "Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size)
     # now handle RESC
     self.resc = data[start_pos:start_pos+self.resc_length]
     self.parseData()
Esempio n. 7
0
    def buildXHTML(self):
        """Rewrite the parts and flows held by self.k8proc into linkable XHTML.

        Works on copies of the parts and flows obtained from self.k8proc:
        kindle:pos:fid, kindle:flow and kindle:embed references are replaced
        by relative links (or by inlined flow text), aid= attributes are
        removed, data-AmznPageBreak attributes become page-break-after styles
        and a few svg / <li> cleanups are applied.  The results are stored
        back with self.k8proc.setFlows() and self.k8proc.setParts().

        Every resource or flow file name that ends up referenced is recorded
        in self.used, which is also the return value.
        """

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
        #       XXXX is the offset in records into divtbl
        #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

        # pos:fid pattern
        # NOTE(review): the class [0-9|A-V] used in the patterns of this method
        # also accepts a literal '|' character
        posfid_pattern = re.compile(r'''(<a.*?href=.*?>)''', re.IGNORECASE)
        posfid_index_pattern = re.compile(r'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

        parts = []
        print "Building proper xhtml for each file"
        for i in xrange(self.k8proc.getNumberOfParts()):
            part = self.k8proc.getPart(i)
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)

            # internal links
            # split() with a capturing group puts the matched tags at the odd indices
            srcpieces = posfid_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith('<'):
                    for m in posfid_index_pattern.finditer(tag):
                        posfid = m.group(1)
                        offset = m.group(2)
                        # filename is rebound here to the link target's file name
                        filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
                        if idtag == '':
                            replacement= '"' + filename + '"'
                        else:
                            replacement = '"' + filename + '#' + idtag + '"'
                        # replace only the first remaining match so each link gets its own target
                        tag = posfid_index_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            parts.append(part)

        # we are free to cut and paste as we see fit
        # we can safely remove all of the Kindlegen generated aid tags
        find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
        within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
        for i in xrange(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_aid_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith('<'):
                    for m in within_tag_aid_position_pattern.finditer(tag):
                        replacement = ''
                        tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            parts[i] = part

        # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
        # with page-break-after style patterns
        find_tag_with_AmznPageBreak_pattern = re.compile(r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
        within_tag_AmznPageBreak_position_pattern = re.compile(r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
        for i in xrange(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith('<'):
                    # the attribute value is carried over as the page-break-after value
                    srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
                        lambda m:' style="page-break-after:%s"'%m.group(1), tag)
            part = "".join(srcpieces)
            parts[i] = part

        # we have to handle substitutions for the flows  pieces first as they may
        # be inlined into the xhtml text
        #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        #   kindle:embed:XXXX   (used for fonts)

        # index 0 is a placeholder so that flows[n] lines up with flow number n
        flows = []
        flows.append(None)
        # NOTE(review): this local flowinfo list is never read again in this method
        flowinfo = []
        flowinfo.append([None, None, None, None])

        # regular expression search patterns
        # NOTE(review): [img\s|image\s] is a character class, so this matches any tag
        # whose first character is one of those listed; the startswith('<im') test
        # below is what actually narrows the candidates
        img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        # tag_pattern and flow_pattern are only referenced by the commented-out code
        # inside the loop below; both are compiled again before the parts are processed
        tag_pattern = re.compile(r'''(<[^>]*>)''')
        flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

        url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
        url_img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
        font_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
        url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

        for i in xrange(1, self.k8proc.getNumberOfFlows()):
            [type, format, dir, filename] = self.k8proc.getFlowInfo(i)
            flowpart = self.k8proc.getFlow(i)

            # links to raster image files from image tags
            # image_pattern
            srcpieces = img_pattern.split(flowpart)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith('<im'):
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        # embed numbers are 1-based, self.rscnames is 0-based
                        imageName = self.rscnames[imageNumber-1]
                        if imageName is not None:
                            replacement = '"../Images/' + imageName + '"'
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print "Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)
                    srcpieces[j] = tag
            flowpart = "".join(srcpieces)

            # replacements inside css url():
            srcpieces = url_pattern.split(flowpart)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]

                #  process links to raster image files
                for m in url_img_index_pattern.finditer(tag):
                    imageNumber = fromBase32(m.group(1))
                    imageName = self.rscnames[imageNumber-1]
                    # keep whatever opening / closing delimiter the match started and ended with
                    osep = m.group()[0]
                    csep = m.group()[-1]
                    if imageName is not None:
                        replacement = '%s%s%s'%(osep, '../Images/' + imageName, csep)
                        self.used[imageName] = 'used'
                        tag = url_img_index_pattern.sub(replacement, tag, 1)
                    else:
                        print "Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)

                # process links to fonts
                for m in font_index_pattern.finditer(tag):
                    fontNumber = fromBase32(m.group(1))
                    fontName = self.rscnames[fontNumber-1]
                    osep = m.group()[0]
                    csep = m.group()[-1]
                    if fontName is None:
                        print "Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag)
                    else:
                        replacement = '%s%s%s'%(osep, '../Fonts/' + fontName, csep)
                        tag = font_index_pattern.sub(replacement, tag, 1)
                        self.used[fontName] = 'used'

                # process links to other css pieces
                for m in url_css_index_pattern.finditer(tag):
                    num = fromBase32(m.group(1))
                    [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                    replacement = '"../' + pdir + '/' + fnm + '"'
                    tag = url_css_index_pattern.sub(replacement, tag, 1)
                    self.used[fnm] = 'used'

                srcpieces[j] = tag
            flowpart = "".join(srcpieces)

            # store away in our own copy
            flows.append(flowpart)

            # I do not think this case exists and even if it does exist, it needs to be done in a separate
            # pass to prevent inlining a flow piece into another flow piece before the inserted one or the
            # target one has been fully processed

            # but keep it around if it ends up we do need it

            # flow pattern not inside url()
            # srcpieces = tag_pattern.split(flowpart)
            # for j in range(1, len(srcpieces),2):
            #     tag = srcpieces[j]
            #     if tag.startswith('<'):
            #         for m in flow_pattern.finditer(tag):
            #             num = fromBase32(m.group(1))
            #             [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
            #             flowtext = self.k8proc.getFlow(num)
            #             if fmt == 'inline':
            #                 tag = flowtext
            #             else:
            #                 replacement = '"../' + pdir + '/' + fnm + '"'
            #                 tag = flow_pattern.sub(replacement, tag, 1)
            #                 self.used[fnm] = 'used'
            #         srcpieces[j] = tag
            # flowpart = "".join(srcpieces)

        # now handle the main text xhtml parts

        # Handle the flow items in the XHTML text pieces
        # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        tag_pattern = re.compile(r'''(<[^>]*>)''')
        flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
        for i in xrange(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
            # flow pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith('<'):
                    for m in flow_pattern.finditer(tag):
                        num = fromBase32(m.group(1))
                        if num > 0 and num < len(self.k8proc.flowinfo):
                            [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                            # use the already processed copy of the flow, not the raw one
                            flowpart = flows[num]
                            if fmt == 'inline':
                                # an inline flow replaces the whole referencing tag
                                tag = flowpart
                            else:
                                replacement = '"../' + pdir + '/' + fnm + '"'
                                tag = flow_pattern.sub(replacement, tag, 1)
                                self.used[fnm] = 'used'
                        else:
                            print "warning: ignoring non-existent flow link", tag, " value 0x%x" % num
                    srcpieces[j] = tag
            part = "".join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in style= attributes urls
        style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        for i in xrange(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # replace urls in style attributes
            srcpieces = style_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if 'kindle:embed' in tag:
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        imageName = self.rscnames[imageNumber-1]
                        osep = m.group()[0]
                        csep = m.group()[-1]
                        if imageName is not None:
                            replacement = '%s%s%s'%(osep, '../Images/' + imageName, csep)
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print "Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag)
                    srcpieces[j] = tag
            part = "".join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        # NOTE(review): same character-class caveat as for the flow img_pattern above
        img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')

        for i in xrange(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # links to raster image files
            # image_pattern
            srcpieces = img_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith('<im'):
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        imageName = self.rscnames[imageNumber-1]
                        if imageName is not None:
                            replacement = '"../Images/' + imageName + '"'
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print "Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            # store away modified version
            parts[i] = part

        # finally perform any general cleanups needed to make valid XHTML
        # these include:
        #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
        #   in svg tags replace "viewbox" attributes with "viewBox"
        #   in <li> remove value="XX" attributes since these are illegal
        tag_pattern = re.compile(r'''(<[^>]*>)''')
        li_value_pattern = re.compile(r'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)

        for i in xrange(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # tag pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith('<svg') or tag.startswith('<SVG'):
                    tag = tag.replace('preserveaspectratio','preserveAspectRatio')
                    tag = tag.replace('viewbox','viewBox')
                elif tag.startswith('<li ') or tag.startswith('<LI '):
                    # splitting on the value attribute and rejoining drops it from the tag
                    tagpieces = li_value_pattern.split(tag)
                    tag = "".join(tagpieces)
                srcpieces[j] = tag
            part = "".join(srcpieces)
            # store away modified version
            parts[i] = part

        # hand the rewritten flows and parts back to the K8 processor
        self.k8proc.setFlows(flows)
        self.k8proc.setParts(parts)

        return self.used
Esempio n. 8
0
    def buildParts(self, rawML):
        """Split rawML into flows and rebuild the xhtml parts from flow 0.

        rawML -- the raw markup; it is sliced at the offsets in self.fdsttbl

        Results are stored on the instance:
          self.flows    -- one entry per flow; entry 0 is replaced by ''
          self.parts    -- rebuilt xhtml text, one entry per self.skeltbl row
          self.partinfo -- [skelnum, 'Text', filename, skelpos, baseptr, aidtext]
                           for each part
          self.flowinfo -- a [None, None, None, None] placeholder followed by
                           [type, format, dir, fname] for every flow after
                           the first (appended to the existing list)

        Returns None.
        """
        # now split the rawML into its flow pieces
        self.flows = []
        for j in range(len(self.fdsttbl) - 1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j + 1]
            if end == 0xffffffff:
                # 0xffffffff marks "runs to the end of rawML"
                end = len(rawML)
                if self.DEBUG:
                    print("splitting rawml starting at %d and ending at %d into flow piece %d" % (start, end, j))
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and <div> tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print("\nRebuilding flow piece 0: the main body of the ebook")
        self.parts = []
        self.partinfo = []
        divptr = 0
        for [skelnum, skelname, divcnt, skelpos, skellen] in self.skeltbl:
            # the fragments belonging to a skeleton are stored right behind it
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            for i in range(divcnt):
                [insertpos, idtext, filenum, seqnum, startpos, length] = self.divtbl[divptr]
                if self.DEBUG:
                    print("    moving div/frag %d starting at %d of length %d" % (divptr, startpos, length))
                    print("        inside of skeleton number %d at postion %d" % (skelnum, insertpos))
                if i == 0:
                    # the first fragment of a skeleton names the output file
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.xhtml' % filenum
                fragment = text[baseptr:baseptr + length]
                # insertpos is absolute; make it relative to this skeleton
                insertpos = insertpos - skelpos
                skeleton = skeleton[:insertpos] + fragment + skeleton[insertpos:]
                baseptr += length
                divptr += 1
            # NOTE(review): a skeleton with divcnt == 0 reuses filename/aidtext of
            # the previous skeleton (NameError if it is the first one) - confirm
            # whether such skeletons can occur before choosing a default
            self.parts.append(skeleton)
            self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those raster
        # images are in manifest but not in xhtml text - since they are only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in range(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            svg_match = svg_tag_pattern.search(flowpart)
            if svg_match is not None:
                flow_type = 'svg'
                if image_tag_pattern.search(flowpart) is not None:
                    # svg that pulls in a raster image: must be inlined
                    flow_format = 'inline'
                    flow_dir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[svg_match.start():]
                else:
                    flow_format = 'file'
                    flow_dir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            elif flowpart.find('[CDATA[') >= 0:
                # CDATA snippet: wrap it in a style element and inline it
                flow_type = 'css'
                flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                flow_format = 'inline'
                flow_dir = None
                fname = None
            else:
                # css - assume as standalone css file
                flow_type = 'css'
                flow_format = 'file'
                flow_dir = "Styles"
                fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([flow_type, flow_format, flow_dir, fname])

        if self.DEBUG:
            print("\nFlow Map:  %d entries" % len(self.flowinfo))
            for fi in self.flowinfo:
                print(fi)
            print("\n")

            print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
            for pi in self.partinfo:
                print(pi)
Esempio n. 9
0
    def buildParts(self, rawML):
        # now split the rawML into its flow pieces
        self.flows = []
        for j in xrange(0, len(self.fdsttbl) - 1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j + 1]
            if end == 0xffffffff:
                end = len(rawML)
                if self.DEBUG:
                    print "splitting rawml starting at %d and ending at %d into flow piece %d" % (
                        start, end, j)
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and <div> tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print "\nRebuilding flow piece 0: the main body of the ebook"
        self.parts = []
        self.partinfo = []
        divptr = 0
        baseptr = 0
        for [skelnum, skelname, divcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            for i in range(divcnt):
                [insertpos, idtext, filenum, seqnum, startpos,
                 length] = self.divtbl[divptr]
                if self.DEBUG:
                    print "    moving div/frag %d starting at %d of length %d" % (
                        divptr, startpos, length)
                    print "        inside of skeleton number %d at postion %d" % (
                        skelnum, insertpos)
                if i == 0:
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.xhtml' % filenum
                slice = text[baseptr:baseptr + length]
                insertpos = insertpos - skelpos
                skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
                baseptr = baseptr + length
                divptr += 1
            self.parts.append(skeleton)
            self.partinfo.append(
                [skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those  raster
        # images are in manifest but not in xhtml text - since they only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in xrange(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = re.search(svg_tag_pattern, flowpart)
            if m != None:
                # svg
                type = 'svg'
                start = m.start()
                m2 = re.search(image_tag_pattern, flowpart)
                if m2 != None:
                    format = 'inline'
                    dir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    format = 'file'
                    dir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                if flowpart.find('[CDATA[') >= 0:
                    type = 'css'
                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    type = 'css'
                    format = 'file'
                    dir = "Styles"
                    fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([type, format, dir, fname])

        if self.DEBUG:
            print "\nFlow Map:  %d entries" % len(self.flowinfo)
            for fi in self.flowinfo:
                print fi
            print "\n"

            print "\nXHTML File Part Position Information: %d entries" % len(
                self.partinfo)
            for pi in self.partinfo:
                print pi

        if False:  # self.DEBUG:
            # dump all of the locations of the aid tags used in TEXT
            # find id links only inside of tags
            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
            #    [^>]* means match any amount of chars except for  '>' char
            #    [^'"] match any amount of chars except for the quote character
            #    \s* means match any amount of whitespace
            print "\npositions of all aid= pieces"
            id_pattern = re.compile(
                r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
            for m in re.finditer(id_pattern, rawML):
                print "%0x %s %0x" % (m.start(), m.group(1),
                                      fromBase32(m.group(1)))
                [filename, partnum, start, end] = self.getFileInfo(m.start())
                print "   in  %d %0x %0x" % (partnum, start, end)

        return
Esempio n. 10
0
    def buildParts(self, rawML):
        """Split rawML into its flows and rebuild the xhtml parts of the book.

        The first flow is the main text.  It is cut up along the skeleton
        table and every fragment listed in the fragment table is spliced back
        into its skeleton.  The remaining flows are classified as css or svg
        and as either standalone files or pieces that have to be inlined.

        Reads self.fdsttbl, self.skeltbl, self.fragtbl and self.DEBUG.
        Sets self.flows, self.parts and self.partinfo, appends to
        self.flowinfo (which must already exist) and corrects in place any
        fragment table insert position that is found to be corrupt.

        rawML -- the complete raw markup, sliced with the offsets in self.fdsttbl
        """
        # now split the rawML into its flow pieces
        self.flows = []
        for j in range(len(self.fdsttbl) - 1):
            self.flows.append(rawML[self.fdsttbl[j]:self.fdsttbl[j + 1]])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and fragment tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print("\nRebuilding flow piece 0: the main body of the ebook")
        self.parts = []
        self.partinfo = []
        fragptr = 0
        baseptr = 0
        for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
            # the fragments of a skeleton are stored right behind it in text
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            # NOTE(review): a skeleton with fragcnt == 0 never assigns filename
            # or aidtext, so the values of the previous skeleton are recorded
            # for it (or a NameError is raised if it is the first one)
            for i in range(fragcnt):
                [insertpos, idtext, filenum, seqnum, startpos,
                 length] = self.fragtbl[fragptr]
                aidtext = idtext[12:-2]
                if i == 0:
                    filename = 'part%04d.xhtml' % filenum
                fragment = text[baseptr:baseptr + length]
                # insertpos is relative to the whole text, make it relative
                # to the start of this skeleton
                insertpos = insertpos - skelpos
                head = skeleton[:insertpos]
                tail = skeleton[insertpos:]
                actual_inspos = insertpos
                if (tail.find(b'>') < tail.find(b'<')
                        or head.rfind(b'>') < head.rfind(b'<')):
                    # There is an incomplete tag in either the head or tail.
                    # This can happen for some badly formed KF8 files
                    print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname)
                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
                    if bp != ep:
                        actual_inspos = ep + 1 + startpos
                if insertpos != actual_inspos:
                    print("fixed corrupt fragment table insert position %s %s" % (
                        insertpos + skelpos, actual_inspos + skelpos))
                    insertpos = actual_inspos
                    # store the corrected position so later lookups in the
                    # fragment table agree with the rebuilt text
                    self.fragtbl[fragptr][0] = actual_inspos + skelpos
                skeleton = skeleton[0:insertpos] + fragment + skeleton[insertpos:]
                baseptr = baseptr + length
                fragptr += 1
            self.parts.append(skeleton)
            self.partinfo.append(
                [skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        if self.DEBUG:
            # the assembled text is only needed for this dump, so it is only
            # built here, and the file is closed as soon as it is written
            outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
            with open(pathof(outassembled), 'wb') as dumpfile:
                dumpfile.write("".join(self.parts))

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those raster
        # images are in the manifest but not in the xhtml text - since they are
        # only referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in range(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            svg_match = svg_tag_pattern.search(flowpart)
            if svg_match is not None:
                flowtype = 'svg'
                if image_tag_pattern.search(flowpart) is not None:
                    # svg that pulls in a raster image: must be inlined
                    flowformat = 'inline'
                    flowdir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[svg_match.start():]
                else:
                    flowformat = 'file'
                    flowdir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            elif flowpart.find('[CDATA[') >= 0:
                # a CDATA snippet is inlined as a style element
                flowtype = 'css'
                flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                flowformat = 'inline'
                flowdir = None
                fname = None
            else:
                # css - assume as standalone css file
                flowtype = 'css'
                flowformat = 'file'
                flowdir = "Styles"
                fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([flowtype, flowformat, flowdir, fname])

        if self.DEBUG:
            print("\nFlow Map:  %d entries" % len(self.flowinfo))
            for fi in self.flowinfo:
                print(fi)
            print("\n")

            print("\nXHTML File Part Position Information: %d entries" % len(
                self.partinfo))
            for pi in self.partinfo:
                print(pi)

        if False:  # self.Debug:
            # deliberately disabled debugging aid, kept for manual use:
            # dump all of the locations of the aid tags used in TEXT
            # find id links only inside of tags
            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
            #    [^>]* means match any amount of chars except for  '>' char
            #    [^'"] match any amount of chars except for the quote character
            #    \s* means match any amount of whitespace
            print("\npositions of all aid= pieces")
            id_pattern = re.compile(
                r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
            for m in id_pattern.finditer(rawML):
                [filename, partnum, start, end] = self.getFileInfo(m.start())
                [seqnum, idtext] = self.getFragTblInfo(m.start())
                value = fromBase32(m.group(1))
                print("  aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (
                    m.group(1), value, m.start(), partnum, start, end))
                print("       %s  fragtbl entry %d" % (idtext, seqnum))
Esempio n. 11
0
    def buildXHTML(self):
        """Resolve every kindle: link and clean the parts up into valid xhtml.

        The passes run in this order, each one over all pieces:
          1. kindle:pos:fid links in the xhtml parts become "filename#id"
             links (this has to happen before any text is moved around)
          2. Kindlegen's aid= attributes are dropped and data-AmznPageBreak
             attributes are turned into page-break-after styles
          3. kindle:embed / kindle:flow references inside the flows are
             replaced by relative paths
          4. kindle:flow links in the parts are replaced by a relative path
             or, for inline flows, by the contents of the flow
          5. kindle:embed image links in style attributes and in img/image
             tags of the parts are replaced by relative paths
          6. svg attribute case and illegal <li value=...> are repaired

        Every reference is replaced at its own position through a callback,
        so a reference that can not be resolved is left exactly as it was and
        the ones following it in the same tag are still replaced correctly.

        The results are handed to self.k8proc with setFlows() and setParts().
        Returns self.used, in which every image, font and flow file name that
        was referenced has been marked 'used'.
        """
        k8proc = self.k8proc

        def apply_to_tags(splitter, text, fix_tag):
            # splitter has exactly one capturing group, so split() leaves the
            # matched tags at the odd indices and the text between them at
            # the even ones; only the matched tags are ever rewritten
            pieces = splitter.split(text)
            for j in range(1, len(pieces), 2):
                pieces[j] = fix_tag(pieces[j])
            return "".join(pieces)

        def embed_replacer(folder, errfmt, keep_delims):
            # build a re.sub callback turning one kindle:embed reference into
            # a path below ../folder; keep_delims reuses the first and last
            # character of the match instead of forcing double quotes
            def replace(m):
                number = fromBase32(m.group(1))
                name = self.imgnames[number - 1]
                if name is None:
                    print(errfmt % (number, m.string))
                    return m.group(0)
                self.used[name] = 'used'
                if keep_delims:
                    ref = m.group(0)
                    return ref[0] + '../' + folder + '/' + name + ref[-1]
                return '"../' + folder + '/' + name + '"'
            return replace

        image_error = "Error: Referenced image %s was not recognized as a valid image in %s"
        font_error = "Error: Referenced font %s was not recognized as a valid font in %s"
        style_error = "Error: Referenced image %s in style url was not recognized in %s"

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
        #       XXXX is the offset in records into divtbl
        #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

        # pos:fid pattern
        posfid_pattern = re.compile(r'''(<a.*?href=.*?>)''', re.IGNORECASE)
        posfid_index_pattern = re.compile(r'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

        def internal_link(m):
            fname, idtag = k8proc.getIDTagByPosFid(m.group(1), m.group(2))
            if idtag == '':
                return '"' + fname + '"'
            return '"' + fname + '#' + idtag + '"'

        def fix_internal_links(tag):
            return posfid_index_pattern.sub(internal_link, tag)

        parts = []
        print("Building proper xhtml for each file")
        for i in range(k8proc.getNumberOfParts()):
            parts.append(apply_to_tags(posfid_pattern, k8proc.getPart(i), fix_internal_links))

        # we are free to cut and paste as we see fit
        # we can safely remove all of the Kindlegen generated aid tags
        find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
        within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')

        def strip_aid(tag):
            return within_tag_aid_position_pattern.sub('', tag)

        parts = [apply_to_tags(find_tag_with_aid_pattern, part, strip_aid) for part in parts]

        # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
        # with page-break-after style patterns
        find_tag_with_AmznPageBreak_pattern = re.compile(r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
        within_tag_AmznPageBreak_position_pattern = re.compile(r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

        def pagebreak_to_style(tag):
            return within_tag_AmznPageBreak_position_pattern.sub(
                lambda m: ' style="page-break-after:%s"' % m.group(1), tag)

        parts = [apply_to_tags(find_tag_with_AmznPageBreak_pattern, part, pagebreak_to_style)
                 for part in parts]

        # we have to handle substitutions for the flows pieces first as they may
        # be inlined into the xhtml text
        #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        #   kindle:embed:XXXX   (used for fonts)

        # flow 0 is the main text, which is kept in parts, so it has no entry
        flows = [None]

        # regular expression search patterns
        # (img|image has to be an alternation; written as a character class
        # it matched almost every tag, including <a ...> and <em>)
        img_pattern = re.compile(r'''(<(?:img|image)\s[^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
        # the filler after the mime type must not run over a closing quote,
        # otherwise url("...") loses that quote when the delimiters are reused
        url_img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^'"\)]*["')]''', re.IGNORECASE)
        font_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
        # an opening quote is part of the match because the replacement
        # supplies its own pair of quotes
        url_css_index_pattern = re.compile(r'''['"]?kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

        flow_image = embed_replacer('Images', image_error, False)
        url_image = embed_replacer('Images', image_error, True)
        url_font = embed_replacer('Fonts', font_error, True)

        def css_link(m):
            num = fromBase32(m.group(1))
            [typ, fmt, pdir, fnm] = k8proc.getFlowInfo(num)
            replacement = '"../' + pdir + '/' + fnm + '"'
            self.used[fnm] = 'used'
            return replacement

        def fix_flow_image_tag(tag):
            return img_index_pattern.sub(flow_image, tag)

        def fix_css_url(tag):
            # raster images first, then fonts, then links to other css pieces
            tag = url_img_index_pattern.sub(url_image, tag)
            tag = font_index_pattern.sub(url_font, tag)
            return url_css_index_pattern.sub(css_link, tag)

        for i in range(1, k8proc.getNumberOfFlows()):
            flowpart = k8proc.getFlow(i)
            # links to raster image files from image tags
            flowpart = apply_to_tags(img_pattern, flowpart, fix_flow_image_tag)
            # replacements inside css url():
            flowpart = apply_to_tags(url_pattern, flowpart, fix_css_url)
            # store away in our own copy
            flows.append(flowpart)

            # kindle:flow links that sit in a tag of a flow (outside url())
            # are deliberately not resolved here.  That case is not known to
            # exist, and if it does it needs a separate pass so that no flow
            # is inlined into another one before both are fully processed.

        # now handle the main text xhtml parts

        # Handle the flow items in the XHTML text pieces
        # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        tag_pattern = re.compile(r'''(<[^>]*>)''')
        flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

        def fix_flow_links(tag):
            inlined = []

            def flow_link(m):
                num = fromBase32(m.group(1))
                if num > 0 and num < len(k8proc.flowinfo):
                    [typ, fmt, pdir, fnm] = k8proc.getFlowInfo(num)
                    if fmt == 'inline':
                        inlined.append(flows[num])
                        return m.group(0)
                    replacement = '"../' + pdir + '/' + fnm + '"'
                    self.used[fnm] = 'used'
                    return replacement
                print("warning: ignoring non-existent flow link %s  value 0x%x" % (m.string, num))
                return m.group(0)

            tag = flow_pattern.sub(flow_link, tag)
            if inlined:
                # an inline flow takes the place of the whole tag that
                # referenced it (the last one wins if there are several)
                return inlined[-1]
            return tag

        parts = [apply_to_tags(tag_pattern, part, fix_flow_links) for part in parts]

        # Handle any embedded raster images links in style= attributes urls
        style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
        # the filler stops at ')' as well, so that an unquoted url(...) keeps
        # its closing parenthesis
        style_img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^'"\)]*['")]''', re.IGNORECASE)
        style_image = embed_replacer('Images', style_error, True)

        def fix_style_urls(tag):
            return style_img_index_pattern.sub(style_image, tag)

        parts = [apply_to_tags(style_pattern, part, fix_style_urls) for part in parts]

        # Handle any embedded raster images links in the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        part_img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
        part_image = embed_replacer('Images', image_error, False)

        def fix_part_image_tag(tag):
            return part_img_index_pattern.sub(part_image, tag)

        parts = [apply_to_tags(img_pattern, part, fix_part_image_tag) for part in parts]

        # finally perform any general cleanups needed to make valid XHTML
        # these include:
        #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
        #   in svg tags replace "viewbox" attributes with "viewBox"
        #   in <li> remove value="XX" attributes since these are illegal
        li_value_pattern = re.compile(r'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)

        def tidy_tag(tag):
            if tag.startswith(('<svg', '<SVG')):
                tag = tag.replace('preserveaspectratio', 'preserveAspectRatio')
                tag = tag.replace('viewbox', 'viewBox')
            elif tag.startswith(('<li ', '<LI ')):
                tag = li_value_pattern.sub('', tag)
            return tag

        parts = [apply_to_tags(tag_pattern, part, tidy_tag) for part in parts]

        k8proc.setFlows(flows)
        k8proc.setParts(parts)

        return self.used
Esempio n. 12
0
    def buildXHTML(self):

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
        #       XXXX is the offset in records into divtbl
        #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

        # pos:fid pattern
        posfid_pattern = re.compile(r"""(<a.*?href=.*?>)""", re.IGNORECASE)
        posfid_index_pattern = re.compile(r"""['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']""")

        parts = []
        print "Building proper xhtml for each file"
        for i in xrange(self.k8proc.getNumberOfParts()):
            part = self.k8proc.getPart(i)
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)

            # internal links
            srcpieces = re.split(posfid_pattern, part)
            for j in range(1, len(srcpieces), 2):
                tag = srcpieces[j]
                if tag.startswith("<"):
                    for m in re.finditer(posfid_index_pattern, tag):
                        posfid = m.group(1)
                        offset = m.group(2)
                        filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
                        if idtag == "":
                            replacement = '"' + filename + '"'
                        else:
                            replacement = '"' + filename + "#" + idtag + '"'
                        tag = re.sub(posfid_index_pattern, replacement, tag, 1)
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            parts.append(part)

        # we are free to cut and paste as we see fit
        # we can safely remove all of the Kindlegen generated aid tags
        find_tag_with_aid_pattern = re.compile(r"""(<[^>]*\said\s*=[^>]*>)""", re.IGNORECASE)
        within_tag_aid_position_pattern = re.compile(r"""\said\s*=['"][^'"]*['"]""")
        for i in xrange(len(parts)):
            part = parts[i]
            srcpieces = re.split(find_tag_with_aid_pattern, part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith("<"):
                    for m in re.finditer(within_tag_aid_position_pattern, tag):
                        replacement = ""
                        tag = re.sub(within_tag_aid_position_pattern, replacement, tag, 1)
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            parts[i] = part

        # we can safely remove all of the Kindlegen generated data-AmznPageBreak tags
        find_tag_with_AmznPageBreak_pattern = re.compile(r"""(<[^>]*\sdata-AmznPageBreak=[^>]*>)""", re.IGNORECASE)
        within_tag_AmznPageBreak_position_pattern = re.compile(r"""\sdata-AmznPageBreak=['"][^'"]*['"]""")
        for i in xrange(len(parts)):
            part = parts[i]
            srcpieces = re.split(find_tag_with_AmznPageBreak_pattern, part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith("<"):
                    for m in re.finditer(within_tag_AmznPageBreak_position_pattern, tag):
                        replacement = ""
                        tag = re.sub(within_tag_AmznPageBreak_position_pattern, replacement, tag, 1)
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            parts[i] = part

        # we have to handle substitutions for the flows  pieces first as they may
        # be inlined into the xhtml text
        #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        #   kindle:embed:XXXX   (used for fonts)

        flows = []
        flows.append(None)
        flowinfo = []
        flowinfo.append([None, None, None, None])

        # regular expression search patterns
        img_pattern = re.compile(r"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
        img_index_pattern = re.compile(r"""['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]""", re.IGNORECASE)

        tag_pattern = re.compile(r"""(<[^>]*>)""")
        flow_pattern = re.compile(r"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE)

        url_pattern = re.compile(r"""(url\(.*?\))""", re.IGNORECASE)
        url_img_index_pattern = re.compile(r"""kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*""", re.IGNORECASE)
        font_index_pattern = re.compile(r"""kindle:embed:([0-9|A-V]+)""", re.IGNORECASE)
        url_css_index_pattern = re.compile(r"""kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*""", re.IGNORECASE)

        for i in xrange(1, self.k8proc.getNumberOfFlows()):
            [type, format, dir, filename] = self.k8proc.getFlowInfo(i)
            flowpart = self.k8proc.getFlow(i)

            # links to raster image files from image tags
            # image_pattern
            srcpieces = re.split(img_pattern, flowpart)
            for j in range(1, len(srcpieces), 2):
                tag = srcpieces[j]
                if tag.startswith("<im"):
                    for m in re.finditer(img_index_pattern, tag):
                        imageNumber = fromBase32(m.group(1))
                        imageName = self.imgnames[imageNumber - 1]
                        if imageName != None:
                            replacement = '"../Images/' + imageName + '"'
                            self.used[imageName] = "used"
                            tag = re.sub(img_index_pattern, replacement, tag, 1)
                        else:
                            print "Error: Referenced image %s was not recognized as a valid image in %s" % (
                                imageNumber,
                                tag,
                            )
                    srcpieces[j] = tag
            flowpart = "".join(srcpieces)

            # replacements inside css url():
            srcpieces = re.split(url_pattern, flowpart)
            for j in range(1, len(srcpieces), 2):
                tag = srcpieces[j]

                #  process links to raster image files
                for m in re.finditer(url_img_index_pattern, tag):
                    imageNumber = fromBase32(m.group(1))
                    imageName = self.imgnames[imageNumber - 1]
                    if imageName != None:
                        replacement = '"../Images/' + imageName + '"'
                        self.used[imageName] = "used"
                        tag = re.sub(url_img_index_pattern, replacement, tag, 1)
                    else:
                        print "Error: Referenced image %s was not recognized as a valid image in %s" % (
                            imageNumber,
                            tag,
                        )
                # process links to fonts
                for m in re.finditer(font_index_pattern, tag):
                    fontNumber = fromBase32(m.group(1))
                    fontName = self.imgnames[fontNumber - 1]
                    if fontName is None:
                        print "Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag)
                    else:
                        replacement = '"../Fonts/' + fontName + '"'
                        tag = re.sub(font_index_pattern, replacement, tag, 1)
                        self.used[fontName] = "used"

                # process links to other css pieces
                for m in re.finditer(url_css_index_pattern, tag):
                    num = fromBase32(m.group(1))
                    [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                    flowtext = self.k8proc.getFlow(num)
                    replacement = '"../' + pdir + "/" + fnm + '"'
                    tag = re.sub(url_css_index_pattern, replacement, tag, 1)
                    self.used[fnm] = "used"

                srcpieces[j] = tag
            flowpart = "".join(srcpieces)

            # flow pattern not inside url()
            srcpieces = re.split(tag_pattern, flowpart)
            for j in range(1, len(srcpieces), 2):
                tag = srcpieces[j]
                if tag.startswith("<"):
                    for m in re.finditer(flow_pattern, tag):
                        num = fromBase32(m.group(1))
                        [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                        flowtext = self.k8proc.getFlow(num)
                        if fmt == "inline":
                            tag = flowtext
                        else:
                            replacement = '"../' + pdir + "/" + fnm + '"'
                            tag = re.sub(flow_pattern, replacement, tag, 1)
                            self.used[fnm] = "used"
                    srcpieces[j] = tag
            flowpart = "".join(srcpieces)

            # store away in our own copy
            flows.append(flowpart)

        # now handle the main text xhtml parts

        # Handle the flow items in the XHTML text pieces
        # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        tag_pattern = re.compile(r"""(<[^>]*>)""")
        flow_pattern = re.compile(r"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE)
        for i in xrange(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # flow pattern
            srcpieces = re.split(tag_pattern, part)
            for j in range(1, len(srcpieces), 2):
                tag = srcpieces[j]
                if tag.startswith("<"):
                    for m in re.finditer(flow_pattern, tag):
                        num = fromBase32(m.group(1))
                        [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                        flowpart = self.k8proc.getFlow(num)
                        if fmt == "inline":
                            tag = flowpart
                        else:
                            replacement = '"../' + pdir + "/" + fnm + '"'
                            tag = re.sub(flow_pattern, replacement, tag, 1)
                            self.used[fnm] = "used"
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        img_pattern = re.compile(r"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
        img_index_pattern = re.compile(r"""['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]""")
        for i in xrange(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # links to raster image files
            # image_pattern
            srcpieces = re.split(img_pattern, part)
            for j in range(1, len(srcpieces), 2):
                tag = srcpieces[j]
                if tag.startswith("<im"):
                    for m in re.finditer(img_index_pattern, tag):
                        imageNumber = fromBase32(m.group(1))
                        imageName = self.imgnames[imageNumber - 1]
                        if imageName != None:
                            replacement = '"../Images/' + imageName + '"'
                            self.used[imageName] = "used"
                            tag = re.sub(img_index_pattern, replacement, tag, 1)
                        else:
                            print "Error: Referenced image %s was not recognized as a valid image in %s" % (
                                imageNumber,
                                tag,
                            )
                    srcpieces[j] = tag
            part = "".join(srcpieces)
            # store away modified version
            parts[i] = part

        # finally perform any general cleanups needed to make valid XHTML
        # these include:
        #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
        #   in svg tags replace "viewbox" attributes with "viewBox"
        #   in <li> remove value="XX" attributes since these are illegal
        tag_pattern = re.compile(r"""(<[^>]*>)""")
        li_value_pattern = re.compile(r"""\svalue\s*=\s*['"][^'"]*['"]""", re.IGNORECASE)
        for i in xrange(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # tag pattern
            srcpieces = re.split(tag_pattern, part)
            for j in range(1, len(srcpieces), 2):
                tag = srcpieces[j]
                if tag.startswith("<svg") or tag.startswith("<SVG"):
                    tag = tag.replace("preserveaspectratio", "preserveAspectRatio")
                    tag = tag.replace("viewbox", "viewBox")
                elif tag.startswith("<li ") or tag.startswith("<LI "):
                    tagpieces = re.split(li_value_pattern, tag)
                    tag = "".join(tagpieces)
                srcpieces[j] = tag
            part = "".join(srcpieces)
            # store away modified version
            parts[i] = part

        self.k8proc.setFlows(flows)
        self.k8proc.setParts(parts)

        return self.used
Esempio n. 13
0
    def buildParts(self, rawML):
        # now split the rawML into its flow pieces
        self.flows = []
        for j in xrange(0, len(self.fdsttbl)-1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j+1]
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = ''

        # walk the <skeleton> and fragment tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print "\nRebuilding flow piece 0: the main body of the ebook"
        self.parts = []
        self.partinfo = []
        fragptr = 0
        baseptr = 0
        cnt = 0
        for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos: baseptr]
            for i in range(fragcnt):
                [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
                aidtext = idtext[12:-2]
                if i == 0:
                    filename = 'part%04d.xhtml' % filenum
                slice = text[baseptr: baseptr + length]
                insertpos = insertpos - skelpos
                head = skeleton[:insertpos]
                tail = skeleton[insertpos:]
                actual_inspos = insertpos
                if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
                    # There is an incomplete tag in either the head or tail.
                    # This can happen for some badly formed KF8 files
                    print 'The fragment table for %s has incorrect insert position. Calculating manually.' % skelname
                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
                    if bp != ep:
                        actual_inspos = ep + 1 + startpos
                if insertpos != actual_inspos:
                    print "fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos
                    insertpos = actual_inspos
                    self.fragtbl[fragptr][0] = actual_inspos + skelpos
                skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
                baseptr = baseptr + length
                fragptr += 1
            cnt += 1
            self.parts.append(skeleton)
            self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        assembled_text = "".join(self.parts)
        if self.DEBUG:
            outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
            open(pathof(outassembled),'wb').write(assembled_text)

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those  raster
        # images are in manifest but not in xhtml text - since they only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
        for j in xrange(1,len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = re.search(svg_tag_pattern, flowpart)
            if m is not None:
                # svg
                type = 'svg'
                start = m.start()
                m2 = re.search(image_tag_pattern, flowpart)
                if m2 is not None:
                    format = 'inline'
                    dir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    format = 'file'
                    dir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                if flowpart.find('[CDATA[') >= 0:
                    type = 'css'
                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    type = 'css'
                    format = 'file'
                    dir = "Styles"
                    fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([type, format, dir, fname])


        if self.DEBUG:
            print "\nFlow Map:  %d entries" % len(self.flowinfo)
            for fi in self.flowinfo:
                print fi
            print "\n"

            print "\nXHTML File Part Position Information: %d entries" % len(self.partinfo)
            for pi in self.partinfo:
                print pi

        if False: #self.Debug:
            # dump all of the locations of the aid tags used in TEXT
            # find id links only inside of tags
            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
            #    [^>]* means match any amount of chars except for  '>' char
            #    [^'"] match any amount of chars except for the quote character
            #    \s* means match any amount of whitespace
            print "\npositions of all aid= pieces"
            id_pattern = re.compile(r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE)
            for m in re.finditer(id_pattern, rawML):
                [filename, partnum, start, end] = self.getFileInfo(m.start())
                [seqnum, idtext] = self.getFragTblInfo(m.start())
                value = fromBase32(m.group(1))
                print "  aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end)
                print "       %s  fragtbl entry %d" % (idtext, seqnum)

        return