Example #1
0
 def parseNCX(self):
     indx_data = []
     tag_fieldname_map = {
             1: ['pos',0],
             2: ['len',0],
             3: ['noffs',0],
             4: ['hlvl',0],
             5: ['koffs',0],
             6: ['pos_fid',0],
             21: ['parent',0],
             22: ['child1',0],
             23: ['childn',0]
     }
     if self.ncxidx != 0xffffffff:
         outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
         if DEBUG_NCX:
             print ctoc_text
             print outtbl
         num = 0
         for [text, tagMap] in outtbl:
             tmp = {
                     'name': text,
                     'pos':  -1,
                     'len':  0,
                     'noffs': -1,
                     'text' : "Unknown Text",
                     'hlvl' : -1,
                     'kind' : "Unknown Kind",
                     'pos_fid' : None,
                     'parent' : -1,
                     'child1' : -1,
                     'childn' : -1,
                     'num'  : num
                     }
             for tag in tag_fieldname_map.keys():
                 [fieldname, i] = tag_fieldname_map[tag]
                 if tag in tagMap:
                     fieldvalue = tagMap[tag][i]
                     if tag == 6:
                         pos_fid = toBase32(fieldvalue,4)
                         fieldvalue2 = tagMap[tag][i+1]
                         pos_off = toBase32(fieldvalue2,10)
                         fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
                     tmp[fieldname] = fieldvalue
                     if tag == 3:
                         toctext = ctoc_text.get(fieldvalue, 'Unknown Text')
                         if self.mh.codec != 'utf-8':
                             toctext = unicode(toctext, self.mh.codec).encode('utf-8')
                         tmp['text'] = toctext
                     if tag == 5:
                         kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind')
                         if self.mh.codec != 'utf-8':
                             kindtext = unicode(kindtext, self.mh.codec).encode('utf-8')
                         tmp['kind'] = kindtext
             indx_data.append(tmp)
             if DEBUG_NCX:
                 print "record number: ", num
                 print "name: ", tmp['name'],
                 print "position", tmp['pos']," length: ", tmp['len']
                 print "text: ", tmp['text']
                 print "kind: ", tmp['kind']
                 print "heading level: ", tmp['hlvl']
                 print "parent:", tmp['parent']
                 print "first child: ",tmp['child1']," last child: ", tmp['childn']
                 print "pos_fid is ", tmp['pos_fid']
                 print "\n\n"
             num += 1
     self.indx_data = indx_data
     return indx_data
Example #2
0
def processMobi8(mh,
                 metadata,
                 sect,
                 files,
                 rscnames,
                 pagemapproc,
                 k8resc,
                 obfuscate_data,
                 apnxfile=None,
                 epubver='2'):
    """Unpack the KF8 (Mobi8) part of a book into an epub-like structure.

    Steps, in order: fetch the raw markup, split it into parts with
    K8Processor, build the guide, the optional page map and the NCX data,
    convert the markup to XHTML, optionally add a cover page, write every
    part and file-backed flow below files.k8oebps, write the OPF and the
    NCX / NAV documents, and finally call files.makeEPUB().

    Args:
        mh: header object; supplies getRawML() and is handed to the
            K8Processor, PageMapProcessor, ncxExtract and OPFProcessor.
        metadata: mapping of book metadata; 'StartOffset' is consulted when
            the guide is empty and a copy is passed to the OPFProcessor.
        sect: passed through to K8Processor.
        files: output layout object; k8dir, k8oebps, getInputFileBasename()
            and makeEPUB() are used here.
        rscnames: passed to XHTMLK8Processor, CoverProcessor and OPFProcessor.
        pagemapproc: page map processor, or None when there is none.
        k8resc: RESC helper or None; when it has a spine a missing cover
            page is prepended to that spine.
        obfuscate_data: passed to files.makeEPUB(); its truth value is
            passed to writeOPF().
        apnxfile: path of an apnx file, read only when pagemapproc is None.
        epubver: passed to OPFProcessor.

    Returns:
        None; all results are written to disk through ``files``.
    """
    global DUMP
    global WRITE_RAW_DATA

    # extract raw markup language
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.k8dir,
                              files.getInputFileBasename() + '.rawml')
        with open(pathof(outraw), 'wb') as rawfile:
            rawfile.write(rawML)

    # KF8 require other indexes which contain parsing information and the FDST info
    # to process the rawml back into the xhtml files, css files, svg image files, etc
    k8proc = K8Processor(mh, sect, files, DUMP)
    k8proc.buildParts(rawML)

    # collect information for the guide first
    guidetext = k8proc.getGuideText()

    # if the guide was empty, add in any guide info from metadata, such as StartOffset
    if not guidetext and 'StartOffset' in metadata.keys():
        # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
        # Taking that into account, we only care about the *last* StartOffset, which
        # should always be the correct one in these cases (the one actually pointing
        # to the right place in the mobi8 part).
        last_start = int(metadata['StartOffset'][-1])
        if last_start == 0xffffffff:
            last_start = 0
        # only the sequence number is needed; the id text comes from the
        # lookup on the next line
        seq, _ignored = k8proc.getFragTblInfo(last_start)
        filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), '0000000000')
        linktgt = filename
        if idtext != '':
            linktgt += '#' + idtext
        guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt

    # if apnxfile is passed in use it for page map information
    if apnxfile is not None and pagemapproc is None:
        # b"..." is a plain str on Python 2, so the data is unchanged there
        with open(apnxfile, 'rb') as apnx:
            apnxdata = b"00000000" + apnx.read()
        pagemapproc = PageMapProcessor(mh, apnxdata)

    # generate the page map
    pagemapxml = ''
    if pagemapproc is not None:
        pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
        outpm = os.path.join(files.k8oebps, 'page-map.xml')
        with open(pathof(outpm), 'wb') as pmfile:
            pmfile.write(pagemapxml)
        if DUMP:
            print(pagemapproc.getNames())
            print(pagemapproc.getOffsets())
            print("\n\nPage Map")
            print(pagemapxml)

    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    print("Processing ncx / toc")
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    # extend the ncx data with filenames and proper internal idtags
    for ncxmap in ncx_data:
        # pos_fid is split into six ':' separated fields; only the 4th (fid)
        # and the 6th (off) are needed.
        # NOTE(review): this raises if an entry has no 'pos_fid' string --
        # confirm that every NCX entry carries one.
        _, _, _, fid, _, off = ncxmap['pos_fid'].split(':')
        filename, idtag = k8proc.getIDTagByPosFid(fid, off)
        ncxmap['filename'] = filename
        ncxmap['idtag'] = idtag

    # convert the rawML to a set of xhtml files
    print("Building an epub-like structure")
    htmlproc = XHTMLK8Processor(rscnames, k8proc)
    usedmap = htmlproc.buildXHTML()

    # write out the xhtml svg, and css files
    # fileinfo = [skelid|coverpage, dir, name]
    fileinfo = []
    # first create a cover page if none exists
    if CREATE_COVER_PAGE:
        cover = CoverProcessor(files, metadata, rscnames)
        cover_img = cover.getImageName()
        need_to_create_cover_page = False
        if cover_img is not None:
            if k8resc is None or not k8resc.hasSpine():
                # no spine information: look for the cover image in the first part
                part = k8proc.getPart(0)
                if part.find(cover_img) == -1:
                    need_to_create_cover_page = True
            else:
                if "coverpage" not in k8resc.spine_idrefs.keys():
                    part = k8proc.getPart(int(k8resc.spine_order[0]))
                    if part.find(cover_img) == -1:
                        k8resc.prepend_to_spine("coverpage", "inserted", "no",
                                                None)
                if k8resc.spine_order[0] == "coverpage":
                    need_to_create_cover_page = True
            if need_to_create_cover_page:
                filename = cover.getXHTMLName()
                fileinfo.append(["coverpage", 'Text', filename])
                guidetext += cover.guide_toxml()
                cover.writeXHTML()

    for i in range(k8proc.getNumberOfParts()):
        part = k8proc.getPart(i)
        skelnum, subdir, filename, _beg, _end, _aid = k8proc.getPartInfo(i)
        fileinfo.append([str(skelnum), subdir, filename])
        fname = os.path.join(files.k8oebps, subdir, filename)
        with open(pathof(fname), 'wb') as partfile:
            partfile.write(part)
    # flow 0 is skipped here; only the flows stored as files are written out
    for i in range(1, k8proc.getNumberOfFlows()):
        _flowtype, flowformat, subdir, filename = k8proc.getFlowInfo(i)
        flowpart = k8proc.getFlow(i)
        if flowformat == 'file':
            fileinfo.append([None, subdir, filename])
            fname = os.path.join(files.k8oebps, subdir, filename)
            with open(pathof(fname), 'wb') as flowfile:
                flowfile.write(flowpart)

    # create the opf
    opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh,
                       usedmap, pagemapxml, guidetext, k8resc, epubver)
    uuid = opf.writeOPF(bool(obfuscate_data))

    if opf.hasNCX():
        # Create a toc.ncx.
        ncx.writeK8NCX(ncx_data, metadata)
    if opf.hasNAV():
        # Create a navigation document.
        nav = NAVProcessor(files)
        nav.writeNAV(ncx_data, guidetext, metadata)

    # make an epub-like structure of it all
    print("Creating an epub-like file")
    files.makeEPUB(usedmap, obfuscate_data, uuid)
def processMobi8(mh, metadata, sect, files, imgnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'):
    """Unpack the KF8 (Mobi8) part of a book into an epub-like structure.

    Steps, in order: fetch the raw markup, split it into parts with
    K8Processor, build the guide, the optional page map and the NCX data,
    convert the markup to XHTML, optionally add a cover page, write every
    part and file-backed flow below files.k8oebps, write the OPF and the
    NCX / NAV documents, and finally call files.makeEPUB().

    Args:
        mh: header object; supplies getRawML() and is handed to the
            K8Processor, PageMapProcessor, ncxExtract and OPFProcessor.
        metadata: mapping of book metadata; 'StartOffset' is consulted when
            the guide is empty and a copy is passed to the OPFProcessor.
        sect: passed through to K8Processor.
        files: output layout object; k8dir, k8oebps, getInputFileBasename()
            and makeEPUB() are used here.
        imgnames: passed to XHTMLK8Processor, CoverProcessor and OPFProcessor.
        pagemapproc: page map processor, or None when there is none.
        k8resc: RESC helper or None; when it has a spine a missing cover
            page is prepended to that spine.
        obfuscate_data: passed to files.makeEPUB(); its truth value is
            passed to writeOPF().
        apnxfile: path of an apnx file, read only when pagemapproc is None.
        epubver: passed to OPFProcessor.

    Returns:
        None; all results are written to disk through ``files``.
    """
    global DUMP
    global WRITE_RAW_DATA

    # extract raw markup language
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.k8dir, files.getInputFileBasename() + '.rawml')
        with open(pathof(outraw), 'wb') as rawfile:
            rawfile.write(rawML)

    # KF8 require other indexes which contain parsing information and the FDST info
    # to process the rawml back into the xhtml files, css files, svg image files, etc
    k8proc = K8Processor(mh, sect, files, DUMP)
    k8proc.buildParts(rawML)

    # collect information for the guide first
    guidetext = k8proc.getGuideText()

    # if the guide was empty, add in any guide info from metadata, such as StartOffset
    if not guidetext and 'StartOffset' in metadata.keys():
        # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
        # Taking that into account, we only care about the *last* StartOffset, which
        # should always be the correct one in these cases (the one actually pointing
        # to the right place in the mobi8 part).
        last_start = int(metadata['StartOffset'][-1])
        if last_start == 0xffffffff:
            last_start = 0
        # only the sequence number is needed; the id text comes from the
        # lookup on the next line
        seq, _ignored = k8proc.getFragTblInfo(last_start)
        filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), '0000000000')
        linktgt = filename
        if idtext != '':
            linktgt += '#' + idtext
        guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt

    # if apnxfile is passed in use it for page map information
    if apnxfile is not None and pagemapproc is None:
        # b"..." is a plain str on Python 2, so the data is unchanged there
        with open(apnxfile, 'rb') as apnx:
            apnxdata = b"00000000" + apnx.read()
        pagemapproc = PageMapProcessor(mh, apnxdata)

    # generate the page map
    pagemapxml = ''
    if pagemapproc is not None:
        pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
        outpm = os.path.join(files.k8oebps, 'page-map.xml')
        with open(pathof(outpm), 'wb') as pmfile:
            pmfile.write(pagemapxml)
        if DUMP:
            print(pagemapproc.getNames())
            print(pagemapproc.getOffsets())
            print("\n\nPage Map")
            print(pagemapxml)

    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    print("Processing ncx / toc")
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    # extend the ncx data with filenames and proper internal idtags
    for ncxmap in ncx_data:
        # pos_fid is split into six ':' separated fields; only the 4th (fid)
        # and the 6th (off) are needed.
        # NOTE(review): this raises if an entry has no 'pos_fid' string --
        # confirm that every NCX entry carries one.
        _, _, _, fid, _, off = ncxmap['pos_fid'].split(':')
        filename, idtag = k8proc.getIDTagByPosFid(fid, off)
        ncxmap['filename'] = filename
        ncxmap['idtag'] = idtag

    # convert the rawML to a set of xhtml files
    print("Building an epub-like structure")
    htmlproc = XHTMLK8Processor(imgnames, k8proc)
    usedmap = htmlproc.buildXHTML()

    # write out the xhtml svg, and css files
    # fileinfo = [skelid|coverpage, dir, name]
    fileinfo = []
    # first create a cover page if none exists
    if CREATE_COVER_PAGE:
        cover = CoverProcessor(files, metadata, imgnames)
        cover_img = cover.getImageName()
        need_to_create_cover_page = False
        if cover_img is not None:
            if k8resc is None or not k8resc.hasSpine():
                # no spine information: look for the cover image in the first part
                part = k8proc.getPart(0)
                if part.find(cover_img) == -1:
                    need_to_create_cover_page = True
            else:
                if "coverpage" not in k8resc.spine_idrefs.keys():
                    part = k8proc.getPart(int(k8resc.spine_order[0]))
                    if part.find(cover_img) == -1:
                        k8resc.prepend_to_spine("coverpage", "inserted", "no", None)
                if k8resc.spine_order[0] == "coverpage":
                    need_to_create_cover_page = True
            if need_to_create_cover_page:
                filename = cover.getXHTMLName()
                fileinfo.append(["coverpage", 'Text', filename])
                guidetext += cover.guide_toxml()
                cover.writeXHTML()

    for i in range(k8proc.getNumberOfParts()):
        part = k8proc.getPart(i)
        skelnum, subdir, filename, _beg, _end, _aid = k8proc.getPartInfo(i)
        fileinfo.append([str(skelnum), subdir, filename])
        fname = os.path.join(files.k8oebps, subdir, filename)
        with open(pathof(fname), 'wb') as partfile:
            partfile.write(part)
    # flow 0 is skipped here; only the flows stored as files are written out
    for i in range(1, k8proc.getNumberOfFlows()):
        _flowtype, flowformat, subdir, filename = k8proc.getFlowInfo(i)
        flowpart = k8proc.getFlow(i)
        if flowformat == 'file':
            fileinfo.append([None, subdir, filename])
            fname = os.path.join(files.k8oebps, subdir, filename)
            with open(pathof(fname), 'wb') as flowfile:
                flowfile.write(flowpart)

    # create the opf
    opf = OPFProcessor(files, metadata.copy(), fileinfo, imgnames, True, mh, usedmap, pagemapxml, guidetext, k8resc, epubver)
    uuid = opf.writeOPF(bool(obfuscate_data))

    if opf.hasNCX():
        # Create a toc.ncx.
        ncx.writeK8NCX(ncx_data, metadata)
    if opf.hasNAV():
        # Create a navigation document.
        nav = NAVProcessor(files)
        nav.writeNAV(ncx_data, guidetext, metadata)

    # make an epub-like structure of it all
    print("Creating an epub-like file")
    files.makeEPUB(usedmap, obfuscate_data, uuid)