def processPrintReplica(metadata, files, rscnames, mh):
    """Unpack a Print Replica (.azw4) book: extract the embedded PDF(s) and
    any auxiliary binary sections, then write an OPF describing the result.

    metadata -- EXTH metadata dict (mutated only by OPFProcessor downstream)
    files    -- project file-layout object (provides outdir and name helpers)
    rscnames -- list of resource names; non-None entries are marked 'used'
    mh       -- mobi header object; mh.getRawML() yields the raw container
    """
    global DUMP
    global WRITE_RAW_DATA
    rawML = mh.getRawML()
    # Optionally dump the raw Print Replica container for debugging.
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.outdir,files.getInputFileBasename() + '.rawpr')
        with open(pathof(outraw),'wb') as f:
            f.write(rawML)
    fileinfo = []
    print("Print Replica ebook detected")
    try:
        # Container layout: u32 table count at offset 4, then per-table
        # section counts, then (offset, length) pairs for every section.
        numTables, = struct.unpack_from(b'>L', rawML, 0x04)
        tableIndexOffset = 8 + 4*numTables
        # for each table, read in count of sections, assume first section is a PDF
        # and output other sections as binary files
        for i in range(numTables):
            sectionCount, = struct.unpack_from(b'>L', rawML, 0x08 + 4*i)
            for j in range(sectionCount):
                sectionOffset, sectionLength, = struct.unpack_from(b'>LL', rawML, tableIndexOffset)
                tableIndexOffset += 8
                pdf_fpath = u''
                if j == 0:
                    # First section of each table is assumed to be a PDF.
                    # NOTE(review): azw2zip_cfg appears to be a module-level
                    # config object (not visible here) — confirm it is set
                    # before this function runs.
                    if azw2zip_cfg.isOutputPdf():
                        pdf_fpath = os.path.join(files.outdir, '..', azw2zip_cfg.makeOutputFileName(metadata) + ('.%03d.pdf' % (i+1)))
                    entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i+1)))
                else:
                    # Remaining sections are written out as opaque .data blobs.
                    entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i+1),j)))
                with open(pathof(entryName), 'wb') as f:
                    f.write(rawML[sectionOffset:(sectionOffset+sectionLength)])
                # Optionally also write the PDF to the user-facing output name.
                if pdf_fpath:
                    with open(pathof(pdf_fpath), 'wb') as f:
                        f.write(rawML[sectionOffset:(sectionOffset+sectionLength)])
    except Exception as e:
        # Best-effort: report parse failures but still emit an OPF below.
        print('Error processing Print Replica: ' + str(e))
    # NOTE(review): only a single '.pdf' entry is recorded in fileinfo even
    # when several numbered PDFs were written above — confirm OPFProcessor
    # expects this placeholder form for Print Replica books.
    fileinfo.append([None,'', files.getInputFileBasename() + '.pdf'])
    usedmap = {}
    for name in rscnames:
        if name is not None:
            usedmap[name] = 'used'
    opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap)
    opf.writeOPF()
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    """Drive the unpacking of every mobi header found in the container
    (Python 2 dialect: print statements, file(), xrange, unicode).

    For each header: build metadata, optionally dump raw markup, extract the
    additional sections (images, fonts, SRCS archive, RESC, FLIS/FCIS/...),
    then branch on book type — Print Replica, K8/KF8 (epub-like output), or
    original Mobi (single HTML + OPF).

    files      -- project file-layout object
    sect       -- section reader (loadSection / num_sections)
    mhlst      -- list of mobi header objects to process
    K8Boundary -- section index separating mobi7 data from K8 data
    k8only     -- when True, skip the non-K8 output paths
    """
    imgnames = []
    for mh in mhlst:
        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."
        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, 'wb').write(mh.header)
        # process each mobi header
        if mh.isEncrypted():
            raise unpackException('file is encrypted')
        # build up the metadata
        metadata = mh.getMetaData()
        metadata['Language'] = mh.Language()
        metadata['Title'] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata['Codec'] = [mh.codec]
        metadata['UniqueID'] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata
        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = '.rawml'
            if mh.isK8():
                outraw = os.path.join(files.k8dir, files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = '.rawpr'
                    outraw = os.path.join(files.outdir, files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + ext)
            file(outraw, 'wb').write(rawML)
        # process additional sections that represent images, resources, fonts, and etc
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        # Only scan sections belonging to this header's half of the file.
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                # Internal index records: dumped only in DEBUG, never used.
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                    fname = "%05d" % (1 + i - beg)
                    fname = type + fname
                    if mh.isK8():
                        fname += "_K8"
                    fname += '.dat'
                    outname = os.path.join(files.outdir, fname)
                    file(outname, 'wb').write(data)
                    print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print " Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, 'wb').write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3: 'FONT'
                # bytes  4 -  7: uncompressed size
                # bytes  8 - 11: flags
                #     bit 0x0001 - zlib compression
                #     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15: offset to start of compressed font data
                # bytes 16 - 19: length of xor string stored before the start of the comnpress font data
                # bytes 19 - 23: start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from('>LLLLL', data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start:xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from('>I', font_data, len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
                        if len(font_data) != usize:
                            print 'Font Decompression Error: Uncompressed font size mismatch'
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = (zlib.adler32(font_data) & 0xffffffff)
                            if sig != adler32:
                                print 'Font Decompression Error'
                                print 'Adler checksum did not match. Stored: %d Calculated: %d' % (adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                # Sniff the real font type from its magic bytes.
                hdr = font_data[0:4]
                if hdr == '\0\1\0\0' or hdr == 'true' or hdr == 'ttcf':
                    ext = '.ttf'
                elif hdr == 'OTTO':
                    ext = '.otf'
                else:
                    print "Warning: unknown font header %s" % hdr.encode('hex')
                    ext = '.dat'
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                # Remember fonts that must be re-obfuscated when building the epub.
                if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print " extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, 'wb').write(font_data)
                imgnames.append(fontname)
                continue
            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print " extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, 'wb').write(data)
                imgnames.append(None)
                continue
            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue
            # if reach here should be an image but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, 'wb').write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print " extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, 'wb').write(data)
                imgnames.append(imgname)
        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print 'Error processing Print Replica: ' + str(e)
            filenames.append(['', files.getInputFileBasename() + '.pdf'])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = 'used'
            opf = OPFProcessor(files, metadata, filenames, imgnames, False, mh, usedmap)
            opf.writeOPF()
            continue
        if mh.isK8():
            # K8 mobi
            # require other indexes which contain parsing information and the FDST info
            # to process the rawml back into the xhtml files, css files, svg image files, etc
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)
            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if 'StartOffset' in metadata.keys():
                starts = metadata['StartOffset']
                last_start = starts.pop()
                if int(last_start) == 0xffffffff:
                    last_start = '0'
                filename, partnum, beg, end = k8proc.getFileInfo(int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != '':
                    linktgt += '#' + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap['filename'] = filename
                ncxmap['idtag'] = idtag
                ncx_data[i] = ncxmap
            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)
            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()
            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, 'wb').write(part)
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == 'file':
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, 'wb').write(flowpart)
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)
            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()
            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)
        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)
            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata['DictInLanguage'] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata['DictOutLanguage'] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()
            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []
            # write the proper mobi html
            fname = files.getInputFileBasename() + '.html'
            filenames.append(['', fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, 'wb').write(srctext)
            # create an OPF
            # extract guidetext from srctext
            guidetext = ''
            guidematch = re.search(r'''<guide>(.*)</guide>''', srctext, re.IGNORECASE + re.DOTALL)
            if guidematch:
                # Rewrite filepos attributes into real href anchors.
                replacetext = r'''href="''' + filenames[0][1] + r'''#filepos\1"'''
                guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidematch.group(1))
                guidetext += '\n'
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)
            opf.writeOPF()
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    """Duplicate of the earlier process_all_mobi_headers definition (same
    logic, double-quoted string style).

    NOTE(review): this redefinition shadows the previous one at import time —
    only this later definition is ever callable. Confirm whether the earlier
    copy should be removed; both cannot coexist meaningfully in one module.

    (Python 2 dialect: print statements, file(), xrange, unicode.)
    """
    imgnames = []
    for mh in mhlst:
        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."
        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, "wb").write(mh.header)
        # process each mobi header
        if mh.isEncrypted():
            raise unpackException("file is encrypted")
        # build up the metadata
        metadata = mh.getMetaData()
        metadata["Language"] = mh.Language()
        metadata["Title"] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata["Codec"] = [mh.codec]
        metadata["UniqueID"] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata
        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = ".rawml"
            if mh.isK8():
                outraw = os.path.join(files.k8dir, files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = ".rawpr"
                    outraw = os.path.join(files.outdir, files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + ext)
            file(outraw, "wb").write(rawML)
        # process additional sections that represent images, resources, fonts, and etc
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        # Only scan sections belonging to this header's half of the file.
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                # Internal index records: dumped only in DEBUG, never used.
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                    fname = "%05d" % (1 + i - beg)
                    fname = type + fname
                    if mh.isK8():
                        fname += "_K8"
                    fname += ".dat"
                    outname = os.path.join(files.outdir, fname)
                    file(outname, "wb").write(data)
                    print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print " Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, "wb").write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3: 'FONT'
                # bytes  4 -  7: uncompressed size
                # bytes  8 - 11: flags
                #     bit 0x0001 - zlib compression
                #     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15: offset to start of compressed font data
                # bytes 16 - 19: length of xor string stored before the start of the comnpress font data
                # bytes 19 - 23: start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(">LLLLL", data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start : xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from(">I", font_data, len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
                        if len(font_data) != usize:
                            print "Font Decompression Error: Uncompressed font size mismatch"
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = zlib.adler32(font_data) & 0xFFFFFFFF
                            if sig != adler32:
                                print "Font Decompression Error"
                                print "Adler checksum did not match. Stored: %d Calculated: %d" % (adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                # Sniff the real font type from its magic bytes.
                hdr = font_data[0:4]
                if hdr == "\0\1\0\0" or hdr == "true" or hdr == "ttcf":
                    ext = ".ttf"
                elif hdr == "OTTO":
                    ext = ".otf"
                else:
                    print "Warning: unknown font header %s" % hdr.encode("hex")
                    ext = ".dat"
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                # Remember fonts that must be re-obfuscated when building the epub.
                if (ext == ".ttf" or ext == ".otf") and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print " extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, "wb").write(font_data)
                imgnames.append(fontname)
                continue
            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print " extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, "wb").write(data)
                imgnames.append(None)
                continue
            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue
            # if reach here should be an image but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, "wb").write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print " extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, "wb").write(data)
                imgnames.append(imgname)
        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print "Error processing Print Replica: " + str(e)
            filenames.append(["", files.getInputFileBasename() + ".pdf"])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = "used"
            opf = OPFProcessor(files, metadata, filenames, imgnames, False, mh, usedmap)
            opf.writeOPF()
            continue
        if mh.isK8():
            # K8 mobi
            # require other indexes which contain parsing information and the FDST info
            # to process the rawml back into the xhtml files, css files, svg image files, etc
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)
            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if "StartOffset" in metadata.keys():
                starts = metadata["StartOffset"]
                last_start = starts.pop()
                if int(last_start) == 0xFFFFFFFF:
                    last_start = "0"
                filename, partnum, beg, end = k8proc.getFileInfo(int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != "":
                    linktgt += "#" + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4, off] = ncxmap["pos_fid"].split(":")
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap["filename"] = filename
                ncxmap["idtag"] = idtag
                ncx_data[i] = ncxmap
            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)
            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()
            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, "wb").write(part)
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == "file":
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, "wb").write(flowpart)
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)
            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()
            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)
        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)
            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata["DictInLanguage"] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata["DictOutLanguage"] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()
            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []
            # write the proper mobi html
            fname = files.getInputFileBasename() + ".html"
            filenames.append(["", fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, "wb").write(srctext)
            # create an OPF
            # extract guidetext from srctext
            guidetext = ""
            guidematch = re.search(r"""<guide>(.*)</guide>""", srctext, re.IGNORECASE + re.DOTALL)
            if guidematch:
                # Rewrite filepos attributes into real href anchors.
                replacetext = r'''href="''' + filenames[0][1] + r'''#filepos\1"'''
                guidetext = re.sub(r"""filepos=['"]{0,1}0*(\d+)['"]{0,1}""", replacetext, guidematch.group(1))
                guidetext += "\n"
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames, ncx.isNCX, mh, usedmap, guidetext)
            opf.writeOPF()
def processMobi7(mh, metadata, sect, files, rscnames):
    """Unpack an original (non-KF8) Mobi book into a single HTML file plus
    NCX and OPF, cleaning up the in-book <guide> section along the way.

    mh       -- mobi header object (rawml source, codec, dictionary flags)
    metadata -- EXTH metadata dict; dictionary-language keys are added here
    sect     -- section reader, used only for dictionary position maps
    files    -- project file-layout object (mobi7dir etc.)
    rscnames -- resource names passed through to HTMLProcessor/OPFProcessor
    """
    global DUMP
    global WRITE_RAW_DATA
    # An original Mobi
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + '.rawml')
        open(pathof(outraw), 'wb').write(rawML)
    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    ncx.writeNCX(metadata)
    positionMap = {}
    # if Dictionary build up the positionMap
    if mh.isDictionary():
        if mh.DictInLanguage():
            metadata['DictInLanguage'] = [mh.DictInLanguage()]
        if mh.DictOutLanguage():
            metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
        positionMap = dictSupport(mh, sect).getPositionMap()
    # convert the rawml back to Mobi ml
    proc = HTMLProcessor(files, metadata, rscnames)
    srctext = proc.findAnchors(rawML, ncx_data, positionMap)
    srctext, usedmap = proc.insertHREFS()
    # write the proper mobi html
    fileinfo = []
    # fname = files.getInputFileBasename() + '.html'
    fname = 'book.html'
    fileinfo.append([None, '', fname])
    outhtml = os.path.join(files.mobi7dir, fname)
    open(pathof(outhtml), 'wb').write(srctext)
    # extract guidetext from srctext
    guidetext = ''
    pagemapxml = ''
    guidematch = re.search(r'''<guide>(.*)</guide>''', srctext, re.IGNORECASE + re.DOTALL)
    if guidematch:
        guidetext = guidematch.group(1)
        # sometimes old mobi guide from srctext horribly written so need to clean up
        guidetext = guidetext.replace("\r", "")
        guidetext = guidetext.replace('<REFERENCE', '<reference')
        guidetext = guidetext.replace(' HREF=', ' href=')
        guidetext = guidetext.replace(' TITLE=', ' title=')
        guidetext = guidetext.replace(' TYPE=', ' type=')
        # reference must be a self-closing tag
        # and any href must be replaced with filepos information
        ref_tag_pattern = re.compile(r'''(<reference [^>]*>)''', re.IGNORECASE)
        guidepieces = ref_tag_pattern.split(guidetext)
        # Odd indices of a re.split with a capturing group are the tags.
        for i in range(1, len(guidepieces), 2):
            reftag = guidepieces[i]
            # remove any href there now to replace with filepos
            reftag = re.sub(r'''href\s*=[^'"]*['"][^'"]*['"]''', '', reftag)
            # make sure the reference tag ends properly
            if not reftag.endswith("/>"):
                reftag = reftag[0:-1] + "/>"
            guidepieces[i] = reftag
        guidetext = "".join(guidepieces)
        replacetext = r'''href="''' + fileinfo[0][2] + r'''#filepos\1"'''
        guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
        guidetext += '\n'
        # NOTE(review): calling .decode() on a Py2 unicode object implicitly
        # encodes to ASCII first and can raise UnicodeEncodeError on non-ASCII
        # guide text — confirm this branch is intended.
        if isinstance(guidetext, unicode):
            guidetext = guidetext.decode(mh.codec).encode("utf-8")
        else:
            guidetext = unicode(guidetext, mh.codec).encode("utf-8")
    if 'StartOffset' in metadata.keys():
        # NOTE(review): starting_offset is only bound inside this loop; if
        # metadata['StartOffset'] were ever empty the next line would raise
        # NameError — presumably the key is only present when non-empty.
        for value in metadata['StartOffset']:
            if int(value) == 0xffffffff:
                value = '0'
            starting_offset = value
        # get guide items from metadata
        metaguidetext = '<reference type="text" href="' + fileinfo[0][2] + '#filepos' + starting_offset + '" />\n'
        guidetext += metaguidetext
    # create an OPF
    opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, pagemapxml, guidetext)
    opf.writeOPF()
def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'):
    """Unpack a KF8 (Mobi 8) book into an epub-like directory structure:
    xhtml parts, flows, page-map, NCX/NAV, OPF, and finally the epub itself.
    (Python 2 dialect: print statements, file().)

    mh             -- mobi header object
    metadata       -- EXTH metadata dict (copied before OPF creation)
    sect           -- section reader
    files          -- project file-layout object (k8dir, k8oebps, ...)
    rscnames       -- resource names for the XHTML processor / OPF
    pagemapproc    -- existing PageMapProcessor or None (may be built from apnxfile)
    k8resc         -- RESC-derived spine info, or None
    obfuscate_data -- font names needing epub obfuscation (truthiness drives writeOPF)
    apnxfile       -- optional .apnx path used to synthesize a page map
    epubver        -- target epub version string passed to OPFProcessor
    """
    global DUMP
    global WRITE_RAW_DATA
    # extract raw markup langauge
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.k8dir, files.getInputFileBasename() + '.rawml')
        open(pathof(outraw), 'wb').write(rawML)
    # KF8 require other indexes which contain parsing information and the FDST info
    # to process the rawml back into the xhtml files, css files, svg image files, etc
    k8proc = K8Processor(mh, sect, files, DUMP)
    k8proc.buildParts(rawML)
    # collect information for the guide first
    guidetext = k8proc.getGuideText()
    # if the guide was empty, add in any guide info from metadata, such as StartOffset
    if not guidetext and 'StartOffset' in metadata.keys():
        # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
        # Taking that into account, we only care about the *last* StartOffset, which
        # should always be the correct one in these cases (the one actually pointing
        # to the right place in the mobi8 part).
        starts = metadata['StartOffset']
        last_start = starts[-1]
        last_start = int(last_start)
        if last_start == 0xffffffff:
            last_start = 0
        seq, idtext = k8proc.getFragTblInfo(last_start)
        filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), '0000000000')
        linktgt = filename
        if idtext != '':
            linktgt += '#' + idtext
        guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
    # if apnxfile is passed in use it for page map information
    if apnxfile is not None and pagemapproc is None:
        # 8-byte dummy header expected by PageMapProcessor before apnx data.
        apnxdata = "00000000" + file(apnxfile, 'rb').read()
        pagemapproc = PageMapProcessor(mh, apnxdata)
    # generate the page map
    pagemapxml = ''
    if pagemapproc is not None:
        pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
        outpm = os.path.join(files.k8oebps, 'page-map.xml')
        open(pathof(outpm), 'wb').write(pagemapxml)
        if DUMP:
            print pagemapproc.getNames()
            print pagemapproc.getOffsets()
            print "\n\nPage Map"
            print pagemapxml
    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    print "Processing ncx / toc"
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    # extend the ncx data with filenames and proper internal idtags
    for i in range(len(ncx_data)):
        ncxmap = ncx_data[i]
        [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
        filename, idtag = k8proc.getIDTagByPosFid(fid, off)
        ncxmap['filename'] = filename
        ncxmap['idtag'] = idtag
        ncx_data[i] = ncxmap
    # convert the rawML to a set of xhtml files
    print "Building an epub-like structure"
    htmlproc = XHTMLK8Processor(rscnames, k8proc)
    usedmap = htmlproc.buildXHTML()
    # write out the xhtml svg, and css files
    # fileinfo = [skelid|coverpage, dir, name]
    fileinfo = []
    # first create a cover page if none exists
    if CREATE_COVER_PAGE:
        cover = CoverProcessor(files, metadata, rscnames)
        cover_img = cover.getImageName()
        need_to_create_cover_page = False
        if cover_img is not None:
            if k8resc is None or not k8resc.hasSpine():
                # No RESC spine: synthesize a cover page only when the first
                # part does not already reference the cover image.
                part = k8proc.getPart(0)
                if part.find(cover_img) == -1:
                    need_to_create_cover_page = True
            else:
                # RESC spine present: prepend a coverpage entry when missing.
                if "coverpage" not in k8resc.spine_idrefs.keys():
                    part = k8proc.getPart(int(k8resc.spine_order[0]))
                    if part.find(cover_img) == -1:
                        k8resc.prepend_to_spine("coverpage", "inserted", "no", None)
                if k8resc.spine_order[0] == "coverpage":
                    need_to_create_cover_page = True
            if need_to_create_cover_page:
                filename = cover.getXHTMLName()
                fileinfo.append(["coverpage", 'Text', filename])
                guidetext += cover.guide_toxml()
                cover.writeXHTML()
    n = k8proc.getNumberOfParts()
    for i in range(n):
        part = k8proc.getPart(i)
        [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
        fileinfo.append([str(skelnum), dir, filename])
        fname = os.path.join(files.k8oebps, dir, filename)
        open(pathof(fname), 'wb').write(part)
    n = k8proc.getNumberOfFlows()
    # Flow 0 is the main text (already written above), so start at 1.
    for i in range(1, n):
        [type, format, dir, filename] = k8proc.getFlowInfo(i)
        flowpart = k8proc.getFlow(i)
        if format == 'file':
            fileinfo.append([None, dir, filename])
            fname = os.path.join(files.k8oebps, dir, filename)
            open(pathof(fname), 'wb').write(flowpart)
    # create the opf
    opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap, pagemapxml, guidetext, k8resc, epubver)
    uuid = opf.writeOPF(bool(obfuscate_data))
    if opf.hasNCX():
        # Create a toc.ncx.
        ncx.writeK8NCX(ncx_data, metadata)
    if opf.hasNAV():
        # Create a navigation document.
        nav = NAVProcessor(files)
        nav.writeNAV(ncx_data, guidetext, metadata)
    # make an epub-like structure of it all
    print "Creating an epub-like file"
    files.makeEPUB(usedmap, obfuscate_data, uuid)
else: entryName = os.path.join( files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i + 1), j))) open(pathof(entryName), 'wb').write( rawML[sectionOffset:(sectionOffset + sectionLength)]) except Exception, e: print 'Error processing Print Replica: ' + str(e) fileinfo.append([None, '', files.getInputFileBasename() + '.pdf']) usedmap = {} for name in rscnames: if name is not None: usedmap[name] = 'used' opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap) opf.writeOPF() def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'): global DUMP global WRITE_RAW_DATA
def processMobi7(mh, metadata, sect, files, imgnames):
    """Older variant of processMobi7 (image-name parameter instead of
    rscnames; output HTML named after the input file rather than book.html).

    NOTE(review): this redefines processMobi7 and shadows any earlier
    definition at import time — confirm which variant the module should keep.
    """
    global DUMP
    global WRITE_RAW_DATA
    # An original Mobi
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.mobi7dir,files.getInputFileBasename() + '.rawml')
        open(pathof(outraw),'wb').write(rawML)
    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    ncx.writeNCX(metadata)
    positionMap = {}
    # if Dictionary build up the positionMap
    if mh.isDictionary():
        if mh.DictInLanguage():
            metadata['DictInLanguage'] = [mh.DictInLanguage()]
        if mh.DictOutLanguage():
            metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
        positionMap = dictSupport(mh, sect).getPositionMap()
    # convert the rawml back to Mobi ml
    proc = HTMLProcessor(files, metadata, imgnames)
    srctext = proc.findAnchors(rawML, ncx_data, positionMap)
    srctext, usedmap = proc.insertHREFS()
    # write the proper mobi html
    fileinfo=[]
    fname = files.getInputFileBasename() + '.html'
    fileinfo.append([None,'', fname])
    outhtml = os.path.join(files.mobi7dir, fname)
    open(pathof(outhtml), 'wb').write(srctext)
    # extract guidetext from srctext
    guidetext =''
    pagemapxml = ''
    guidematch = re.search(r'''<guide>(.*)</guide>''',srctext,re.IGNORECASE+re.DOTALL)
    if guidematch:
        guidetext = guidematch.group(1)
        # sometimes old mobi guide from srctext horribly written so need to clean up
        guidetext = guidetext.replace("\r", "")
        guidetext = guidetext.replace('<REFERENCE', '<reference')
        guidetext = guidetext.replace(' HREF=', ' href=')
        guidetext = guidetext.replace(' TITLE=', ' title=')
        guidetext = guidetext.replace(' TYPE=', ' type=')
        # reference must be a self-closing tag
        # and any href must be replaced with filepos information
        ref_tag_pattern = re.compile(r'''(<reference [^>]*>)''', re.IGNORECASE)
        guidepieces = ref_tag_pattern.split(guidetext)
        # Odd indices of a re.split with a capturing group are the tags.
        for i in range(1,len(guidepieces), 2):
            reftag = guidepieces[i]
            # remove any href there now to replace with filepos
            reftag = re.sub(r'''href\s*=[^'"]*['"][^'"]*['"]''','', reftag)
            # make sure the reference tag ends properly
            if not reftag.endswith("/>"):
                reftag = reftag[0:-1] + "/>"
            guidepieces[i] = reftag
        guidetext = "".join(guidepieces)
        replacetext = r'''href="'''+fileinfo[0][2]+r'''#filepos\1"'''
        guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
        guidetext += '\n'
        # NOTE(review): .decode() on a Py2 unicode object re-encodes via
        # ASCII first and can raise on non-ASCII text — confirm intent.
        if isinstance(guidetext, unicode):
            guidetext = guidetext.decode(mh.codec).encode("utf-8")
        else:
            guidetext = unicode(guidetext, mh.codec).encode("utf-8")
    if 'StartOffset' in metadata.keys():
        # NOTE(review): starting_offset is bound only inside this loop; an
        # empty metadata['StartOffset'] list would raise NameError below.
        for value in metadata['StartOffset']:
            if int(value) == 0xffffffff:
                value = '0'
            starting_offset = value
        # get guide items from metadata
        metaguidetext = '<reference type="text" href="'+fileinfo[0][2]+'#filepos'+starting_offset+'" />\n'
        guidetext += metaguidetext
    # create an OPF
    opf = OPFProcessor(files, metadata, fileinfo, imgnames, ncx.isNCX, mh, usedmap, pagemapxml, guidetext)
    opf.writeOPF()
def processMobi8(mh, metadata, sect, files, imgnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'):
    """Unpack a KF8 (Mobi8) ebook into an epub-like structure on disk.

    mh             -- mobi header object for the KF8 part
    metadata       -- dict of EXTH metadata lists (a copy is handed to the OPF)
    sect           -- section reader used by K8Processor for the KF8 indexes
    files          -- file-layout helper (k8dir, k8oebps, makeEPUB, ...)
    imgnames       -- list of extracted resource names
    pagemapproc    -- optional PageMapProcessor; built from apnxfile if None
    k8resc         -- optional RESC (spine/metadata) helper, may be mutated
    obfuscate_data -- truthy when fonts are obfuscated; forwarded to writeOPF
                      and makeEPUB
    apnxfile       -- optional path to an .apnx page-map file
    epubver        -- epub version string passed through to OPFProcessor

    Side effects only: writes xhtml/css/svg parts, page-map.xml, toc.ncx,
    nav document and OPF under files.k8oebps, then builds the epub container.
    """
    global DUMP
    global WRITE_RAW_DATA

    # extract raw markup langauge
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml')
        open(pathof(outraw),'wb').write(rawML)

    # KF8 require other indexes which contain parsing information and the FDST info
    # to process the rawml back into the xhtml files, css files, svg image files, etc
    k8proc = K8Processor(mh, sect, files, DUMP)
    k8proc.buildParts(rawML)

    # collect information for the guide first
    guidetext = k8proc.getGuideText()

    # if the guide was empty, add in any guide info from metadata, such as StartOffset
    if not guidetext and 'StartOffset' in metadata.keys():
        # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
        # Taking that into account, we only care about the *last* StartOffset, which
        # should always be the correct one in these cases (the one actually pointing
        # to the right place in the mobi8 part).
        starts = metadata['StartOffset']
        last_start = starts[-1]
        last_start = int(last_start)
        if last_start == 0xffffffff:
            # sentinel meaning "no offset": fall back to the beginning
            last_start = 0
        seq, idtext = k8proc.getFragTblInfo(last_start)
        filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), '0000000000')
        linktgt = filename
        if idtext != '':
            linktgt += '#' + idtext
        guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt

    # if apnxfile is passed in use it for page map information
    if apnxfile is not None and pagemapproc is None:
        # 8-byte pad in front so offsets line up with what PageMapProcessor expects
        # NOTE(review): presumably mirrors an in-book header — confirm against PageMapProcessor
        apnxdata = "00000000" + file(apnxfile, 'rb').read()
        pagemapproc = PageMapProcessor(mh, apnxdata)

    # generate the page map
    pagemapxml = ''
    if pagemapproc is not None:
        pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
        outpm = os.path.join(files.k8oebps,'page-map.xml')
        open(pathof(outpm),'wb').write(pagemapxml)
        if DUMP:
            print pagemapproc.getNames()
            print pagemapproc.getOffsets()
            print "\n\nPage Map"
            print pagemapxml

    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    print "Processing ncx / toc"
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()

    # extend the ncx data with filenames and proper internal idtags
    for i in range(len(ncx_data)):
        ncxmap = ncx_data[i]
        # pos_fid is a colon-separated tuple; only the fid and offset fields matter here
        [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
        filename, idtag = k8proc.getIDTagByPosFid(fid, off)
        ncxmap['filename'] = filename
        ncxmap['idtag'] = idtag
        ncx_data[i] = ncxmap

    # convert the rawML to a set of xhtml files
    print "Building an epub-like structure"
    htmlproc = XHTMLK8Processor(imgnames, k8proc)
    usedmap = htmlproc.buildXHTML()

    # write out the xhtml svg, and css files
    # fileinfo = [skelid|coverpage, dir, name]
    fileinfo = []
    # first create a cover page if none exists
    if CREATE_COVER_PAGE:
        cover = CoverProcessor(files, metadata, imgnames)
        cover_img = cover.getImageName()
        need_to_create_cover_page = False
        if cover_img is not None:
            if k8resc is None or not k8resc.hasSpine():
                # no RESC spine: create a cover page only if the first part
                # does not already reference the cover image
                part = k8proc.getPart(0)
                if part.find(cover_img) == -1:
                    need_to_create_cover_page = True
            else:
                # RESC spine present: register a "coverpage" entry if the
                # first spine item doesn't already show the cover image
                if "coverpage" not in k8resc.spine_idrefs.keys():
                    part = k8proc.getPart(int(k8resc.spine_order[0]))
                    if part.find(cover_img) == -1:
                        k8resc.prepend_to_spine("coverpage", "inserted", "no", None)
                if k8resc.spine_order[0] == "coverpage":
                    need_to_create_cover_page = True
            if need_to_create_cover_page:
                filename = cover.getXHTMLName()
                fileinfo.append(["coverpage", 'Text', filename])
                guidetext += cover.guide_toxml()
                cover.writeXHTML()

    # write each skeleton part (xhtml) to its place under k8oebps
    n = k8proc.getNumberOfParts()
    for i in range(n):
        part = k8proc.getPart(i)
        [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
        fileinfo.append([str(skelnum), dir, filename])
        fname = os.path.join(files.k8oebps,dir,filename)
        open(pathof(fname),'wb').write(part)

    # write the non-primary flows (css, svg, ...); flow 0 is the main text
    n = k8proc.getNumberOfFlows()
    for i in range(1, n):
        [type, format, dir, filename] = k8proc.getFlowInfo(i)
        flowpart = k8proc.getFlow(i)
        if format == 'file':
            fileinfo.append([None, dir, filename])
            fname = os.path.join(files.k8oebps,dir,filename)
            open(pathof(fname),'wb').write(flowpart)

    # create the opf
    opf = OPFProcessor(files, metadata.copy(), fileinfo, imgnames, True, mh, usedmap, pagemapxml, guidetext, k8resc, epubver)

    uuid = opf.writeOPF(bool(obfuscate_data))

    if opf.hasNCX():
        # Create a toc.ncx.
        ncx.writeK8NCX(ncx_data, metadata)
    if opf.hasNAV():
        # Create a navigation document.
        nav = NAVProcessor(files)
        nav.writeNAV(ncx_data, guidetext, metadata)

    # make an epub-like structure of it all
    print "Creating an epub-like file"
    files.makeEPUB(usedmap, obfuscate_data, uuid)
sectionOffset, sectionLength, = struct.unpack_from('>LL', rawML, tableIndexOffset) tableIndexOffset += 8 if j == 0: entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i+1))) else: entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i+1),j))) open(pathof(entryName), 'wb').write(rawML[sectionOffset:(sectionOffset+sectionLength)]) except Exception, e: print 'Error processing Print Replica: ' + str(e) fileinfo.append([None,'', files.getInputFileBasename() + '.pdf']) usedmap = {} for name in imgnames: if name is not None: usedmap[name] = 'used' opf = OPFProcessor(files, metadata, fileinfo, imgnames, False, mh, usedmap) opf.writeOPF() def processMobi8(mh, metadata, sect, files, imgnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'): global DUMP global WRITE_RAW_DATA # extract raw markup langauge rawML = mh.getRawML() if DUMP or WRITE_RAW_DATA: outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml') open(pathof(outraw),'wb').write(rawML) # KF8 require other indexes which contain parsing information and the FDST info