def copy_book_contents_to(self, destdir):
    """Copy every tracked book file into destdir, preserving relative paths.

    Manifest entries (self.id_to_filepath) are read via readfile(); all other
    tracked files (self.book_href_to_filepath) via readotherfile().

    Raises:
        WrapperException: if destdir is missing or is not a directory.
    """
    destdir = unicode_str(destdir)
    if destdir is None or not unipath.isdir(destdir):
        raise WrapperException('destination directory does not exist')
    # manifest files: id_to_filepath holds only manifest entries here, so
    # readfile() is always the right reader (the old unused `in_manifest`
    # flag was dead code and has been removed)
    for id in self.id_to_filepath:
        rpath = self.id_to_filepath[id]
        data = self.readfile(id)
        filepath = os.path.join(destdir, rpath)
        base = os.path.dirname(filepath)
        if not unipath.exists(base):
            os.makedirs(base)
        if isinstance(data, text_type):
            data = utf8_str(data)
        with open(pathof(filepath), 'wb') as fp:
            fp.write(data)
    # non-manifest ("other") files are handled by their own map and reader
    for id in self.book_href_to_filepath:
        rpath = self.book_href_to_filepath[id]
        data = self.readotherfile(id)
        filepath = os.path.join(destdir, rpath)
        base = os.path.dirname(filepath)
        if not unipath.exists(base):
            os.makedirs(base)
        if isinstance(data, text_type):
            data = utf8_str(data)
        with open(pathof(filepath), 'wb') as fp:
            fp.write(data)
def failed(script_type, msg):
    """Write a 'failed' result wrapper XML document to stdout."""
    if script_type is None:
        opening = "<wrapper>\n<result>failed</result>\n<changes/>\n"
    else:
        opening = '<wrapper type="%s">\n<result>failed</result>\n<changes/>\n' % script_type
    wrapper = _XML_HEADER + opening + "<msg>%s</msg>\n</wrapper>\n" % msg
    # write it to stdout and exit; on Python 3 go through the binary buffer
    # so the utf-8 encoded bytes are emitted unmangled
    stream = sys.stdout.buffer if PY3 else sys.stdout
    stream.write(utf8_str(wrapper))
def parseExceptionsFile(filename):
    """Read an apostrophe-exception word list, auto-detecting its encoding.

    A utf-8 BOM forces 'utf-8-sig'; otherwise each candidate encoding is
    tried until the whole file decodes cleanly.

    Returns:
        list of non-empty, right-stripped lines; [] on any parse error.
    """
    safename = utf8_str(filename)
    words_list = []
    # sniff only the first few bytes for a BOM
    snippet = min(32, os.path.getsize(pathof(safename)))
    with open(pathof(safename), 'rb') as probe:
        raw = probe.read(snippet)
    if raw.startswith(codecs.BOM_UTF8):
        enc = 'utf-8-sig'
    else:
        # BUGFIX: the original list was missing a comma, silently fusing
        # 'utf-16' and 'windows-1252' into one bogus encoding name
        encodings = ['utf-8', 'utf-16', 'windows-1252', 'windows-1250']
        enc = encodings[-1]  # same fallback as before: last candidate wins
        for e in encodings:
            try:
                # close the probe handle deterministically via `with`
                with file_open(pathof(safename), 'r', encoding=e) as fh:
                    fh.readlines()
            except UnicodeDecodeError:
                print('Got unicode error with %s , trying different encoding' % e)
            else:
                enc = e
                break
    try:
        with file_open(pathof(safename), 'r', encoding=enc) as fd:
            words_list = [line.rstrip() for line in fd]
        # drop empty lines
        words_list = [_f for _f in words_list if _f]
        print('Parsing apostrophe exception file %s' % filename)
    except Exception:
        print('Error parsing apostrophe exception file %s: ignoring' % filename)
        words_list = []
    return words_list
def Idpf_encryption_key(uid):
    """Return the IDPF font-obfuscation key: SHA-1 of uid with whitespace removed."""
    key = utf8_str(uid)
    # remove whitespace (space, tab, CR, LF) changing nothing else
    for ws in (bchr(0x20), bchr(0x09), bchr(0x0d), bchr(0x0a)):
        key = key.replace(ws, b'')
    return SHA1(key)
def write_opf(self):
    """Serialize the parsed opf (if any) to outdir/OEBPS/<opfname> as utf-8."""
    if self.op is None:
        return
    filepath = pathof(os.path.join(self.outdir, 'OEBPS', self.opfname))
    base = os.path.dirname(filepath)
    if not unipath.exists(base):
        os.makedirs(base)
    payload = utf8_str(self.build_opf())
    with open(filepath, 'wb') as fp:
        fp.write(payload)
def Adobe_encryption_key(uid):
    """Derive the 16-byte Adobe font-obfuscation key from a unique identifier.

    Strips an optional urn:uuid: prefix, removes dashes and every non-hex
    character, then unhexlifies the first 32 hex digits (the digits are
    doubled first so short uids still yield 16 bytes).
    """
    # strip it down to simple valid hex characters
    # being careful to generate a string of 16 bytes in length
    key = utf8_str(uid)
    if key.startswith(b"urn:uuid:"):
        key = key[9:]
    key = key.replace(b'-', b'')
    # BUGFIX: the pattern must be bytes (br'...') to match the bytes key
    # under Python 3; a str pattern raises TypeError there
    key = re.sub(br'[^a-fA-F0-9]', b'', key)
    key = binascii.unhexlify((key + key)[:32])
    return key
def Idpf_encryption_key(uid):
    """Return the IDPF obfuscation key: SHA-1 of uid (sans urn:uuid: prefix) with whitespace removed."""
    key = utf8_str(uid)
    if key.startswith(b"urn:uuid:"):
        key = key[9:]
    # remove whitespace (space, tab, CR, LF) changing nothing else
    for ws in (bchr(0x20), bchr(0x09), bchr(0x0d), bchr(0x0a)):
        key = key.replace(ws, b'')
    return SHA1(key)
def writefile(self, id, data):
    """Write data to the manifest entry `id`, creating parent dirs and marking it modified."""
    id = unicode_str(id)
    relpath = self.id_to_filepath.get(id, None)
    if relpath is None:
        raise WrapperException('Id does not exist in manifest')
    mime = self.id_to_mime.get(id, '')
    target = os.path.join(self.outdir, relpath)
    parent = os.path.dirname(target)
    if not unipath.exists(parent):
        os.makedirs(pathof(parent))
    # xml-ish mimetypes and text data are always stored as utf-8 bytes
    if mime.endswith('+xml') or isinstance(data, text_type):
        data = utf8_str(data)
    with open(target, 'wb') as fp:
        fp.write(data)
    self.modified[id] = 'file'
def writeotherfile(self, book_href, data):
    """Write data to the non-manifest book file identified by book_href."""
    id = unicode_str(book_href)
    relpath = self.id_to_filepath.get(id, None)
    if relpath is None:
        raise WrapperException('book href does not exist')
    if id in PROTECTED_FILES:
        raise WrapperException('Attempt to modify protected file')
    target = os.path.join(self.outdir, relpath)
    parent = os.path.dirname(target)
    if not unipath.exists(parent):
        os.makedirs(parent)
    if isinstance(data, text_type):
        data = utf8_str(data)
    with open(target, 'wb') as fp:
        fp.write(data)
    self.modified[id] = 'file'
def writeotherfile(self, book_href, data):
    """Write data to a non-manifest file; manifest ids must use writefile instead."""
    id = unicode_str(book_href)
    if id in self.id_to_href:
        raise WrapperException("Incorrect interface routine - use writefile")
    relpath = self.id_to_filepath.get(id, None)
    if relpath is None:
        raise WrapperException("book href does not exist")
    if id in PROTECTED_FILES:
        raise WrapperException("Attempt to modify protected file")
    target = os.path.join(self.outdir, relpath)
    parent = os.path.dirname(target)
    if not unipath.exists(parent):
        os.makedirs(parent)
    if isinstance(data, text_type):
        data = utf8_str(data)
    with open(target, "wb") as fp:
        fp.write(data)
    self.modified[id] = "file"
def writeotherfile(self, book_href, data):
    """Write data for a non-manifest book href; refuses manifest ids and protected files."""
    id = unicode_str(book_href)
    if id in self.id_to_href:
        raise WrapperException('Incorrect interface routine - use writefile')
    dest = self.id_to_filepath.get(id, None)
    if dest is None:
        raise WrapperException('book href does not exist')
    if id in PROTECTED_FILES:
        raise WrapperException('Attempt to modify protected file')
    dest = os.path.join(self.outdir, dest)
    folder = os.path.dirname(dest)
    if not unipath.exists(folder):
        os.makedirs(folder)
    payload = utf8_str(data) if isinstance(data, text_type) else data
    with open(dest, 'wb') as fp:
        fp.write(payload)
    self.modified[id] = 'file'
def addotherfile(self, book_href, data):
    """Create a brand-new non-manifest file at the path implied by book_href.

    Raises:
        WrapperException: if the href is already tracked or the target path exists.
    """
    id = unicode_str(book_href)
    if id in self.other:
        # BUGFIX: error message typo 'unquie' corrected
        raise WrapperException('book href must be unique')
    desired_path = id.replace("/", os.sep)
    filepath = os.path.join(self.outdir, desired_path)
    if unipath.isfile(filepath):
        raise WrapperException('desired path already exists')
    base = os.path.dirname(filepath)
    if not unipath.exists(base):
        os.makedirs(pathof(base))
    if isinstance(data, text_type):
        data = utf8_str(data)
    with open(pathof(filepath), 'wb') as fp:
        fp.write(data)
    # register the new file in the bookkeeping structures
    self.other.append(id)
    self.added.append(id)
    self.id_to_filepath[id] = desired_path
def addfile(self, uniqueid, basename, data, mime=None, properties=None, fallback=None, overlay=None):
    """Add a new manifest entry and write its file under the group's default path.

    The mime type is guessed from the extension when not given; the bookpath
    is derived from the mime group's configured default folder. Returns the
    uniqueid. Raises WrapperException on duplicate id/href, missing mime
    type, or an ncx addition under epub2.
    """
    uniqueid = unicode_str(uniqueid)
    if uniqueid in self.id_to_href:
        raise WrapperException('Manifest Id is not unique')
    basename = unicode_str(basename)
    mime = unicode_str(mime)
    if mime is None:
        # fall back to guessing the media type from the file extension
        ext = os.path.splitext(basename)[1]
        ext = ext.lower()
        mime = ext_mime_map.get(ext, None)
    if mime is None:
        raise WrapperException("Mime Type Missing")
    if mime == "application/x-dtbncx+xml" and self.epub_version.startswith("2"):
        raise WrapperException('Can not add or remove an ncx under epub2')
    # place the file in the default folder configured for its mime group
    group = mime_group_map.get(mime,"Misc")
    default_path = self.group_paths[group][0]
    bookpath = basename
    if default_path != "":
        bookpath = default_path + "/" + basename
    href = buildRelativePath(self.opfbookpath, bookpath)
    if href in self.href_to_id:
        raise WrapperException('Basename already exists')
    # now actually write out the new file
    filepath = bookpath.replace("/",os.sep)
    self.id_to_filepath[uniqueid] = filepath
    filepath = os.path.join(self.outdir,filepath)
    base = os.path.dirname(filepath)
    if not unipath.exists(base):
        os.makedirs(base)
    # textual mimetypes and str data are always stored as utf-8 bytes
    if mime in TEXT_MIMETYPES or isinstance(data, text_type):
        data = utf8_str(data)
    with open(filepath,'wb') as fp:
        fp.write(data)
    # register the entry in all of the manifest bookkeeping maps
    self.id_to_href[uniqueid] = href
    self.id_to_mime[uniqueid] = mime
    self.id_to_props[uniqueid] = properties
    self.id_to_fall[uniqueid] = fallback
    self.id_to_over[uniqueid] = overlay
    self.id_to_bookpath[uniqueid] = bookpath
    self.href_to_id[href] = uniqueid
    self.bookpath_to_id[bookpath] = uniqueid
    self.added.append(uniqueid)
    # adding a manifest entry dirties the opf itself
    self.modified[self.opfbookpath] = 'file'
    return uniqueid
def copy_book_contents_to(self, destdir):
    """Copy every tracked book file into destdir, preserving relative paths."""
    destdir = unicode_str(destdir)
    if destdir is None or not unipath.isdir(destdir):
        raise WrapperException('destination directory does not exist')
    for id, rpath in self.id_to_filepath.items():
        # manifest entries go through readfile, everything else through readotherfile
        reader = self.readfile if id in self.id_to_mime else self.readotherfile
        data = reader(id)
        target = os.path.join(destdir, rpath)
        folder = os.path.dirname(target)
        if not unipath.exists(folder):
            os.makedirs(folder)
        if isinstance(data, text_type):
            data = utf8_str(data)
        with open(pathof(target), 'wb') as fp:
            fp.write(data)
def addfile(self, uniqueid, basename, data, mime=None, properties=None, fallback=None, overlay=None):
    """Add a new manifest entry and write its file under OEBPS/<group>/<basename>.

    The mime type is guessed from the extension when not given; the folder is
    Audio/Video for those media types, otherwise taken from mime_base_map.
    Returns the uniqueid. Raises WrapperException on duplicate id/href or a
    missing mime type.
    """
    uniqueid = unicode_str(uniqueid)
    basename = unicode_str(basename)
    mime = unicode_str(mime)
    if mime is None:
        # fall back to guessing the media type from the file extension
        ext = os.path.splitext(basename)[1]
        ext = ext.lower()
        mime = ext_mime_map.get(ext, None)
    if mime is None:
        raise WrapperException("Mime Type Missing")
    # choose the destination folder from the media type
    if mime.startswith("audio"):
        base = 'Audio'
    elif mime.startswith("video"):
        base = "Video"
    else:
        base = mime_base_map.get(mime,'Misc')
    href = base + "/" + basename
    if uniqueid in self.id_to_href:
        raise WrapperException('Manifest Id is not unique')
    if href in self.href_to_id:
        raise WrapperException('Basename is not unique')
    # now actually write out the new file
    filepath = href.replace("/",os.sep)
    filepath = os.path.join('OEBPS', filepath)
    self.id_to_filepath[uniqueid] = filepath
    filepath = os.path.join(self.outdir,filepath)
    base = os.path.dirname(filepath)
    if not unipath.exists(base):
        os.makedirs(base)
    # xml-ish mimetypes and str data are always stored as utf-8 bytes
    if mime.endswith('+xml') or isinstance(data, text_type):
        data = utf8_str(data)
    with open(filepath,'wb') as fp:
        fp.write(data)
    # register the entry in all of the manifest bookkeeping maps
    self.id_to_href[uniqueid] = href
    self.id_to_mime[uniqueid] = mime
    self.id_to_props[uniqueid] = properties
    self.id_to_fall[uniqueid] = fallback
    self.id_to_over[uniqueid] = overlay
    self.href_to_id[href] = uniqueid
    self.added.append(uniqueid)
    # adding a manifest entry dirties the opf itself
    self.modified['OEBPS/content.opf'] = 'file'
    return uniqueid
def writeotherfile(self, book_href, data):
    """Write data to a non-manifest book href (url-unquoted first); refuses manifest ids and protected files."""
    id = unicode_str(book_href)
    id = unquoteurl(id)
    if id is None:
        raise WrapperException('None is not a valid book href')
    if id not in self.other and id in self.id_to_href:
        raise WrapperException('Incorrect interface routine - use writefile')
    dest = self.book_href_to_filepath.get(id, None)
    if dest is None:
        raise WrapperException('Book href does not exist')
    if id in PROTECTED_FILES:
        raise WrapperException('Attempt to modify protected file')
    dest = os.path.join(self.outdir, dest)
    folder = os.path.dirname(dest)
    if not unipath.exists(folder):
        os.makedirs(folder)
    payload = utf8_str(data) if isinstance(data, text_type) else data
    with open(dest, 'wb') as fp:
        fp.write(payload)
    self.modified[id] = 'file'
def addfile(self, uniqueid, basename, data, mime=None):
    """Add a new file under OEBPS and register it in the manifest maps; returns uniqueid."""
    uniqueid = unicode_str(uniqueid)
    basename = unicode_str(basename)
    mime = unicode_str(mime)
    if mime is None:
        # guess the media type from the file extension
        ext = os.path.splitext(basename)[1].lower()
        mime = ext_mime_map.get(ext, None)
    if mime is None:
        raise WrapperException("Mime Type Missing")
    # pick the destination folder from the media type
    if mime.startswith("audio"):
        base = "Audio"
    elif mime.startswith("video"):
        base = "Video"
    else:
        base = mime_base_map.get(mime, "Misc")
    href = base + "/" + basename
    if uniqueid in self.id_to_href:
        raise WrapperException("Manifest Id is not unique")
    if href in self.href_to_id:
        raise WrapperException("Basename is not unique")
    # now actually write out the new file
    relpath = os.path.join("OEBPS", href.replace("/", os.sep))
    self.id_to_filepath[uniqueid] = relpath
    target = os.path.join(self.outdir, relpath)
    folder = os.path.dirname(target)
    if not unipath.exists(folder):
        os.makedirs(folder)
    if mime.endswith("+xml") or isinstance(data, text_type):
        data = utf8_str(data)
    with open(target, "wb") as fp:
        fp.write(data)
    # register the entry in the manifest bookkeeping maps
    self.id_to_href[uniqueid] = href
    self.id_to_mime[uniqueid] = mime
    self.href_to_id[href] = uniqueid
    self.added.append(uniqueid)
    self.modified["OEBPS/content.opf"] = "file"
    return uniqueid
def cmdDo(self):
    """Collect the dialog's settings into the global CRITERIA dict and close the app."""
    global CRITERIA
    # map the dash combobox index to the smartypants dash flag
    dash_settings = {0: '', 1: 'd', 2: 'i'}.get(self.dashBox.current(), 'D')
    apos_exception_file = None
    if self.use_file.get():
        self.cust_file_path.config(state="normal")
        path_value = self.cust_file_path.get()
        if len(path_value):
            apos_exception_file = path_value
            if not unipath.exists(utf8_str(apos_exception_file)):
                print('Apostrophe exception file %s does not exist!' % apos_exception_file)
                apos_exception_file = None
        # restore the entry widget to its usual read-only state
        self.cust_file_path.config(state="readonly")
    CRITERIA['apos_exception_file'] = apos_exception_file
    smarty_attr = self.edu_quotes.get() + dash_settings + self.edu_ellipses.get()
    if smarty_attr == '':
        smarty_attr = '0'
    CRITERIA['smarty_attr'] = smarty_attr
    CRITERIA['use_unicode'] = self.unicodevar.get()
    indices = self.filelist.curselection()
    CRITERIA['files'] = [self.filelist.get(index) for index in indices]
    self.quitApp()
def cmdDo(self):
    """Gather the chosen smartypants options into CRITERIA and quit the dialog."""
    global CRITERIA
    # translate the dash combobox selection into its smartypants flag
    selection = self.dashBox.current()
    if selection == 0:
        dash_settings = ''
    elif selection == 1:
        dash_settings = 'd'
    elif selection == 2:
        dash_settings = 'i'
    else:
        dash_settings = 'D'
    apos_exception_file = None
    if self.use_file.get():
        self.cust_file_path.config(state="normal")
        if len(self.cust_file_path.get()):
            apos_exception_file = self.cust_file_path.get()
            if not unipath.exists(utf8_str(apos_exception_file)):
                print ('Apostrophe exception file %s does not exist!' % apos_exception_file)
                apos_exception_file = None
        # put the entry widget back to read-only
        self.cust_file_path.config(state="readonly")
    CRITERIA['apos_exception_file'] = apos_exception_file
    smarty_attr = self.edu_quotes.get() + dash_settings + self.edu_ellipses.get()
    CRITERIA['smarty_attr'] = smarty_attr if smarty_attr != '' else '0'
    CRITERIA['use_unicode'] = self.unicodevar.get()
    selected = self.filelist.curselection()
    CRITERIA['files'] = [self.filelist.get(index) for index in selected]
    self.quitApp()
def main(argv=unicode_argv()):
    """Launch a plugin script against an ebook tree and report its result as XML on stdout.

    argv layout: [launcher, ebook_root, outdir, script_type, target_file].
    Returns 0 on success, -1 on argument/path validation failure.

    NOTE(review): the default `argv=unicode_argv()` is evaluated once at
    import time, not per call — presumably intentional here; confirm.
    """
    if len(argv) != 5:
        failed( None, msg="Launcher: improper number of arguments passed to launcher.py")
        return -1
    ebook_root = argv[1]
    outdir = argv[2]
    script_type = argv[3]
    target_file = argv[4]
    # derive the plugin's name and directories from the target script path
    script_home = os.path.dirname(target_file)
    plugin_name = os.path.split(script_home)[-1]
    plugin_dir = os.path.dirname(script_home)
    script_module = os.path.splitext(os.path.basename(target_file))[0]
    # do basic sanity checking anyway
    if script_type not in SUPPORTED_SCRIPT_TYPES:
        failed(None, msg="Launcher: script type %s is not supported" % script_type)
        return -1
    ok = unipath.exists(ebook_root) and unipath.isdir(ebook_root)
    ok = ok and unipath.exists(outdir) and unipath.isdir(outdir)
    ok = ok and unipath.exists(script_home) and unipath.isdir(script_home)
    ok = ok and unipath.exists(target_file) and unipath.isfile(target_file)
    if not ok:
        failed(None, msg="Launcher: missing or incorrect paths passed in")
        return -1
    # update sys with path to target module home directory
    sys.path.append(script_home)
    # load and parse opf if present
    op = None
    opf_path = os.path.join(ebook_root, 'OEBPS', 'content.opf')
    if unipath.exists(opf_path) and unipath.isfile(opf_path):
        op = Opf_Parser(opf_path)
    # create a wrapper for record keeping and safety
    rk = Wrapper(ebook_root, outdir, op, plugin_dir, plugin_name)
    # get the correct container for the requested script type
    if script_type == 'edit':
        bc = BookContainer(rk)
    elif script_type == 'input':
        bc = InputContainer(rk)
    elif script_type == 'validation':
        bc = ValidationContainer(rk)
    else:
        bc = OutputContainer(rk)
    # start the target script
    ps = ProcessScript(script_type, script_module, bc)
    ps.launch()
    # get standard error and standard out from the target script
    successmsg = ''
    for data in ps.stdouttext:
        successmsg += unicode_str(data)
    successmsg = escapeit(successmsg)
    errorlog = ''
    for data in ps.stderrtext:
        errorlog += unicode_str(data)
    errorlog = escapeit(errorlog)
    # get the target's script wrapper xml
    resultxml = "".join(ps.wrapout)
    resultxml += "<msg>\n"
    # on success report stdout (and stderr only when debugging);
    # on failure always report stderr
    if ps.exitcode == 0:
        resultxml += successmsg
        if _DEBUG:
            resultxml += errorlog
    else:
        if _DEBUG:
            resultxml += successmsg
        resultxml += errorlog
    resultxml += '</msg>\n</wrapper>\n'
    # write it to stdout and exit
    if PY3:
        sys.stdout.buffer.write(utf8_str(resultxml))
    else:
        sys.stdout.write(utf8_str(resultxml))
    return 0
def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
    '''
    Apply inflection rule.

    Interprets the rule bytes as a tiny edit machine over the word:
    bytes 0x01-0x04 select an insert/delete mode at the word start/end,
    bytes 0x0a-0x13 move the cursor backwards, and any byte above 0x13
    is a literal character operand for the current mode.

    @param mainEntry: The word to inflect.
    @param inflectionRuleData: The inflection rules.
    @param start: The start position of the inflection rule to use.
    @param end: The end position of the inflection rule to use.
    @return: The string with the inflected word or None if an error occurs.
    '''
    mode = -1
    byteArray = array.array(array_format, mainEntry)
    position = len(byteArray)
    for charOffset in range(start, end):
        char = inflectionRuleData[charOffset:charOffset + 1]
        abyte = ord(char)
        if abyte >= 0x0a and abyte <= 0x13:
            # Move cursor backwards
            offset = abyte - 0x0a
            if mode not in [0x02, 0x03]:
                mode = 0x02
                position = len(byteArray)
            position -= offset
        elif abyte > 0x13:
            # literal operand byte: apply it under the current mode
            if mode == -1:
                print("Error: Unexpected first byte %i of inflection rule" % abyte)
                return None
            elif position == -1:
                print("Error: Unexpected first byte %i of inflection rule" % abyte)
                return None
            else:
                if mode == 0x01:
                    # Insert at word start
                    byteArray.insert(position, abyte)
                    position += 1
                elif mode == 0x02:
                    # Insert at word end
                    byteArray.insert(position, abyte)
                elif mode == 0x03:
                    # Delete at word end; the deleted byte must match the operand
                    position -= 1
                    deleted = byteArray.pop(position)
                    if bchr(deleted) != char:
                        if DEBUG_DICT:
                            print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                        print("Error: Delete operation of inflection rule failed")
                        return None
                elif mode == 0x04:
                    # Delete at word start; the deleted byte must match the operand
                    deleted = byteArray.pop(position)
                    if bchr(deleted) != char:
                        if DEBUG_DICT:
                            print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                        print("Error: Delete operation of inflection rule failed")
                        return None
                else:
                    print("Error: Inflection rule mode %x is not implemented" % mode)
                    return None
        elif abyte == 0x01:
            # Insert at word start
            if mode not in [0x01, 0x04]:
                position = 0
            mode = abyte
        elif abyte == 0x02:
            # Insert at word end
            if mode not in [0x02, 0x03]:
                position = len(byteArray)
            mode = abyte
        elif abyte == 0x03:
            # Delete at word end
            if mode not in [0x02, 0x03]:
                position = len(byteArray)
            mode = abyte
        elif abyte == 0x04:
            # Delete at word start
            if mode not in [0x01, 0x04]:
                position = 0
            # Delete at word start
            mode = abyte
        else:
            print("Error: Inflection rule mode %x is not implemented" % abyte)
            return None
    # NOTE(review): array.tostring() was removed in Python 3.9 — this would
    # need tobytes() on modern interpreters; confirm target Python versions.
    return utf8_str(byteArray.tostring())
def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'):
    """Unpack the KF8 (mobi8) half of a book into an epub-like directory tree.

    Builds the skeleton parts and flows from the raw markup, reconstructs the
    guide, page map, ncx/nav and opf, writes all pieces under files.k8oebps,
    and finally asks `files` to assemble the epub.

    NOTE(review): relies on the module-global azw2zip_cfg for output options.
    """
    global DUMP
    global WRITE_RAW_DATA
    # extract raw markup langauge
    rawML = mh.getRawML()
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml')
        with open(pathof(outraw),'wb') as f:
            f.write(rawML)
    # KF8 require other indexes which contain parsing information and the FDST info
    # to process the rawml back into the xhtml files, css files, svg image files, etc
    k8proc = K8Processor(mh, sect, files, DUMP)
    k8proc.buildParts(rawML)
    # collect information for the guide first
    guidetext = unicode_str(k8proc.getGuideText())
    # if the guide was empty, add in any guide info from metadata, such as StartOffset
    if not guidetext and 'StartOffset' in metadata:
        # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
        # Taking that into account, we only care about the *last* StartOffset, which
        # should always be the correct one in these cases (the one actually pointing
        # to the right place in the mobi8 part).
        starts = metadata['StartOffset']
        last_start = starts[-1]
        last_start = int(last_start)
        if last_start == 0xffffffff:
            last_start = 0
        seq, idtext = k8proc.getFragTblInfo(last_start)
        filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b'0000000000')
        linktgt = filename
        idtext = unicode_str(idtext, mh.codec)
        if idtext != '':
            linktgt += '#' + idtext
        guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
    # if apnxfile is passed in use it for page map information
    if apnxfile is not None and pagemapproc is None:
        with open(apnxfile, 'rb') as f:
            apnxdata = b"00000000" + f.read()
        pagemapproc = PageMapProcessor(mh, apnxdata)
    # generate the page map
    pagemapxml = ''
    if pagemapproc is not None:
        pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
        outpm = os.path.join(files.k8oebps,'page-map.xml')
        with open(pathof(outpm),'wb') as f:
            f.write(pagemapxml.encode('utf-8'))
        if DUMP:
            print(pagemapproc.getNames())
            print(pagemapproc.getOffsets())
            print("\n\nPage Map")
            print(pagemapxml)
    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    print("Processing ncx / toc")
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    # extend the ncx data with filenames and proper internal idtags
    for i in range(len(ncx_data)):
        ncxmap = ncx_data[i]
        [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
        filename, idtag = k8proc.getIDTagByPosFid(fid, off)
        ncxmap['filename'] = filename
        ncxmap['idtag'] = unicode_str(idtag)
        ncx_data[i] = ncxmap
    # convert the rawML to a set of xhtml files
    print("Building an epub-like structure")
    htmlproc = XHTMLK8Processor(rscnames, k8proc)
    usedmap = htmlproc.buildXHTML()
    # write out the xhtml svg, and css files
    # fileinfo = [skelid|coverpage, dir, name]
    fileinfo = []
    # first create a cover page if none exists
    if CREATE_COVER_PAGE:
        cover = CoverProcessor(files, metadata, rscnames)
        cover_img = utf8_str(cover.getImageName())
        need_to_create_cover_page = False
        if cover_img is not None:
            if k8resc is None or not k8resc.hasSpine():
                # no RESC spine: only add a cover page if the first part
                # does not already reference the cover image
                part = k8proc.getPart(0)
                if part.find(cover_img) == -1:
                    need_to_create_cover_page = True
            else:
                if "coverpage" not in k8resc.spine_idrefs:
                    part = k8proc.getPart(int(k8resc.spine_order[0]))
                    if part.find(cover_img) == -1:
                        k8resc.prepend_to_spine("coverpage", "inserted", "no", None)
                if k8resc.spine_order[0] == "coverpage":
                    need_to_create_cover_page = True
            if need_to_create_cover_page:
                filename = cover.getXHTMLName()
                fileinfo.append(["coverpage", 'Text', filename])
                guidetext += cover.guide_toxml()
                cover.writeXHTML()
    # write out each skeleton part as its own xhtml file
    n = k8proc.getNumberOfParts()
    for i in range(n):
        part = k8proc.getPart(i)
        [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
        fileinfo.append([str(skelnum), dir, filename])
        fname = os.path.join(files.k8oebps,dir,filename)
        with open(pathof(fname),'wb') as f:
            f.write(part)
    # write out the file-type flows (css, svg, ...); flow 0 is skipped by design
    n = k8proc.getNumberOfFlows()
    for i in range(1, n):
        [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i)
        flowpart = k8proc.getFlow(i)
        if pformat == b'file':
            fileinfo.append([None, pdir, filename])
            fname = os.path.join(files.k8oebps,pdir,filename)
            with open(pathof(fname),'wb') as f:
                f.write(flowpart)
    # create the opf
    opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap, pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver)
    uuid = opf.writeOPF(bool(obfuscate_data))
    if opf.hasNCX():
        # Create a toc.ncx.
        ncx.writeK8NCX(ncx_data, metadata)
    if opf.hasNAV():
        # Create a navigation document.
        nav = NAVProcessor(files)
        nav.writeNAV(ncx_data, guidetext, metadata)
    # get the cover page offset number
    cover_offset = int(mh.metadata.get('CoverOffset', ['-1'])[0])
    if not CREATE_COVER_PAGE:
        cover_offset = None
    # make an epub-like structure of it all
    print("Creating an epub-like file")
    files.makeEPUB(usedmap, obfuscate_data, uuid, azw2zip_cfg.isOutputEpub(), azw2zip_cfg.makeOutputFileName(mh.getMetaData()), cover_offset)
def buildXHTML(self):
    """Rewrite the KF8 skeleton parts and flows into valid epub xhtml/css.

    Runs a fixed sequence of regex passes over the byte-string parts:
    resolve kindle:pos internal links, convert aid/data-AmznPageBreak
    attributes, rewrite kindle:embed / kindle:flow references in flows and
    text, then apply final svg/li cleanups. Returns self.used, the map of
    resource names actually referenced.
    """
    # first need to update all links that are internal which
    # are based on positions within the xhtml files **BEFORE**
    # cutting and pasting any pieces into the xhtml text files
    # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
    # XXXX is the offset in records into divtbl
    # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
    # pos:fid pattern
    posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
    posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
    parts = []
    print("Building proper xhtml for each file")
    for i in range(self.k8proc.getNumberOfParts()):
        part = self.k8proc.getPart(i)
        [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
        # internal links
        srcpieces = posfid_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith(b'<'):
                for m in posfid_index_pattern.finditer(tag):
                    posfid = m.group(1)
                    offset = m.group(2)
                    filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
                    if idtag == b'':
                        replacement = b'"' + utf8_str(filename) + b'"'
                    else:
                        replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
                    tag = posfid_index_pattern.sub(replacement, tag, 1)
                srcpieces[j] = tag
        part = b"".join(srcpieces)
        parts.append(part)
    # we are free to cut and paste as we see fit
    # we can safely remove all of the Kindlegen generated aid tags
    # change aid ids that are in k8proc.linked_aids to xhtml ids
    find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
    within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
    for i in range(len(parts)):
        part = parts[i]
        srcpieces = find_tag_with_aid_pattern.split(part)
        for j in range(len(srcpieces)):
            tag = srcpieces[j]
            if tag.startswith(b'<'):
                for m in within_tag_aid_position_pattern.finditer(tag):
                    try:
                        aid = m.group(1)
                    except IndexError:
                        aid = None
                    # keep only aids that are link targets, as real ids
                    replacement = b''
                    if aid in self.k8proc.linked_aids:
                        replacement = b' id="aid-' + aid + b'"'
                    tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
                srcpieces[j] = tag
        part = b"".join(srcpieces)
        parts[i] = part
    # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
    # with page-break-after style patterns
    find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
    within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
    for i in range(len(parts)):
        part = parts[i]
        srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
        for j in range(len(srcpieces)):
            tag = srcpieces[j]
            if tag.startswith(b'<'):
                srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
                    lambda m: b' style="page-break-after:' + m.group(1) + b'"', tag)
        part = b"".join(srcpieces)
        parts[i] = part
    # we have to handle substitutions for the flows pieces first as they may
    # be inlined into the xhtml text
    #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    #   kindle:flow:XXXX?mime=YYYY/ZZZ  (used for style sheets, svg images, etc)
    #   kindle:embed:XXXX   (used for fonts)
    flows = []
    flows.append(None)
    flowinfo = []
    flowinfo.append([None, None, None, None])
    # regular expression search patterns
    # NOTE(review): the [img\s|image\s] form is a character class, not an
    # alternation — it matches any of the listed characters; confirm intent.
    img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
    tag_pattern = re.compile(br'''(<[^>]*>)''')
    flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
    url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
    url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
    font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
    url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
    url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)
    for i in range(1, self.k8proc.getNumberOfFlows()):
        [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
        flowpart = self.k8proc.getFlow(i)
        # links to raster image files from image tags
        # image_pattern
        srcpieces = img_pattern.split(flowpart)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith(b'<im'):
                for m in img_index_pattern.finditer(tag):
                    imageNumber = fromBase32(m.group(1))
                    imageName = self.rscnames[imageNumber - 1]
                    if imageName is not None:
                        replacement = b'"../Images/' + utf8_str(imageName) + b'"'
                        self.used[imageName] = 'used'
                        tag = img_index_pattern.sub(replacement, tag, 1)
                    else:
                        print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
                srcpieces[j] = tag
        flowpart = b"".join(srcpieces)
        # replacements inside css url():
        srcpieces = url_pattern.split(flowpart)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            # process links to raster image files
            for m in url_img_index_pattern.finditer(tag):
                imageNumber = fromBase32(m.group(1))
                imageName = self.rscnames[imageNumber - 1]
                # preserve the original opening/closing separator characters
                osep = m.group()[0:1]
                csep = m.group()[-1:]
                if imageName is not None:
                    replacement = osep + b'../Images/' + utf8_str(imageName) + csep
                    self.used[imageName] = 'used'
                    tag = url_img_index_pattern.sub(replacement, tag, 1)
                else:
                    print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
            # process links to fonts
            for m in font_index_pattern.finditer(tag):
                fontNumber = fromBase32(m.group(1))
                fontName = self.rscnames[fontNumber - 1]
                osep = m.group()[0:1]
                csep = m.group()[-1:]
                if fontName is None:
                    print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag))
                else:
                    replacement = osep + b'../Fonts/' + utf8_str(fontName) + csep
                    tag = font_index_pattern.sub(replacement, tag, 1)
                    self.used[fontName] = 'used'
            # process links to other css pieces
            for m in url_css_index_pattern.finditer(tag):
                num = fromBase32(m.group(1))
                [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                tag = url_css_index_pattern.sub(replacement, tag, 1)
                self.used[fnm] = 'used'
            # process links to svg images
            for m in url_svg_image_pattern.finditer(tag):
                num = fromBase32(m.group(1))
                [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                tag = url_svg_image_pattern.sub(replacement, tag, 1)
                self.used[fnm] = 'used'
            srcpieces[j] = tag
        flowpart = b"".join(srcpieces)
        # store away in our own copy
        flows.append(flowpart)
    # I do not think this case exists and even if it does exist, it needs to be done in a separate
    # pass to prevent inlining a flow piece into another flow piece before the inserted one or the
    # target one has been fully processed
    # but keep it around if it ends up we do need it
    # flow pattern not inside url()
    # srcpieces = tag_pattern.split(flowpart)
    # for j in range(1, len(srcpieces),2):
    #     tag = srcpieces[j]
    #     if tag.startswith(b'<'):
    #         for m in flow_pattern.finditer(tag):
    #             num = fromBase32(m.group(1))
    #             [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
    #             flowtext = self.k8proc.getFlow(num)
    #             if fmt == b'inline':
    #                 tag = flowtext
    #             else:
    #                 replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
    #                 tag = flow_pattern.sub(replacement, tag, 1)
    #                 self.used[fnm] = 'used'
    #         srcpieces[j] = tag
    # flowpart = b"".join(srcpieces)
    # now handle the main text xhtml parts
    # Handle the flow items in the XHTML text pieces
    # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    tag_pattern = re.compile(br'''(<[^>]*>)''')
    flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
    for i in range(len(parts)):
        part = parts[i]
        [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
        # flow pattern
        srcpieces = tag_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith(b'<'):
                for m in flow_pattern.finditer(tag):
                    num = fromBase32(m.group(1))
                    if num > 0 and num < len(self.k8proc.flowinfo):
                        [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                        flowpart = flows[num]
                        if fmt == b'inline':
                            # inline flows replace the whole referencing tag
                            tag = flowpart
                        else:
                            replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                            tag = flow_pattern.sub(replacement, tag, 1)
                            self.used[fnm] = 'used'
                    else:
                        print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
                srcpieces[j] = tag
        part = b''.join(srcpieces)
        # store away modified version
        parts[i] = part
    # Handle any embedded raster images links in style= attributes urls
    style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
    for i in range(len(parts)):
        part = parts[i]
        [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
        # replace urls in style attributes
        srcpieces = style_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if b'kindle:embed' in tag:
                for m in img_index_pattern.finditer(tag):
                    imageNumber = fromBase32(m.group(1))
                    imageName = self.rscnames[imageNumber - 1]
                    osep = m.group()[0:1]
                    csep = m.group()[-1:]
                    if imageName is not None:
                        replacement = osep + b'../Images/' + utf8_str(imageName) + csep
                        self.used[imageName] = 'used'
                        tag = img_index_pattern.sub(replacement, tag, 1)
                    else:
                        print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
                srcpieces[j] = tag
        part = b"".join(srcpieces)
        # store away modified version
        parts[i] = part
    # Handle any embedded raster images links in the xhtml text
    # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
    for i in range(len(parts)):
        part = parts[i]
        [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
        # links to raster image files
        # image_pattern
        srcpieces = img_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith(b'<im'):
                for m in img_index_pattern.finditer(tag):
                    imageNumber = fromBase32(m.group(1))
                    imageName = self.rscnames[imageNumber - 1]
                    if imageName is not None:
                        replacement = b'"../Images/' + utf8_str(imageName) + b'"'
                        self.used[imageName] = 'used'
                        tag = img_index_pattern.sub(replacement, tag, 1)
                    else:
                        print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
                srcpieces[j] = tag
        part = b"".join(srcpieces)
        # store away modified version
        parts[i] = part
    # finally perform any general cleanups needed to make valid XHTML
    # these include:
    #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
    #   in svg tags replace "viewbox" attributes with "viewBox"
    #   in <li> remove value="XX" attributes since these are illegal
    tag_pattern = re.compile(br'''(<[^>]*>)''')
    li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)
    for i in range(len(parts)):
        part = parts[i]
        [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
        # tag pattern
        srcpieces = tag_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith(b'<svg') or tag.startswith(b'<SVG'):
                tag = tag.replace(b'preserveaspectratio', b'preserveAspectRatio')
                tag = tag.replace(b'viewbox', b'viewBox')
            elif tag.startswith(b'<li ') or tag.startswith(b'<LI '):
                tagpieces = li_value_pattern.split(tag)
                tag = b"".join(tagpieces)
            srcpieces[j] = tag
        part = b"".join(srcpieces)
        # store away modified version
        parts[i] = part
    # hand the rewritten flows and parts back to the processor
    self.k8proc.setFlows(flows)
    self.k8proc.setParts(parts)
    return self.used
def processMobi7(mh, metadata, sect, files, rscnames):
    """Convert an original (non-KF8) Mobi book to html and write its OPF.

    mh       -- mobi header object: supplies the raw ML, dictionary language
                info, and the book codec used to decode the guide text
    metadata -- metadata dict; may gain 'DictInLanguage'/'DictOutLanguage'
                entries for dictionaries, and is read for 'StartOffset'
    sect     -- section accessor, used only to build the dictionary position map
    files    -- file-location object (provides mobi7dir and the input basename)
    rscnames -- list of extracted resource names passed through to the
                HTML and OPF processors

    Side effects: writes book.html (and optionally a .rawml dump) into
    files.mobi7dir, writes the NCX, and writes the OPF via OPFProcessor.
    """
    global DUMP
    global WRITE_RAW_DATA
    # An original Mobi
    rawML = mh.getRawML()
    # optionally dump the raw markup for debugging
    if DUMP or WRITE_RAW_DATA:
        outraw = os.path.join(files.mobi7dir,files.getInputFileBasename() + '.rawml')
        with open(pathof(outraw),'wb') as f:
            f.write(rawML)

    # process the toc ncx
    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
    ncx = ncxExtract(mh, files)
    ncx_data = ncx.parseNCX()
    ncx.writeNCX(metadata)

    positionMap = {}

    # if Dictionary build up the positionMap
    if mh.isDictionary():
        if mh.DictInLanguage():
            metadata['DictInLanguage'] = [mh.DictInLanguage()]
        if mh.DictOutLanguage():
            metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
        positionMap = dictSupport(mh, sect).getPositionMap()

    # convert the rawml back to Mobi ml
    proc = HTMLProcessor(files, metadata, rscnames)
    srctext = proc.findAnchors(rawML, ncx_data, positionMap)
    srctext, usedmap = proc.insertHREFS()

    # write the proper mobi html
    fileinfo=[]
    # fname = files.getInputFileBasename() + '.html'
    fname = 'book.html'
    fileinfo.append([None,'', fname])
    outhtml = os.path.join(files.mobi7dir, fname)
    with open(pathof(outhtml), 'wb') as f:
        f.write(srctext)

    # extract guidetext from srctext
    guidetext =b''
    # no pagemap support for older mobis
    # pagemapxml = None
    guidematch = re.search(br'''<guide>(.*)</guide>''',srctext,re.IGNORECASE+re.DOTALL)
    if guidematch:
        guidetext = guidematch.group(1)
        # sometimes old mobi guide from srctext horribly written so need to clean up
        guidetext = guidetext.replace(b"\r", b"")
        guidetext = guidetext.replace(b'<REFERENCE', b'<reference')
        guidetext = guidetext.replace(b' HREF=', b' href=')
        guidetext = guidetext.replace(b' TITLE=', b' title=')
        guidetext = guidetext.replace(b' TYPE=', b' type=')
        # reference must be a self-closing tag
        # and any href must be replaced with filepos information
        ref_tag_pattern = re.compile(br'''(<reference [^>]*>)''', re.IGNORECASE)
        guidepieces = ref_tag_pattern.split(guidetext)
        # re.split with a capturing group places the matched tags at the
        # odd indices of the result list
        for i in range(1,len(guidepieces), 2):
            reftag = guidepieces[i]
            # remove any href there now to replace with filepos
            reftag = re.sub(br'''href\s*=[^'"]*['"][^'"]*['"]''',b'', reftag)
            # make sure the reference tag ends properly
            if not reftag.endswith(b"/>"):
                reftag = reftag[0:-1] + b"/>"
            guidepieces[i] = reftag
        guidetext = b''.join(guidepieces)
        # rewrite filepos=NNN attributes as href="book.html#fileposNNN"
        replacetext = br'''href="'''+utf8_str(fileinfo[0][2])+ br'''#filepos\1"'''
        guidetext = re.sub(br'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
        guidetext += b'\n'

    # add a "text" guide entry derived from the StartOffset metadata;
    # a value of 0xffffffff is replaced with '0' before use
    if 'StartOffset' in metadata:
        for value in metadata['StartOffset']:
            if int(value) == 0xffffffff:
                value = '0'
            starting_offset = value
        # NOTE(review): if metadata['StartOffset'] were ever an empty list,
        # starting_offset would be unbound here -- presumably the list is
        # never empty when the key is present; confirm upstream.
        # get guide items from metadata
        metaguidetext = b'<reference type="text" href="'+utf8_str(fileinfo[0][2])+b'#filepos'+utf8_str(starting_offset)+b'" />\n'
        guidetext += metaguidetext

    # OPFProcessor expects unicode guide text, so decode with the book codec
    if isinstance(guidetext, binary_type):
        guidetext = guidetext.decode(mh.codec)

    # create an OPF
    opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext)
    opf.writeOPF()
def main(argv=unicode_argv()):
    """Validate the launcher arguments, run the target plugin script inside
    a Wrapper/container sandbox, and emit the result xml on stdout.

    argv layout: [launcher, ebook_root, outdir, script_type, target_file].
    Returns 0 on success, -1 when argument or path validation fails (a
    failure wrapper xml is written via failed() in that case).
    """
    if len(argv) != 5:
        failed(None, msg="Launcher: improper number of arguments passed to launcher.py")
        return -1

    ebook_root, outdir, script_type, target_file = argv[1], argv[2], argv[3], argv[4]
    script_home = os.path.dirname(target_file)
    script_module = os.path.splitext(os.path.basename(target_file))[0]

    # do basic sanity checking anyway
    if script_type not in SUPPORTED_SCRIPT_TYPES:
        failed(None, msg="Launcher: script type %s is not supported" % script_type)
        return -1

    paths_ok = (unipath.exists(ebook_root) and unipath.isdir(ebook_root) and
                unipath.exists(outdir) and unipath.isdir(outdir) and
                unipath.exists(script_home) and unipath.isdir(script_home) and
                unipath.exists(target_file) and unipath.isfile(target_file))
    if not paths_ok:
        failed(None, msg="Launcher: missing or incorrect paths passed in")
        return -1

    # make the target module's home directory importable
    if script_home not in sys.path:
        sys.path.append(script_home)

    # load and parse opf if present
    op = None
    opf_path = os.path.join(ebook_root, "OEBPS", "content.opf")
    if unipath.exists(opf_path) and unipath.isfile(opf_path):
        op = Opf_Parser(opf_path)

    # create a wrapper for record keeping and safety
    record_keeper = Wrapper(ebook_root, outdir, op)

    # pick the container type matching the script type
    if script_type == "edit":
        container = BookContainer(record_keeper)
    elif script_type == "input":
        container = InputContainer(record_keeper)
    else:
        container = OutputContainer(record_keeper)

    # start the target script
    ps = ProcessScript(script_type, script_module, container)
    ps.launch()

    # collect and escape the target script's stdout and stderr
    successmsg = escapeit("".join(unicode_str(chunk) for chunk in ps.stdouttext))
    errorlog = escapeit("".join(unicode_str(chunk) for chunk in ps.stderrtext))

    # assemble the target's script wrapper xml; stderr is included only
    # on failure or when debugging, stdout only on success or debugging
    pieces = ["".join(ps.wrapout), "<msg>\n"]
    if ps.exitcode == 0:
        pieces.append(successmsg)
        if _DEBUG:
            pieces.append(errorlog)
    else:
        if _DEBUG:
            pieces.append(successmsg)
        pieces.append(errorlog)
    pieces.append("</msg>\n</wrapper>\n")
    resultxml = "".join(pieces)

    # write it to stdout and exit
    if PY3:
        sys.stdout.buffer.write(utf8_str(resultxml))
    else:
        sys.stdout.write(utf8_str(resultxml))
    return 0
def getGuideText(self): guidetext = b'' for [ref_type, ref_title, fileno] in self.guidetbl: if ref_type == b'thumbimagestandard': continue if ref_type not in _guide_types and not ref_type.startswith( b'other.'): if ref_type == b'start': ref_type = b'text' else: ref_type = b'other.' + ref_type [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno] [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos) idtext = self.getIDTag(pos) linktgt = filename.encode('utf-8') if idtext != b'': linktgt += b'#' + idtext guidetext += b'<reference type="' + ref_type + b'" title="' + ref_title + b'" href="' + utf8_str( pdir) + b'/' + linktgt + b'" />\n' # opf is encoded utf-8 so must convert any titles properly guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8") return guidetext