def complete(self, wc):
    if wc == 2:
        self.complete_input()
    elif wc == 3:
        self.complete_output()
    else:
        q = list(self.words[1:3])
        q = [os.path.splitext(x)[0 if x.startswith('.') else 1].partition('.')[-1].lower() for x in q]
        if not q[1]:
            q[1] = 'oeb'
        q = tuple(q)
        if q in self.cache:
            ans = [x for x in self.cache[q] if x.startswith(self.prefix)]
        else:
            from calibre.ebooks.conversion.cli import create_option_parser
            from calibre.utils.logging import Log
            log = Log()
            log.outputs = []
            ans = []
            if not self.prefix or self.prefix.startswith('-'):
                try:
                    parser, _ = create_option_parser(self.words[:3], log)
                    ans += list(get_opts_from_parser(parser, self.prefix))
                except:
                    pass
        if self.previous.startswith('-'):
            ans += list(files_and_dirs(self.prefix, None))
        send(ans)
def setup_pipeline(self, *args):
    oidx = self.groups.currentIndex().row()
    input_format = self.input_format
    output_format = self.output_format
    output_path = 'dummy.' + output_format
    log = Log()
    log.outputs = []
    input_file = 'dummy.' + input_format
    if input_format in ARCHIVE_FMTS:
        input_file = 'dummy.html'
    self.plumber = Plumber(input_file, output_path, log)

    def widget_factory(cls):
        return cls(self.stack, self.plumber.get_option_by_name,
                   self.plumber.get_option_help, self.db, self.book_id)

    self.mw = widget_factory(MetadataWidget)
    self.setWindowTitle(_('Convert') + ' ' + unicode(self.mw.title.text()))
    lf = widget_factory(LookAndFeelWidget)
    hw = widget_factory(HeuristicsWidget)
    sr = widget_factory(SearchAndReplaceWidget)
    ps = widget_factory(PageSetupWidget)
    sd = widget_factory(StructureDetectionWidget)
    toc = widget_factory(TOCWidget)
    from calibre.gui2.actions.toc_edit import SUPPORTED
    toc.manually_fine_tune_toc.setVisible(output_format.upper() in SUPPORTED)
    debug = widget_factory(DebugWidget)

    output_widget = self.plumber.output_plugin.gui_configuration_widget(
        self.stack, self.plumber.get_option_by_name,
        self.plumber.get_option_help, self.db, self.book_id)
    input_widget = self.plumber.input_plugin.gui_configuration_widget(
        self.stack, self.plumber.get_option_by_name,
        self.plumber.get_option_help, self.db, self.book_id)

    while True:
        c = self.stack.currentWidget()
        if not c:
            break
        self.stack.removeWidget(c)

    widgets = [self.mw, lf, hw, ps, sd, toc, sr]
    if input_widget is not None:
        widgets.append(input_widget)
    if output_widget is not None:
        widgets.append(output_widget)
    widgets.append(debug)
    for w in widgets:
        self.stack.addWidget(w)
        w.set_help_signal.connect(self.help.setPlainText)

    self._groups_model = GroupModel(widgets)
    self.groups.setModel(self._groups_model)

    idx = oidx if -1 < oidx < self._groups_model.rowCount() else 0
    self.groups.setCurrentIndex(self._groups_model.index(idx))
    self.stack.setCurrentIndex(idx)
    try:
        shutil.rmtree(self.plumber.archive_input_tdir, ignore_errors=True)
    except:
        pass
def main(args=sys.argv):
    log = Log()
    parser, plumber = create_option_parser(args, log)
    opts, leftover_args = parser.parse_args(args)
    if len(leftover_args) > 3:
        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
        return 1
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, abspath(getattr(opts, x)))
    if opts.search_replace:
        opts.search_replace = read_sr_patterns(opts.search_replace, log)

    recommendations = [(n.dest, getattr(opts, n.dest), OptionRecommendation.HIGH)
                       for n in parser.options_iter() if n.dest]
    plumber.merge_ui_recommendations(recommendations)

    try:
        plumber.run()
    except ConversionUserFeedBack as e:
        ll = {'info': log.info, 'warn': log.warn,
              'error': log.error}.get(e.level, log.info)
        ll(e.title)
        if e.det_msg:
            log.debug(e.det_msg)
        ll(e.msg)
        raise SystemExit(1)

    log(_('Output saved to'), ' ', plumber.output)

    return 0
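
# Hedged usage sketch (not part of the original source): main() above appears to be
# the ebook-convert command-line entry point; it expects argv-style arguments where
# args[0] is the program name, args[1] the input file and args[2] the output file.
# The file names below are placeholders.
if __name__ == '__main__':
    import sys
    sys.exit(main(['ebook-convert', 'input.epub', 'output.mobi']))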
def get_metadata(stream):
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.utils.magick.draw import save_cover_data_to
    from calibre import CurrentDir

    stream.seek(0)
    try:
        raw = stream.read(3)
    except:
        raw = ''
    stream.seek(0)
    if raw == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except:
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        # No EXTH header: for small enough files, parse the book content to
        # look for embedded metadata.
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4*1024*1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi

    # Try to extract the cover image
    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        try:
            data = mh.section_data(mh.first_image_index)
        except:
            data = ''
    if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}:
        try:
            mi.cover_data = ('jpg', save_cover_data_to(data, 'cover.jpg', return_data=True))
        except Exception:
            log.exception('Failed to read MOBI cover')
    return mi
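
# Hedged usage sketch (not part of the original source): read MOBI metadata from a
# file on disk with the get_metadata() above; 'book.mobi' is a placeholder path.
if __name__ == '__main__':
    with open('book.mobi', 'rb') as f:
        mi = get_metadata(f)
        print(mi.title, mi.authors)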
def __init__(self, files):
    s = Stream()
    self.log_stream = s.stream
    log = Log()
    log.outputs = [s]
    ContainerBase.__init__(self, log=log)
    self.mime_map = {k: self.guess_type(k) for k in files}
    self.files = files
def setup_pipeline(self, *args): oidx = self.groups.currentIndex().row() output_format = self.output_format input_path = 'dummy.epub' output_path = 'dummy.'+output_format log = Log() log.outputs = [] self.plumber = Plumber(input_path, output_path, log, merge_plugin_recs=False) def widget_factory(cls): return cls(self.stack, self.plumber.get_option_by_name, self.plumber.get_option_help, self.db) self.setWindowTitle(_('Bulk Convert')) lf = widget_factory(LookAndFeelWidget) hw = widget_factory(HeuristicsWidget) sr = widget_factory(SearchAndReplaceWidget) ps = widget_factory(PageSetupWidget) sd = widget_factory(StructureDetectionWidget) toc = widget_factory(TOCWidget) output_widget = None name = self.plumber.output_plugin.name.lower().replace(' ', '_') try: output_widget = importlib.import_module( 'calibre.gui2.convert.'+name) pw = output_widget.PluginWidget pw.ICON = I('back.png') pw.HELP = _('Options specific to the output format.') output_widget = widget_factory(pw) except ImportError: pass while True: c = self.stack.currentWidget() if not c: break self.stack.removeWidget(c) widgets = [lf, hw, ps, sd, toc, sr] if output_widget is not None: widgets.append(output_widget) for w in widgets: self.stack.addWidget(w) self.connect(w, SIGNAL('set_help(PyQt_PyObject)'), self.help.setPlainText) self._groups_model = GroupModel(widgets) self.groups.setModel(self._groups_model) idx = oidx if -1 < oidx < self._groups_model.rowCount() else 0 self.groups.setCurrentIndex(self._groups_model.index(idx)) self.stack.setCurrentIndex(idx) try: shutil.rmtree(self.plumber.archive_input_tdir, ignore_errors=True) except: pass
def setup_pipeline(self, *args): oidx = self.groups.currentIndex().row() output_format = self.output_format input_path = 'dummy.epub' output_path = 'dummy.'+output_format log = Log() log.outputs = [] self.plumber = Plumber(input_path, output_path, log, merge_plugin_recs=False) def widget_factory(cls): return cls(self.stack, self.plumber.get_option_by_name, self.plumber.get_option_help, self.db) self.setWindowTitle(_('Bulk Convert')) lf = widget_factory(LookAndFeelWidget) hw = widget_factory(HeuristicsWidget) sr = widget_factory(SearchAndReplaceWidget) ps = widget_factory(PageSetupWidget) sd = widget_factory(StructureDetectionWidget) toc = widget_factory(TOCWidget) toc.manually_fine_tune_toc.hide() output_widget = self.plumber.output_plugin.gui_configuration_widget( self.stack, self.plumber.get_option_by_name, self.plumber.get_option_help, self.db) while True: c = self.stack.currentWidget() if not c: break self.stack.removeWidget(c) widgets = [lf, hw, ps, sd, toc, sr] if output_widget is not None: widgets.append(output_widget) for w in widgets: self.stack.addWidget(w) self.connect(w, SIGNAL('set_help(PyQt_PyObject)'), self.help.setPlainText) self._groups_model = GroupModel(widgets) self.groups.setModel(self._groups_model) idx = oidx if -1 < oidx < self._groups_model.rowCount() else 0 self.groups.setCurrentIndex(self._groups_model.index(idx)) self.stack.setCurrentIndex(idx) try: shutil.rmtree(self.plumber.archive_input_tdir, ignore_errors=True) except: pass
def genesis(self, gui):
    log = Log()
    log.outputs = []
    self.plumber = Plumber('dummy.epub', 'dummy.epub', log, dummy=True,
                           merge_plugin_recs=False)

    def widget_factory(cls):
        return cls(self, self.plumber.get_option_by_name,
                   self.plumber.get_option_help, None, None)

    self.load_conversion_widgets()
    widgets = list(map(widget_factory, self.conversion_widgets))
    self.model = Model(widgets)
    self.list.setModel(self.model)

    for w in widgets:
        w.changed_signal.connect(self.changed_signal)
        self.stack.addWidget(w)

    self.list.currentChanged = self.category_current_changed
    self.list.setCurrentIndex(self.model.index(0))
def __init__(self, path): tmpdir = PersistentTemporaryDirectory("_kobo-driver-extended") zf = zipfile.ZipFile(path) zf.extractall(tmpdir) self.root = os.path.abspath(tmpdir) self.log = Log() self.dirtied = set([]) self.cache = {} self.mime_map = {} print("Container:__init__:Got container path {0}".format(self.root)) if os.path.exists(os.path.join(self.root, 'mimetype')): os.remove(os.path.join(self.root, 'mimetype')) container_path = os.path.join(self.root, 'META-INF', 'container.xml') if not os.path.exists(container_path): raise InvalidEpub('No META-INF/container.xml in epub') self.container = etree.fromstring(open(container_path, 'rb').read()) opf_files = self.container.xpath((r'child::ocf:rootfiles/ocf:rootfile[@media-type="{0}" and @full-path]'.format(guess_type('a.opf')[0])), namespaces = self.namespaces) if not opf_files: raise InvalidEpub('META-INF/container.xml contains no link to OPF file') opf_path = os.path.join(self.root, *opf_files[0].get('full-path').split('/')) if not os.path.exists(opf_path): raise InvalidEpub('OPF file does not exist at location pointed to by META-INF/container.xml') # Map of relative paths with / separators to absolute # paths on filesystem with os separators self.name_map = {} for dirpath, dirnames, filenames in os.walk(self.root): for f in filenames: path = os.path.join(dirpath, f) name = os.path.relpath(path, self.root).replace(os.sep, '/') self.name_map[name] = path self.mime_map[name] = guess_type(f)[0] if path == opf_path: self.opf_name = name self.mime_map[name] = guess_type('a.opf')[0] opf = self.opf for item in opf.xpath('//opf:manifest/opf:item[@href and @media-type]', namespaces = self.namespaces): href = unquote(item.get('href')) item.set("href", href) self.mime_map[self.href_to_name(href, os.path.dirname(self.opf_name).replace(os.sep, '/'))] = item.get('media-type') self.set(self.opf_name, opf)
def main(args=None):
    parser = option_parser()
    opts, args = parser.parse_args(args or sys.argv[1:])
    log = Log(level=Log.DEBUG if opts.verbose else Log.INFO)
    if not args:
        parser.print_help()
        log.error(_('You must provide the input file to polish'))
        raise SystemExit(1)
    if len(args) > 2:
        parser.print_help()
        log.error(_('Unknown extra arguments'))
        raise SystemExit(1)
    if len(args) == 1:
        inbook = args[0]
        base, ext = inbook.rpartition('.')[0::2]
        outbook = base + '_polished.' + ext
    else:
        inbook, outbook = args

    popts = ALL_OPTS.copy()
    for k, v in popts.iteritems():
        popts[k] = getattr(opts, k, None)

    O = namedtuple('Options', ' '.join(popts.iterkeys()))
    popts = O(**popts)
    report = []
    if not tuple(filter(None, (getattr(popts, name) for name in ALL_OPTS))):
        parser.print_help()
        log.error(_('You must specify at least one action to perform'))
        raise SystemExit(1)

    polish({inbook: outbook}, popts, log, report.append)

    log('')
    log(REPORT)
    for msg in report:
        log(msg)

    log('Output written to:', outbook)
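
# Hedged sketch (not part of the original source): the same polish() call driven
# programmatically rather than via the CLI above, mirroring the smarten_punctuation
# usage that appears elsewhere in this collection; the epub paths are placeholders.
from collections import namedtuple
from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
from calibre.utils.logging import Log

data = {'smarten_punctuation': True}
opts = ALL_OPTS.copy()
opts.update(data)
Options = namedtuple('Options', ' '.join(ALL_OPTS))
report = []
polish({'in.epub': 'out.epub'}, Options(**opts), Log(level=Log.DEBUG), report.append)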
from calibre.library import db
from calibre.utils.logging import Log
import json
from itertools import chain

log = Log()
mydb = db('~/Archive/CalTest/').new_api

from calibre_plugins.crossref_doi_download import DoiMeta
from calibre.ebooks.metadata.book.base import Metadata
from calibre_plugins.crossref_doi_download.doi_reader import DoiReader
from crossref_fields import USED_FIELDS, IGNORED_FIELDS, COND_FIELDS, check_uninterpreted_fields, check_uninterpreted_list
# from calibre_plugins.crossref_doi_download.crossref_fields import USED_FIELDS, IGNORED_FIELDS, COND_FIELDS, check_uninterpreted_fields

dm = DoiMeta('./plugin/')
url1 = 'https://api.crossref.org/works/10.1002/bmc.835?mailto=vikoya5988%40oniaj.com'


def get_prop_list(results, prop):
    for ind, res in enumerate(results):
        if prop in res:
            print(ind, res[prop])


reader = DoiReader(log)
br = dm.browser
br.set_debug_http(True)
# user_agent =
# [('User-agent',
# 'Mozilla/5.0 (X11;U;Linux 2.4.2.-2 i586; en-us;m18) Gecko/200010131 Netscape6/6.01'
def do_download_for_worker(book, options, merge, notification=lambda x, y: x): ''' Child job, to download story when run as a worker job ''' from calibre_plugins.fanficfare_plugin import FanFicFareBase fffbase = FanFicFareBase(options['plugin_path']) with fffbase: # so the sys.path was modified while loading the # plug impl. from calibre_plugins.fanficfare_plugin.dialogs import NotGoingToDownload from calibre_plugins.fanficfare_plugin.prefs import ( SAVE_YES, SAVE_YES_UNLESS_SITE, OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY, CALIBREONLYSAVECOL) from calibre_plugins.fanficfare_plugin.wordcount import get_word_count from fanficfare import adapters, writers from fanficfare.epubutils import get_update_data from fanficfare.six import text_type as unicode from calibre_plugins.fanficfare_plugin.fff_util import get_fff_config try: logger.info("\n\n" + ("-" * 80) + " " + book['url']) ## No need to download at all. Can happen now due to ## collision moving into book for CALIBREONLY changing to ## ADDNEW when story URL not in library. if book['collision'] in (CALIBREONLY, CALIBREONLYSAVECOL): logger.info("Skipping CALIBREONLY 'update' down inside worker") return book book['comment'] = _('Download started...') configuration = get_fff_config(book['url'], options['fileform'], options['personal.ini']) if not options[ 'updateepubcover'] and 'epub_for_update' in book and book[ 'collision'] in (UPDATE, UPDATEALWAYS): configuration.set("overrides", "never_make_cover", "true") # images only for epub, html, even if the user mistakenly # turned it on else where. if options['fileform'] not in ("epub", "html"): configuration.set("overrides", "include_images", "false") adapter = adapters.getAdapter(configuration, book['url']) adapter.is_adult = book['is_adult'] adapter.username = book['username'] adapter.password = book['password'] adapter.setChaptersRange(book['begin'], book['end']) ## each site download job starts with a new copy of the ## cookiejar and basic_cache from the FG process. They ## are not shared between different sites' BG downloads if configuration.getConfig('use_browser_cache'): if 'browser_cache' in options: configuration.set_browser_cache(options['browser_cache']) else: options['browser_cache'] = configuration.get_browser_cache( ) if 'browser_cachefile' in options: options['browser_cache'].load_cache( options['browser_cachefile']) if 'basic_cache' in options: configuration.set_basic_cache(options['basic_cache']) else: options['basic_cache'] = configuration.get_basic_cache() options['basic_cache'].load_cache(options['basic_cachefile']) if 'cookiejar' in options: configuration.set_cookiejar(options['cookiejar']) else: options['cookiejar'] = configuration.get_cookiejar() options['cookiejar'].load_cookiejar(options['cookiejarfile']) story = adapter.getStoryMetadataOnly() if not story.getMetadata("series") and 'calibre_series' in book: adapter.setSeries(book['calibre_series'][0], book['calibre_series'][1]) # set PI version instead of default. 
if 'version' in options: story.setMetadata('version', options['version']) book['title'] = story.getMetadata("title", removeallentities=True) book['author_sort'] = book['author'] = story.getList( "author", removeallentities=True) book['publisher'] = story.getMetadata("publisher") book['url'] = story.getMetadata("storyUrl", removeallentities=True) book['tags'] = story.getSubjectTags(removeallentities=True) book['comments'] = story.get_sanitized_description() book['series'] = story.getMetadata("series", removeallentities=True) if story.getMetadataRaw('datePublished'): book['pubdate'] = story.getMetadataRaw( 'datePublished').replace(tzinfo=local_tz) if story.getMetadataRaw('dateUpdated'): book['updatedate'] = story.getMetadataRaw( 'dateUpdated').replace(tzinfo=local_tz) if story.getMetadataRaw('dateCreated'): book['timestamp'] = story.getMetadataRaw( 'dateCreated').replace(tzinfo=local_tz) else: book['timestamp'] = datetime.now().replace( tzinfo=local_tz) # need *something* there for calibre. writer = writers.getWriter(options['fileform'], configuration, adapter) outfile = book['outfile'] ## checks were done earlier, it's new or not dup or newer--just write it. if book['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \ ('epub_for_update' not in book and book['collision'] in (UPDATE, UPDATEALWAYS)): # preserve logfile even on overwrite. if 'epub_for_update' in book: adapter.logfile = get_update_data( book['epub_for_update'])[6] # change the existing entries id to notid so # write_epub writes a whole new set to indicate overwrite. if adapter.logfile: adapter.logfile = adapter.logfile.replace( "span id", "span notid") if book['collision'] == OVERWRITE and 'fileupdated' in book: lastupdated = story.getMetadataRaw('dateUpdated') fileupdated = book['fileupdated'] # updated doesn't have time (or is midnight), use dates only. # updated does have time, use full timestamps. if (lastupdated.time() == time.min and fileupdated.date() > lastupdated.date()) or \ (lastupdated.time() != time.min and fileupdated > lastupdated): raise NotGoingToDownload( _("Not Overwriting, web site is not newer."), 'edit-undo.png', showerror=False) logger.info("write to %s" % outfile) inject_cal_cols(book, story, configuration) writer.writeStory(outfilename=outfile, forceOverwrite=True, notification=notification) if adapter.story.chapter_error_count > 0: book['comment'] = _('Download %(fileform)s completed, %(failed)s failed chapters, %(total)s total chapters.')%\ {'fileform':options['fileform'], 'failed':adapter.story.chapter_error_count, 'total':story.getMetadata("numChapters")} book[ 'chapter_error_count'] = adapter.story.chapter_error_count else: book['comment'] = _('Download %(fileform)s completed, %(total)s chapters.')%\ {'fileform':options['fileform'], 'total':story.getMetadata("numChapters")} book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata() ## checks were done earlier, just update it. elif 'epub_for_update' in book and book['collision'] in ( UPDATE, UPDATEALWAYS): # update now handled by pre-populating the old images and # chapters in the adapter rather than merging epubs. #urlchaptercount = int(story.getMetadata('numChapters').replace(',','')) # returns int adjusted for start-end range. 
urlchaptercount = story.getChapterCount() (url, chaptercount, adapter.oldchapters, adapter.oldimgs, adapter.oldcover, adapter.calibrebookmark, adapter.logfile, adapter.oldchaptersmap, adapter.oldchaptersdata) = get_update_data( book['epub_for_update'])[0:9] # dup handling from fff_plugin needed for anthology updates. if book['collision'] == UPDATE: if chaptercount == urlchaptercount: if merge: book['comment'] = _( "Already contains %d chapters. Reuse as is." ) % chaptercount book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata( ) book['outfile'] = book[ 'epub_for_update'] # for anthology merge ops. return book else: # not merge, raise NotGoingToDownload( _("Already contains %d chapters.") % chaptercount, 'edit-undo.png', showerror=False) elif chaptercount > urlchaptercount: raise NotGoingToDownload( _("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update." ) % (chaptercount, urlchaptercount), 'dialog_error.png') elif chaptercount == 0: raise NotGoingToDownload( _("FanFicFare doesn't recognize chapters in existing epub, epub is probably from a different source. Use Overwrite to force update." ), 'dialog_error.png') if not (book['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \ and adapter.getConfig("do_update_hook"): chaptercount = adapter.hookForUpdates(chaptercount) logger.info("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount)) logger.info("write to %s" % outfile) inject_cal_cols(book, story, configuration) writer.writeStory(outfilename=outfile, forceOverwrite=True, notification=notification) if adapter.story.chapter_error_count > 0: book['comment'] = _('Update %(fileform)s completed, added %(added)s chapters, %(failed)s failed chapters, for %(total)s total.')%\ {'fileform':options['fileform'], 'failed':adapter.story.chapter_error_count, 'added':(urlchaptercount-chaptercount), 'total':urlchaptercount} book[ 'chapter_error_count'] = adapter.story.chapter_error_count else: book['comment'] = _('Update %(fileform)s completed, added %(added)s chapters for %(total)s total.')%\ {'fileform':options['fileform'],'added':(urlchaptercount-chaptercount),'total':urlchaptercount} book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata() else: ## Shouldn't ever get here, but hey, it happened once ## before with prefs['collision'] raise Exception( "Impossible state reached -- Book: %s:\nOptions:%s:" % (book, options)) if options['do_wordcount'] == SAVE_YES or ( options['do_wordcount'] == SAVE_YES_UNLESS_SITE and not story.getMetadataRaw('numWords')): try: wordcount = get_word_count(outfile) # logger.info("get_word_count:%s"%wordcount) story.setMetadata('numWords', wordcount) writer.writeStory(outfilename=outfile, forceOverwrite=True) book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata() except: logger.error("WordCount failed") if options['smarten_punctuation'] and options['fileform'] == "epub" \ and calibre_version >= (0, 9, 39): # for smarten punc from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS from calibre.utils.logging import Log from collections import namedtuple # do smarten_punctuation from calibre's polish feature data = {'smarten_punctuation': True} opts = ALL_OPTS.copy() opts.update(data) O = namedtuple('Options', ' 
'.join(six.iterkeys(ALL_OPTS))) opts = O(**opts) log = Log(level=Log.DEBUG) polish({outfile: outfile}, opts, log, logger.info) except NotGoingToDownload as d: book['good'] = False book['status'] = _('Bad') book['showerror'] = d.showerror book['comment'] = unicode(d) book['icon'] = d.icon except Exception as e: book['good'] = False book['status'] = _('Error') book['comment'] = unicode(e) book['icon'] = 'dialog_error.png' book['status'] = _('Error') logger.info("Exception: %s:%s" % (book, book['comment']), exc_info=True) return book
def run(self, opts): from calibre.utils.serialize import msgpack_dumps scripts = {} for x in ('console', 'gui'): for name in basenames[x]: if name in ('calibre-complete', 'calibre_postinstall'): continue scripts[name] = x dest = self.j(self.RESOURCES, 'scripts.calibre_msgpack') if self.newer(dest, self.j(self.SRC, 'calibre', 'linux.py')): self.info('\tCreating ' + self.b(dest)) with open(dest, 'wb') as f: f.write(msgpack_dumps(scripts)) from calibre.web.feeds.recipes.collection import \ serialize_builtin_recipes, iterate_over_builtin_recipe_files files = [x[1] for x in iterate_over_builtin_recipe_files()] dest = self.j(self.RESOURCES, 'builtin_recipes.xml') if self.newer(dest, files): self.info('\tCreating builtin_recipes.xml') xml = serialize_builtin_recipes() with open(dest, 'wb') as f: f.write(xml) recipe_icon_dir = self.a(self.j(self.RESOURCES, '..', 'recipes', 'icons')) dest = os.path.splitext(dest)[0] + '.zip' files += glob.glob(self.j(recipe_icon_dir, '*.png')) if self.newer(dest, files): self.info('\tCreating builtin_recipes.zip') with zipfile.ZipFile(dest, 'w', zipfile.ZIP_STORED) as zf: for n in sorted(files, key=self.b): with open(n, 'rb') as f: zf.writestr(self.b(n), f.read()) dest = self.j(self.RESOURCES, 'ebook-convert-complete.calibre_msgpack') files = [] for x in os.walk(self.j(self.SRC, 'calibre')): for f in x[-1]: if f.endswith('.py'): files.append(self.j(x[0], f)) if self.newer(dest, files): self.info('\tCreating ' + self.b(dest)) complete = {} from calibre.ebooks.conversion.plumber import supported_input_formats complete['input_fmts'] = set(supported_input_formats()) from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles complete['input_recipes'] = [t+'.recipe ' for t in get_builtin_recipe_titles()] from calibre.customize.ui import available_output_formats complete['output'] = set(available_output_formats()) from calibre.ebooks.conversion.cli import create_option_parser from calibre.utils.logging import Log log = Log() # log.outputs = [] for inf in supported_input_formats(): if inf in ('zip', 'rar', 'oebzip'): continue for ouf in available_output_formats(): of = ouf if ouf == 'oeb' else 'dummy.'+ouf p = create_option_parser(('ec', 'dummy1.'+inf, of, '-h'), log)[0] complete[(inf, ouf)] = [x+' 'for x in get_opts_from_parser(p)] with open(dest, 'wb') as f: f.write(msgpack_dumps(only_unicode_recursive(complete))) self.info('\tCreating template-functions.json') dest = self.j(self.RESOURCES, 'template-functions.json') function_dict = {} import inspect from calibre.utils.formatter_functions import formatter_functions for obj in formatter_functions().get_builtins().values(): eval_func = inspect.getmembers(obj, lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate') try: lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]] except: continue lines = ''.join(lines) function_dict[obj.name] = lines dump_json(function_dict, dest) self.info('\tCreating editor-functions.json') dest = self.j(self.RESOURCES, 'editor-functions.json') function_dict = {} from calibre.gui2.tweak_book.function_replace import builtin_functions for func in builtin_functions(): try: src = ''.join(inspect.getsourcelines(func)[0][1:]) except Exception: continue src = src.replace('def ' + func.__name__, 'def replace') imports = ['from %s import %s' % (x.__module__, x.__name__) for x in func.imports] if imports: src = '\n'.join(imports) + '\n\n' + src function_dict[func.name] = src dump_json(function_dict, dest) self.info('\tCreating user-manual-translation-stats.json') d = {} 
for lc, stats in iteritems(json.load(open(self.j(self.d(self.SRC), 'manual', 'locale', 'completed.json')))): total = sum(itervalues(stats)) d[lc] = stats['translated'] / float(total) dump_json(d, self.j(self.RESOURCES, 'user-manual-translation-stats.json')) src = self.j(self.SRC, '..', 'Changelog.txt') dest = self.j(self.RESOURCES, 'changelog.json') if self.newer(dest, [src]): self.info('\tCreating changelog.json') from setup.changelog import parse with open(src) as f: dump_json(parse(f.read(), parse_dates=False), dest)
class Container(object): META_INF = { 'container.xml' : True, 'manifest.xml' : False, 'encryption.xml' : False, 'metadata.xml' : False, 'signatures.xml' : False, 'rights.xml' : False, } OCF_NS = 'urn:oasis:names:tc:opendocument:xmlns:container' OPF_NS = 'http://www.idpf.org/2007/opf' NCX_NS = "http://www.daisy.org/z3986/2005/ncx/" DC_NS = "http://purl.org/dc/elements/1.1/" XHTML_NS = "http://www.w3.org/1999/xhtml" OPF_MIMETYPE = 'application/oebps-package+xml' NCX_MIMETYPE = "application/x-dtbncx+xml" def __init__(self, path): tmpdir = PersistentTemporaryDirectory("_kobo-driver-extended") zf = zipfile.ZipFile(path) zf.extractall(tmpdir) self.root = os.path.abspath(tmpdir) self.log = Log() self.dirtied = set([]) self.cache = {} self.mime_map = {} print("Got container path {0}".format(self.root)) if os.path.exists(os.path.join(self.root, 'mimetype')): os.remove(os.path.join(self.root, 'mimetype')) container_path = os.path.join(self.root, 'META-INF', 'container.xml') if not os.path.exists(container_path): raise InvalidEpub('No META-INF/container.xml in epub') self.container = etree.fromstring(open(container_path, 'rb').read()) opf_files = self.container.xpath((r'child::ocf:rootfiles/ocf:rootfile[@media-type="{0}" and @full-path]'.format(guess_type('a.opf')[0])), namespaces = {'ocf': self.OCF_NS}) if not opf_files: raise InvalidEpub('META-INF/container.xml contains no link to OPF file') opf_path = os.path.join(self.root, *opf_files[0].get('full-path').split('/')) if not os.path.exists(opf_path): raise InvalidEpub('OPF file does not exist at location pointed to by META-INF/container.xml') # Map of relative paths with / separators to absolute # paths on filesystem with os separators self.name_map = {} for dirpath, dirnames, filenames in os.walk(self.root): for f in filenames: path = os.path.join(dirpath, f) name = os.path.relpath(path, self.root).replace(os.sep, '/') self.name_map[name] = path if path == opf_path: self.opf_name = name self.mime_map[name] = guess_type('a.opf')[0] for item in self.opf.xpath('//opf:manifest/opf:item[@href and @media-type]', namespaces = {'opf': self.OPF_NS}): href = item.get('href') self.mime_map[self.href_to_name(href, posixpath.dirname(self.opf_name))] = item.get('media-type') def get_html_names(self): """A generator function that yields only HTML file names from the ePub. """ for name in self.name_map.keys(): ext = name[name.lower().rfind('.'):].lower() if ext in HTML_EXTENSIONS: yield name def is_drm_encrypted(self): """Determine if the ePub container is encumbered with Digital Restrictions Management. This method looks for the 'encryption.xml' file which denotes an ePub encumbered by Digital Restrictions Management. DRM-encumbered files cannot be edited. 
""" if 'META-INF/encryption.xml' in self.name_map: try: xml = self.get('META-INF/encryption.xml') if not xml: return True # Even if encryption.xml can't be parsed, assume its presence means an encumbered file for elem in xml.xpath('.//*[contains(name(), "EncryptionMethod")]'): alg = elem.get('Algorithm') return alg != 'http://ns.adobe.com/pdf/enc#RC' except: self.log.error("Could not parse encryption.xml") return True # If encryption.xml is present, assume the file is encumbered return False def manifest_worthy_names(self): for name in self.name_map: if name.endswith('.opf'): continue if name.startswith('META-INF') and posixpath.basename(name) in self.META_INF: continue yield name def delete_name(self, name): self.mime_map.pop(name, None) path = self.name_map[name] os.remove(path) self.name_map.pop(name) def manifest_item_for_name(self, name): href = self.name_to_href(name, posixpath.dirname(self.opf_name)) q = prepare_string_for_xml(href, attribute = True) existing = self.opf.xpath('//opf:manifest/opf:item[@href="{0}"]'.format(q), namespaces = {'opf': self.OPF_NS}) if not existing: return None return existing[0] def add_name_to_manifest(self, name, mt = None): item = self.manifest_item_for_name(name) if item is not None: return manifest = self.opf.xpath('//opf:manifest', namespaces = {'opf': self.OPF_NS})[0] item = manifest.makeelement('{%s}item' % self.OPF_NS, nsmap = {'opf': self.OPF_NS}, href = self.name_to_href(name, posixpath.dirname(self.opf_name)), id = self.generate_manifest_id()) if not mt: mt = guess_type(posixpath.basename(name))[0] if not mt: mt = 'application/octest-stream' item.set('media-type', mt) manifest.append(item) self.fix_tail(item) def fix_tail(self, item): ''' Designed only to work with self closing elements after item has just been inserted/appended ''' parent = item.getparent() idx = parent.index(item) if idx == 0: item.tail = parent.text else: item.tail = parent[idx - 1].tail if idx == len(parent) - 1: parent[idx - 1].tail = parent.text def generate_manifest_id(self): items = self.opf.xpath('//opf:manifest/opf:item[@id]', namespaces = {'opf': self.OPF_NS}) ids = set([x.get('id') for x in items]) for x in xrange(sys.maxint): c = 'id{0}'.format(x) if c not in ids: return c @property def opf(self): return self.get(self.opf_name) def href_to_name(self, href, base = ''): """Changed to fix a bug which incorrectly splits the href on '#' when '#' is part of the file name. Also normalizes the path. Taken from the calibre Modify Epub plugin's Container implementation. """ hash_index = href.find('#') period_index = href.find('.') if hash_index > 0 and hash_index > period_index: href = href.partition('#')[0] href = urllib.unquote(href) name = href if base: name = posixpath.join(base, href) name = os.path.normpath(name).replace('\\', '/') return name def name_to_href(self, name, base): """Changed to ensure that blank href names are referenced as the empty string instead of '.'. Taken from the calibre Modify Epub plugin's Container implementation. 
""" if not base: return name href = posixpath.relpath(name, base) if href == '.': href = '' return href def decode(self, data): """Automatically decode :param:`data` into a `unicode` object.""" def fix_data(d): return d.replace('\r\n', '\n').replace('\r', '\n') if isinstance(data, unicode): return fix_data(data) bom_enc = None if data[:4] in ('\0\0\xfe\xff', '\xff\xfe\0\0'): bom_enc = {'\0\0\xfe\xff':'utf-32-be', '\xff\xfe\0\0':'utf-32-le'}[data[:4]] data = data[4:] elif data[:2] in ('\xff\xfe', '\xfe\xff'): bom_enc = {'\xff\xfe':'utf-16-le', '\xfe\xff':'utf-16-be'}[data[:2]] data = data[2:] elif data[:3] == '\xef\xbb\xbf': bom_enc = 'utf-8' data = data[3:] if bom_enc is not None: try: return fix_data(data.decode(bom_enc)) except UnicodeDecodeError: pass try: return fix_data(data.decode('utf-8')) except UnicodeDecodeError: pass data, _ = xml_to_unicode(data) return fix_data(data) def get_raw(self, name): path = self.name_map[name] return open(path, 'rb').read() def get(self, name): if name in self.cache: return self.cache[name] raw = self.get_raw(name) raw = self.decode(raw) if name in self.mime_map: try: raw = self._parse(raw, self.mime_map[name]) except XMLSyntaxError as err: raise ParseError(name, unicode(err)) self.cache[name] = raw return raw def set(self, name, val): self.cache[name] = val self.dirtied.add(name) def _parse(self, raw, mimetype): mt = mimetype.lower() if mt.endswith('+xml'): parser = etree.XMLParser(no_network = True, huge_tree = not iswindows) raw = xml_to_unicode(raw, strip_encoding_pats = True, assume_utf8 = True, resolve_entities = True)[0].strip() idx = raw.find('<html') if idx == -1: idx = raw.find('<HTML') if idx > -1: pre = raw[:idx] raw = raw[idx:] if '<!DOCTYPE' in pre: user_entities = {} for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre): val = match.group(2) if val.startswith('"') and val.endswith('"'): val = val[1:-1] user_entities[match.group(1)] = val if user_entities: pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys()))) raw = pat.sub(lambda m:user_entities[m.group(1)], raw) return etree.fromstring(raw, parser = parser) return raw def write(self, path): for name in self.dirtied: data = self.cache[name] if hasattr(data, 'xpath'): data = etree.tostring(data, encoding = 'UTF-8', xml_declaration = True, pretty_print = True) f = open(self.name_map[name], "wb") f.write(data) f.close() self.dirtied.clear() if os.path.exists(path): os.unlink(path) epub = zipfile.ZipFile(path, 'w', compression = zipfile.ZIP_DEFLATED) epub.writestr('mimetype', bytes(guess_type('a.epub')[0]), compress_type = zipfile.ZIP_STORED) cwd = os.getcwdu() os.chdir(self.root) zip_prefix = self.root if not zip_prefix.endswith(os.sep): zip_prefix += os.sep for t in os.walk(self.root, topdown = True): for f in t[2]: if f not in EXCLUDE_FROM_ZIP: filepath = os.path.join(t[0], f).replace(zip_prefix, '') st = os.stat(filepath) mtime = time.localtime(st.st_mtime) if mtime[0] < 1980: os.utime(filepath, None) epub.write(filepath) epub.close() os.chdir(cwd)
def main(args=sys.argv):
    log = Log()
    parser, plumber = create_option_parser(args, log)
    opts, leftover_args = parser.parse_args(args)
    if len(leftover_args) > 3:
        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
        return 1
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, abspath(getattr(opts, x)))
    if opts.search_replace:
        opts.search_replace = read_sr_patterns(opts.search_replace, log)
    if opts.transform_css_rules:
        from calibre.ebooks.css_transform_rules import import_rules, validate_rule
        with open(opts.transform_css_rules, 'rb') as tcr:
            opts.transform_css_rules = rules = list(import_rules(tcr.read()))
            for rule in rules:
                title, msg = validate_rule(rule)
                if title and msg:
                    log.error('Failed to parse CSS transform rules')
                    log.error(title)
                    log.error(msg)
                    return 1

    recommendations = [(n.dest, getattr(opts, n.dest), OptionRecommendation.HIGH)
                       for n in parser.options_iter() if n.dest]
    plumber.merge_ui_recommendations(recommendations)

    try:
        plumber.run()
    except ConversionUserFeedBack as e:
        ll = {'info': log.info, 'warn': log.warn,
              'error': log.error}.get(e.level, log.info)
        ll(e.title)
        if e.det_msg:
            log.debug(e.det_msg)
        ll(e.msg)
        raise SystemExit(1)

    log(_('Output saved to'), ' ', plumber.output)

    return 0
def main(args=sys.argv):
    log = Log()
    parser, plumber = create_option_parser(args, log)
    opts, leftover_args = parser.parse_args(args)
    if len(leftover_args) > 3:
        log.error('Extra arguments not understood:', ', '.join(leftover_args[3:]))
        return 1
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, abspath(getattr(opts, x)))
    if opts.search_replace:
        opts.search_replace = read_sr_patterns(opts.search_replace, log)
    if opts.transform_css_rules:
        from calibre.ebooks.css_transform_rules import import_rules, validate_rule
        with open(opts.transform_css_rules, 'rb') as tcr:
            opts.transform_css_rules = rules = list(import_rules(tcr.read()))
            for rule in rules:
                title, msg = validate_rule(rule)
                if title and msg:
                    log.error('Failed to parse CSS transform rules')
                    log.error(title)
                    log.error(msg)
                    return 1
    if opts.transform_html_rules:
        from calibre.ebooks.html_transform_rules import import_rules, validate_rule
        with open(opts.transform_html_rules, 'rb') as tcr:
            opts.transform_html_rules = rules = list(import_rules(tcr.read()))
            for rule in rules:
                title, msg = validate_rule(rule)
                if title and msg:
                    log.error('Failed to parse HTML transform rules')
                    log.error(title)
                    log.error(msg)
                    return 1

    recommendations = [(n.dest, getattr(opts, n.dest), OptionRecommendation.HIGH)
                       for n in parser.options_iter() if n.dest]
    plumber.merge_ui_recommendations(recommendations)

    try:
        plumber.run()
    except ConversionUserFeedBack as e:
        ll = {
            'info': log.info, 'warn': log.warn,
            'error': log.error
        }.get(e.level, log.info)
        ll(e.title)
        if e.det_msg:
            log.debug(e.det_msg)
        ll(e.msg)
        raise SystemExit(1)

    log(_('Output saved to'), ' ', plumber.output)

    return 0
def run(self, opts): scripts = {} for x in ('console', 'gui'): for name in basenames[x]: if name in ('calibre-complete', 'calibre_postinstall'): continue scripts[name] = x dest = self.j(self.RESOURCES, 'scripts.pickle') if self.newer(dest, self.j(self.SRC, 'calibre', 'linux.py')): self.info('\tCreating scripts.pickle') f = open(dest, 'wb') cPickle.dump(scripts, f, -1) from calibre.web.feeds.recipes.collection import \ serialize_builtin_recipes, iterate_over_builtin_recipe_files files = [x[1] for x in iterate_over_builtin_recipe_files()] dest = self.j(self.RESOURCES, 'builtin_recipes.xml') if self.newer(dest, files): self.info('\tCreating builtin_recipes.xml') xml = serialize_builtin_recipes() with open(dest, 'wb') as f: f.write(xml) recipe_icon_dir = self.a( self.j(self.RESOURCES, '..', 'recipes', 'icons')) dest = os.path.splitext(dest)[0] + '.zip' files += glob.glob(self.j(recipe_icon_dir, '*.png')) if self.newer(dest, files): self.info('\tCreating builtin_recipes.zip') with zipfile.ZipFile(dest, 'w', zipfile.ZIP_STORED) as zf: for n in sorted(files, key=self.b): with open(n, 'rb') as f: zf.writestr(os.path.basename(n), f.read()) dest = self.j(self.RESOURCES, 'ebook-convert-complete.pickle') files = [] for x in os.walk(self.j(self.SRC, 'calibre')): for f in x[-1]: if f.endswith('.py'): files.append(self.j(x[0], f)) if self.newer(dest, files): self.info('\tCreating ebook-convert-complete.pickle') complete = {} from calibre.ebooks.conversion.plumber import supported_input_formats complete['input_fmts'] = set(supported_input_formats()) from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles complete['input_recipes'] = [ t + '.recipe ' for t in get_builtin_recipe_titles() ] from calibre.customize.ui import available_output_formats complete['output'] = set(available_output_formats()) from calibre.ebooks.conversion.cli import create_option_parser from calibre.utils.logging import Log log = Log() # log.outputs = [] for inf in supported_input_formats(): if inf in ('zip', 'rar', 'oebzip'): continue for ouf in available_output_formats(): of = ouf if ouf == 'oeb' else 'dummy.' + ouf p = create_option_parser(('ec', 'dummy1.' + inf, of, '-h'), log)[0] complete[(inf, ouf)] = [ x + ' ' for x in get_opts_from_parser(p) ] cPickle.dump(complete, open(dest, 'wb'), -1) self.info('\tCreating template-functions.json') dest = self.j(self.RESOURCES, 'template-functions.json') function_dict = {} import inspect from calibre.utils.formatter_functions import formatter_functions for obj in formatter_functions().get_builtins().values(): eval_func = inspect.getmembers( obj, lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate') try: lines = [ l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0] ] except: continue lines = ''.join(lines) function_dict[obj.name] = lines import json json.dump(function_dict, open(dest, 'wb'), indent=4)
def do_download_for_worker(book, options, merge, notification=lambda x, y: x): ''' Child job, to download story when run as a worker job ''' from calibre_plugins.fanficfare_plugin import FanFicFareBase fffbase = FanFicFareBase(options['plugin_path']) with fffbase: from calibre_plugins.fanficfare_plugin.dialogs import ( NotGoingToDownload, OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY, CALIBREONLYSAVECOL) from calibre_plugins.fanficfare_plugin.fanficfare import adapters, writers, exceptions from calibre_plugins.fanficfare_plugin.fanficfare.epubutils import get_update_data from calibre_plugins.fanficfare_plugin.fff_util import ( get_fff_adapter, get_fff_config) try: book['comment'] = _('Download started...') configuration = get_fff_config(book['url'], options['fileform'], options['personal.ini']) if configuration.getConfig('use_ssl_unverified_context'): ## monkey patch to avoid SSL bug. dupliated from ## fff_plugin.py because bg jobs run in own process ## space. import ssl if hasattr(ssl, '_create_unverified_context'): ssl._create_default_https_context = ssl._create_unverified_context if not options[ 'updateepubcover'] and 'epub_for_update' in book and options[ 'collision'] in (UPDATE, UPDATEALWAYS): configuration.set("overrides", "never_make_cover", "true") # images only for epub, html, even if the user mistakenly # turned it on else where. if options['fileform'] not in ("epub", "html"): configuration.set("overrides", "include_images", "false") adapter = adapters.getAdapter(configuration, book['url']) adapter.is_adult = book['is_adult'] adapter.username = book['username'] adapter.password = book['password'] adapter.setChaptersRange(book['begin'], book['end']) configuration.load_cookiejar(options['cookiejarfile']) #logger.debug("cookiejar:%s"%configuration.cookiejar) configuration.set_pagecache(options['pagecache']) story = adapter.getStoryMetadataOnly() if not story.getMetadata("series") and 'calibre_series' in book: adapter.setSeries(book['calibre_series'][0], book['calibre_series'][1]) # set PI version instead of default. if 'version' in options: story.setMetadata('version', options['version']) book['title'] = story.getMetadata("title", removeallentities=True) book['author_sort'] = book['author'] = story.getList( "author", removeallentities=True) book['publisher'] = story.getMetadata("site") book['url'] = story.getMetadata("storyUrl") book['tags'] = story.getSubjectTags(removeallentities=True) book['comments'] = story.get_sanitized_description() book['series'] = story.getMetadata("series", removeallentities=True) if story.getMetadataRaw('datePublished'): book['pubdate'] = story.getMetadataRaw( 'datePublished').replace(tzinfo=local_tz) if story.getMetadataRaw('dateUpdated'): book['updatedate'] = story.getMetadataRaw( 'dateUpdated').replace(tzinfo=local_tz) if story.getMetadataRaw('dateCreated'): book['timestamp'] = story.getMetadataRaw( 'dateCreated').replace(tzinfo=local_tz) else: book['timestamp'] = datetime.now( ) # need *something* there for calibre. writer = writers.getWriter(options['fileform'], configuration, adapter) outfile = book['outfile'] ## No need to download at all. Shouldn't ever get down here. if options['collision'] in (CALIBREONLY, CALIBREONLYSAVECOL): logger.info( "Skipping CALIBREONLY 'update' down inside worker--this shouldn't be happening..." 
) book['comment'] = _('Metadata collected.') book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata() ## checks were done earlier, it's new or not dup or newer--just write it. elif options['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \ ('epub_for_update' not in book and options['collision'] in (UPDATE, UPDATEALWAYS)): # preserve logfile even on overwrite. if 'epub_for_update' in book: adapter.logfile = get_update_data( book['epub_for_update'])[6] # change the existing entries id to notid so # write_epub writes a whole new set to indicate overwrite. if adapter.logfile: adapter.logfile = adapter.logfile.replace( "span id", "span notid") if options['collision'] == OVERWRITE and 'fileupdated' in book: lastupdated = story.getMetadataRaw('dateUpdated') fileupdated = book['fileupdated'] # updated doesn't have time (or is midnight), use dates only. # updated does have time, use full timestamps. if (lastupdated.time() == time.min and fileupdated.date() > lastupdated.date()) or \ (lastupdated.time() != time.min and fileupdated > lastupdated): raise NotGoingToDownload( _("Not Overwriting, web site is not newer."), 'edit-undo.png', showerror=False) logger.info("write to %s" % outfile) inject_cal_cols(book, story, configuration) writer.writeStory(outfilename=outfile, forceOverwrite=True) book['comment'] = _('Download %s completed, %s chapters.') % ( options['fileform'], story.getMetadata("numChapters")) book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata() ## checks were done earlier, just update it. elif 'epub_for_update' in book and options['collision'] in ( UPDATE, UPDATEALWAYS): # update now handled by pre-populating the old images and # chapters in the adapter rather than merging epubs. #urlchaptercount = int(story.getMetadata('numChapters').replace(',','')) # returns int adjusted for start-end range. urlchaptercount = story.getChapterCount() (url, chaptercount, adapter.oldchapters, adapter.oldimgs, adapter.oldcover, adapter.calibrebookmark, adapter.logfile, adapter.oldchaptersmap, adapter.oldchaptersdata) = get_update_data( book['epub_for_update'])[0:9] # dup handling from fff_plugin needed for anthology updates. if options['collision'] == UPDATE: if chaptercount == urlchaptercount: if merge: book['comment'] = _( "Already contains %d chapters. Reuse as is." ) % chaptercount book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata( ) book['outfile'] = book[ 'epub_for_update'] # for anthology merge ops. return book else: # not merge, raise NotGoingToDownload( _("Already contains %d chapters.") % chaptercount, 'edit-undo.png', showerror=False) elif chaptercount > urlchaptercount: raise NotGoingToDownload( _("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update." ) % (chaptercount, urlchaptercount), 'dialog_error.png') elif chaptercount == 0: raise NotGoingToDownload( _("FanFicFare doesn't recognize chapters in existing epub, epub is probably from a different source. Use Overwrite to force update." 
), 'dialog_error.png') if not (options['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \ and adapter.getConfig("do_update_hook"): chaptercount = adapter.hookForUpdates(chaptercount) logger.info("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount)) logger.info("write to %s" % outfile) inject_cal_cols(book, story, configuration) writer.writeStory(outfilename=outfile, forceOverwrite=True) book['comment'] = _('Update %s completed, added %s chapters for %s total.')%\ (options['fileform'],(urlchaptercount-chaptercount),urlchaptercount) book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata() if options['do_wordcount'] == SAVE_YES or ( options['do_wordcount'] == SAVE_YES_UNLESS_SITE and not story.getMetadataRaw('numWords')): wordcount = get_word_count(outfile) logger.info("get_word_count:%s" % wordcount) story.setMetadata('numWords', wordcount) writer.writeStory(outfilename=outfile, forceOverwrite=True) book['all_metadata'] = story.getAllMetadata( removeallentities=True) if options['savemetacol'] != '': book['savemetacol'] = story.dump_html_metadata() if options['smarten_punctuation'] and options['fileform'] == "epub" \ and calibre_version >= (0, 9, 39): # for smarten punc from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS from calibre.utils.logging import Log from collections import namedtuple # do smarten_punctuation from calibre's polish feature data = {'smarten_punctuation': True} opts = ALL_OPTS.copy() opts.update(data) O = namedtuple('Options', ' '.join(ALL_OPTS.iterkeys())) opts = O(**opts) log = Log(level=Log.DEBUG) polish({outfile: outfile}, opts, log, logger.info) except NotGoingToDownload as d: book['good'] = False book['showerror'] = d.showerror book['comment'] = unicode(d) book['icon'] = d.icon except Exception as e: book['good'] = False book['comment'] = unicode(e) book['icon'] = 'dialog_error.png' book['status'] = _('Error') logger.info("Exception: %s:%s" % (book, unicode(e)), exc_info=True) #time.sleep(10) return book
def get_metadata(stream): from calibre.ebooks.metadata import MetaInformation from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.mobi.reader.headers import MetadataHeader from calibre.ebooks.mobi.reader.mobi6 import MobiReader from calibre import CurrentDir try: from PIL import Image as PILImage PILImage except ImportError: import Image as PILImage stream.seek(0) try: raw = stream.read(3) except: raw = '' stream.seek(0) if raw == b'TPZ': from calibre.ebooks.metadata.topaz import get_metadata return get_metadata(stream) from calibre.utils.logging import Log log = Log() try: mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')]) except: mi = MetaInformation(_('Unknown'), [_('Unknown')]) mh = MetadataHeader(stream, log) if mh.title and mh.title != _('Unknown'): mi.title = mh.title if mh.exth is not None: if mh.exth.mi is not None: mi = mh.exth.mi else: size = 1024**3 if hasattr(stream, 'seek') and hasattr(stream, 'tell'): pos = stream.tell() stream.seek(0, 2) size = stream.tell() stream.seek(pos) if size < 4*1024*1024: with TemporaryDirectory('_mobi_meta_reader') as tdir: with CurrentDir(tdir): mr = MobiReader(stream, log) parse_cache = {} mr.extract_content(tdir, parse_cache) if mr.embedded_mi is not None: mi = mr.embedded_mi if hasattr(mh.exth, 'cover_offset'): cover_index = mh.first_image_index + mh.exth.cover_offset data = mh.section_data(int(cover_index)) else: try: data = mh.section_data(mh.first_image_index) except: data = '' buf = cStringIO.StringIO(data) try: im = PILImage.open(buf) except: log.exception('Failed to read MOBI cover') else: obuf = cStringIO.StringIO() im.convert('RGB').save(obuf, format='JPEG') mi.cover_data = ('jpg', obuf.getvalue()) return mi
def get_metadata(stream): from calibre.ebooks.metadata import MetaInformation from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.mobi.reader.headers import MetadataHeader from calibre.ebooks.mobi.reader.mobi6 import MobiReader from calibre import CurrentDir try: from PIL import Image as PILImage PILImage except ImportError: import Image as PILImage stream.seek(0) try: raw = stream.read(3) except: raw = '' stream.seek(0) if raw == b'TPZ': from calibre.ebooks.metadata.topaz import get_metadata return get_metadata(stream) from calibre.utils.logging import Log log = Log() try: mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')]) except: mi = MetaInformation(_('Unknown'), [_('Unknown')]) mh = MetadataHeader(stream, log) if mh.title and mh.title != _('Unknown'): mi.title = mh.title if mh.exth is not None: if mh.exth.mi is not None: mi = mh.exth.mi else: size = 1024**3 if hasattr(stream, 'seek') and hasattr(stream, 'tell'): pos = stream.tell() stream.seek(0, 2) size = stream.tell() stream.seek(pos) if size < 4 * 1024 * 1024: with TemporaryDirectory('_mobi_meta_reader') as tdir: with CurrentDir(tdir): mr = MobiReader(stream, log) parse_cache = {} mr.extract_content(tdir, parse_cache) if mr.embedded_mi is not None: mi = mr.embedded_mi if hasattr(mh.exth, 'cover_offset'): cover_index = mh.first_image_index + mh.exth.cover_offset data = mh.section_data(int(cover_index)) else: try: data = mh.section_data(mh.first_image_index) except: data = '' buf = cStringIO.StringIO(data) try: im = PILImage.open(buf) except: log.exception('Failed to read MOBI cover') else: obuf = cStringIO.StringIO() im.convert('RGB').save(obuf, format='JPEG') mi.cover_data = ('jpg', obuf.getvalue()) return mi
def create_fetcher(options, image_map={}, log=None):
    if log is None:
        log = Log(level=Log.DEBUG) if options.verbose else Log()
    # Pass the supplied image_map through instead of discarding it.
    return RecursiveFetcher(options, log, image_map=image_map)
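
# Hedged sketch (assumption, not from the original source): the options object is
# normally built by calibre's web-fetch command-line parser, and RecursiveFetcher
# then downloads a page tree starting from the given URL; the URL is a placeholder.
from calibre.web.fetch.simple import option_parser

opts, args = option_parser().parse_args(['http://example.com'])
fetcher = create_fetcher(opts)
fetcher.start_fetch(args[0])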
def setup_pipeline(self, *args): oidx = self.groups.currentIndex().row() input_format = self.input_format output_format = self.output_format output_path = 'dummy.'+output_format log = Log() log.outputs = [] input_file = 'dummy.'+input_format if input_format in ARCHIVE_FMTS: input_file = 'dummy.html' self.plumber = Plumber(input_file, output_path, log) def widget_factory(cls): return cls(self.stack, self.plumber.get_option_by_name, self.plumber.get_option_help, self.db, self.book_id) self.mw = widget_factory(MetadataWidget) self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text())) lf = widget_factory(LookAndFeelWidget) hw = widget_factory(HeuristicsWidget) sr = widget_factory(SearchAndReplaceWidget) ps = widget_factory(PageSetupWidget) sd = widget_factory(StructureDetectionWidget) toc = widget_factory(TOCWidget) from calibre.gui2.actions.toc_edit import SUPPORTED toc.manually_fine_tune_toc.setVisible(output_format.upper() in SUPPORTED) debug = widget_factory(DebugWidget) output_widget = self.plumber.output_plugin.gui_configuration_widget( self.stack, self.plumber.get_option_by_name, self.plumber.get_option_help, self.db, self.book_id) input_widget = self.plumber.input_plugin.gui_configuration_widget( self.stack, self.plumber.get_option_by_name, self.plumber.get_option_help, self.db, self.book_id) while True: c = self.stack.currentWidget() if not c: break self.stack.removeWidget(c) widgets = [self.mw, lf, hw, ps, sd, toc, sr] if input_widget is not None: widgets.append(input_widget) if output_widget is not None: widgets.append(output_widget) widgets.append(debug) for w in widgets: self.stack.addWidget(w) self.connect(w, SIGNAL('set_help(PyQt_PyObject)'), self.help.setPlainText) self._groups_model = GroupModel(widgets) self.groups.setModel(self._groups_model) idx = oidx if -1 < oidx < self._groups_model.rowCount() else 0 self.groups.setCurrentIndex(self._groups_model.index(idx)) self.stack.setCurrentIndex(idx) try: shutil.rmtree(self.plumber.archive_input_tdir, ignore_errors=True) except: pass
class Container(object): META_INF = { 'container.xml' : True, 'manifest.xml' : False, 'encryption.xml' : False, 'metadata.xml' : False, 'signatures.xml' : False, 'rights.xml' : False, } acceptable_encryption_algorithms = ( 'http://ns.adobe.com/pdf/enc#RC' ) namespaces = { 'opf': 'http://www.idpf.org/2007/opf', 'ocf': 'urn:oasis:names:tc:opendocument:xmlns:container', 'ncx': 'http://www.daisy.org/z3986/2005/ncx/', 'dc': 'http://purl.org/dc/elements/1.1/', 'xhtml': 'http://www.w3.org/1999/xhtml', 'enc': 'http://www.w3.org/2001/04/xmlenc#', 'deenc': 'http://ns.adobe.com/digitaleditions/enc', 'xml': 'http://www.w3.org/XML/1998/namespace' } OPF_MIMETYPE = 'application/oebps-package+xml' NCX_MIMETYPE = "application/x-dtbncx+xml" def __init__(self, path): tmpdir = PersistentTemporaryDirectory("_kobo-driver-extended") zf = zipfile.ZipFile(path) zf.extractall(tmpdir) self.root = os.path.abspath(tmpdir) self.log = Log() self.dirtied = set([]) self.cache = {} self.mime_map = {} print("Container:__init__:Got container path {0}".format(self.root)) if os.path.exists(os.path.join(self.root, 'mimetype')): os.remove(os.path.join(self.root, 'mimetype')) container_path = os.path.join(self.root, 'META-INF', 'container.xml') if not os.path.exists(container_path): raise InvalidEpub('No META-INF/container.xml in epub') self.container = etree.fromstring(open(container_path, 'rb').read()) opf_files = self.container.xpath((r'child::ocf:rootfiles/ocf:rootfile[@media-type="{0}" and @full-path]'.format(guess_type('a.opf')[0])), namespaces = self.namespaces) if not opf_files: raise InvalidEpub('META-INF/container.xml contains no link to OPF file') opf_path = os.path.join(self.root, *opf_files[0].get('full-path').split('/')) if not os.path.exists(opf_path): raise InvalidEpub('OPF file does not exist at location pointed to by META-INF/container.xml') # Map of relative paths with / separators to absolute # paths on filesystem with os separators self.name_map = {} for dirpath, dirnames, filenames in os.walk(self.root): for f in filenames: path = os.path.join(dirpath, f) name = os.path.relpath(path, self.root).replace(os.sep, '/') self.name_map[name] = path self.mime_map[name] = guess_type(f)[0] if path == opf_path: self.opf_name = name self.mime_map[name] = guess_type('a.opf')[0] opf = self.opf for item in opf.xpath('//opf:manifest/opf:item[@href and @media-type]', namespaces = self.namespaces): href = unquote(item.get('href')) item.set("href", href) self.mime_map[self.href_to_name(href, os.path.dirname(self.opf_name).replace(os.sep, '/'))] = item.get('media-type') self.set(self.opf_name, opf) def get_html_names(self): """A generator function that yields only HTML file names from the ePub. """ for node in self.opf.xpath('//opf:manifest/opf:item[@href and @media-type]', namespaces = self.namespaces): if node.get("media-type") in HTML_MIMETYPES: href = os.path.join(os.path.dirname(self.opf_name), node.get("href")) href = os.path.normpath(href).replace(os.sep, '/') yield href @property def is_drm_encumbered(self): """Determine if the ePub container is encumbered with Digital Restrictions Management. This method looks for the 'encryption.xml' file which denotes an ePub encumbered by Digital Restrictions Management. DRM-encumbered files cannot be edited. 
""" is_encumbered = False if 'META-INF/encryption.xml' in self.name_map: try: xml = self.get('META-INF/encryption.xml') if xml is None: return True # If encryption.xml can't be parsed, assume its presence means an encumbered file for elem in xml.xpath('./enc:EncryptedData/enc:EncryptionMethod[@Algorithm]', namespaces = self.namespaces): alg = elem.get('Algorithm') # Anything not in acceptable_encryption_algorithms is a sign of an # encumbered file. if alg not in self.acceptable_encryption_algorithms: is_encumbered = True except Exception as e: self.log.error("Could not parse encryption.xml: " + e.message) raise return is_encumbered def manifest_worthy_names(self): for name in self.name_map: if name.endswith('.opf'): continue if name.startswith('META-INF') and os.path.basename(name) in self.META_INF: continue yield name def delete_name(self, name): self.mime_map.pop(name, None) path = self.name_map[name] os.remove(path) self.name_map.pop(name) def manifest_item_for_name(self, name): href = self.name_to_href(name, os.path.dirname(self.opf_name)) q = prepare_string_for_xml(href, attribute = True) existing = self.opf.xpath('//opf:manifest/opf:item[@href="{0}"]'.format(q), namespaces = self.namespaces) if not existing: return None return existing[0] def add_name_to_manifest(self, name, mt = None): item = self.manifest_item_for_name(name) if item is not None: return self.log("Adding '{0}' to the manifest".format(name)) manifest = self.opf.xpath('//opf:manifest', namespaces = self.namespaces)[0] item = manifest.makeelement('{%s}item' % self.namespaces['opf'], href = self.name_to_href(name, os.path.dirname(self.opf_name)), id = self.generate_manifest_id()) if not mt: mt = guess_type(os.path.basename(name))[0] if not mt: mt = 'application/octest-stream' item.set('media-type', mt) manifest.append(item) self.fix_tail(item) self.set(self.opf_name, self.opf) self.name_map[name] = os.path.join(self.root, name) self.mime_map[name] = mt def fix_tail(self, item): ''' Designed only to work with self closing elements after item has just been inserted/appended ''' parent = item.getparent() idx = parent.index(item) if idx == 0: item.tail = parent.text else: item.tail = parent[idx - 1].tail if idx == len(parent) - 1: parent[idx - 1].tail = parent.text def copy_file_to_container(self, path, name = None, mt = None): '''Copy a file into this Container instance. @param path: The path to the file to copy into this Container. @param name: The name to give to the copied file, relative to the Container root. Set to None to use the basename of path. @param mt: The MIME type of the file to set in the manifest. Set to None to auto-detect. @return: The name of the file relative to the Container root ''' if path is None or re.match(r'^\s*$', path, re.MULTILINE): raise ValueError("A source path must be given") if name is None: name = os.path.basename(path) self.log("Copying file '{0}' to '{1}'".format(path, os.path.join(self.root, name))) shutil.copy(path, os.path.join(self.root, name)) self.add_name_to_manifest(name, mt) return name def add_content_file_reference(self, name): '''Add a reference to the named file (from self.name_map) to all content files (self.get_html_names()). Currently only CSS files with a MIME type of text/css and JavaScript files with a MIME type of application/x-javascript are supported. 
        '''
        if name not in self.name_map or name not in self.mime_map:
            raise ValueError("A valid file name must be given (got: {0})".format(name))
        for file in self.get_html_names():
            root = self.get(file)
            if not root:
                self.log("Could not retrieve content file {0}".format(file))
                continue
            head = root.xpath('./xhtml:head', namespaces=self.namespaces)
            if not head:
                self.log("Could not find a <head> element in content file {0}".format(file))
                continue
            head = head[0]
            if not head:
                self.log("A <head> section was found but was undefined in content file {0}".format(file))
                continue
            if self.mime_map[name] == guess_type('a.css')[0]:
                elem = head.makeelement("{%s}link" % self.namespaces['xhtml'],
                        rel='stylesheet',
                        href=os.path.relpath(name, os.path.dirname(file)).replace(os.sep, '/'))
            elif self.mime_map[name] == guess_type('a.js')[0]:
                elem = head.makeelement("{%s}script" % self.namespaces['xhtml'],
                        type='text/javascript',
                        src=os.path.relpath(name, os.path.dirname(file)).replace(os.sep, '/'))
            else:
                elem = None
            if elem is not None:
                head.append(elem)
                if self.mime_map[name] == guess_type('a.css')[0]:
                    self.fix_tail(elem)
                self.set(file, root)

    def generate_manifest_id(self):
        items = self.opf.xpath('//opf:manifest/opf:item[@id]', namespaces=self.namespaces)
        ids = set([x.get('id') for x in items])
        for x in xrange(sys.maxint):
            c = 'id{0}'.format(x)
            if c not in ids:
                return c

    @property
    def opf(self):
        return self.get(self.opf_name)

    def href_to_name(self, href, base=''):
        """Changed to fix a bug which incorrectly splits the href on '#' when
        '#' is part of the file name. Also normalizes the path.

        Taken from the calibre Modify Epub plugin's Container implementation.
        """
        hash_index = href.find('#')
        period_index = href.find('.')
        if hash_index > 0 and hash_index > period_index:
            href = href.partition('#')[0]
        href = unquote(href)
        name = href
        if base:
            name = os.path.join(base, href)
        name = os.path.normpath(name).replace(os.sep, '/')
        return name

    def name_to_href(self, name, base):
        """Changed to ensure that blank href names are referenced as the
        empty string instead of '.'.

        Taken from the calibre Modify Epub plugin's Container implementation.
""" if not base: return name href = os.path.relpath(name, base).replace(os.sep, '/') if href == '.': href = '' return href def decode(self, data): """Automatically decode :param:`data` into a `unicode` object.""" def fix_data(d): return d.replace('\r\n', '\n').replace('\r', '\n') if isinstance(data, unicode): return fix_data(data) bom_enc = None if data[:4] in ('\0\0\xfe\xff', '\xff\xfe\0\0'): bom_enc = {'\0\0\xfe\xff':'utf-32-be', '\xff\xfe\0\0':'utf-32-le'}[data[:4]] data = data[4:] elif data[:2] in ('\xff\xfe', '\xfe\xff'): bom_enc = {'\xff\xfe':'utf-16-le', '\xfe\xff':'utf-16-be'}[data[:2]] data = data[2:] elif data[:3] == '\xef\xbb\xbf': bom_enc = 'utf-8' data = data[3:] if bom_enc is not None: try: return fix_data(data.decode(bom_enc)) except UnicodeDecodeError: pass try: return fix_data(data.decode('utf-8')) except UnicodeDecodeError: pass data, _ = xml_to_unicode(data) return fix_data(data) def get_raw(self, name): path = self.name_map[name] return open(path, 'rb').read() def get(self, name): if name in self.cache: val = self.cache[name] if not hasattr(val, 'xpath'): val = self._parse(val, self.mime_map[name]) return val raw = self.get_raw(name) raw = self.decode(raw) if name in self.mime_map: try: raw = self._parse(raw, self.mime_map[name]) except XMLSyntaxError as err: raise ParseError(name, unicode(err)) self.cache[name] = raw return raw def set(self, name, val): self.cache[name] = val self.dirtied.add(name) def _parse(self, raw, mimetype): mt = mimetype.lower() if mt.endswith('xml'): parser = etree.XMLParser(no_network = True, huge_tree = not iswindows) raw = xml_to_unicode(raw, strip_encoding_pats = True, assume_utf8 = True, resolve_entities = True)[0].strip() idx = raw.find('<html') if idx == -1: idx = raw.find('<HTML') if idx > -1: pre = raw[:idx] raw = raw[idx:] if '<!DOCTYPE' in pre: user_entities = {} for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre): val = match.group(2) if val.startswith('"') and val.endswith('"'): val = val[1:-1] user_entities[match.group(1)] = val if user_entities: pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys()))) raw = pat.sub(lambda m:user_entities[m.group(1)], raw) return etree.fromstring(raw, parser = parser) return raw def write(self, path): for name in self.dirtied: data = self.cache[name] if hasattr(data, 'xpath'): data = etree.tostring(data, encoding = 'UTF-8', xml_declaration = True, pretty_print = True) data = string.replace(data, u"\uFFFD", "") f = open(self.name_map[name], "wb") f.write(data) f.close() self.dirtied.clear() if os.path.exists(path): os.unlink(path) epub = zipfile.ZipFile(path, 'w', compression = zipfile.ZIP_DEFLATED) epub.writestr('mimetype', bytes(guess_type('a.epub')[0]), compress_type = zipfile.ZIP_STORED) cwd = os.getcwdu() os.chdir(self.root) zip_prefix = self.root if not zip_prefix.endswith(os.sep): zip_prefix += os.sep for t in os.walk(self.root, topdown = True): for f in t[2]: if f not in EXCLUDE_FROM_ZIP: filepath = os.path.join(t[0], f).replace(zip_prefix, '') st = os.stat(filepath) mtime = time.localtime(st.st_mtime) if mtime[0] < 1980: os.utime(filepath, None) epub.write(filepath) epub.close() os.chdir(cwd) def __hyphenate_node(self, elem, hyphenator, hyphen = u'\u00AD'): if elem is None: return None if isinstance(elem, basestring): newstr = [] for w in elem.split(): if len(w) > 3 and '-' not in w and hyphen not in w: w = hyphenator.inserted(w, hyphen = hyphen) newstr.append(w) elem = " ".join(newstr) else: if elem.text is None and elem.tail is None: # If we get here, there's only child 
                for node in elem.xpath('./node()'):
                    node = self.__hyphenate_node(node, hyphenator, hyphen)
            else:
                elem.text = self.__hyphenate_node(elem.text, hyphenator, hyphen)
                if elem.text is not None:
                    elem.text += u" "
            elem.tail = self.__hyphenate_node(elem.tail, hyphenator, hyphen)
        return elem

    def hyphenate(self, hyphenator, hyphen=u'\u00AD'):
        if hyphenator is None or hyphen is None or hyphen == '':
            return False
        for name in self.get_html_names():
            self.log("Hyphenating file {0}".format(name))
            root = self.get(name)
            for node in root.xpath(
                    "./xhtml:body//xhtml:span[starts-with(@id, 'kobo.')]",
                    namespaces=self.namespaces):
                node = self.__hyphenate_node(node, hyphenator, hyphen)
            self.set(name, root)
        return True

    def smarten_punctuation(self):
        preprocessor = HeuristicProcessor(log=self.log)
        for name in self.get_html_names():
            html = self.get_raw(name)
            html = html.encode("UTF-8")
            # Fix non-breaking space indents
            html = preprocessor.fix_nbsp_indents(html)
            # Smarten punctuation
            html = smartyPants(html)
            # Ellipsis to HTML entity
            html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html)
            # Double-dash and unicode char code to em-dash
            html = string.replace(html, '---', ' – ')
            html = string.replace(html, u"\x97", ' – ')
            html = string.replace(html, '--', ' — ')
            html = string.replace(html, u"\u2014", ' — ')
            html = string.replace(html, u"\u2013", ' – ')
            html = string.replace(html, u"...", "…")
            # Remove Unicode replacement characters
            html = string.replace(html, u"\uFFFD", "")
            self.set(name, html)

    def clean_markup(self):
        preprocessor = HeuristicProcessor(log=self.log)
        for name in self.get_html_names():
            html = self.get_raw(name)
            html = html.encode("UTF-8")
            html = string.replace(html, u"\u2014", ' -- ')
            html = string.replace(html, u"\u2013", ' --- ')
            html = string.replace(html, u"\x97", ' --- ')
            html = preprocessor.cleanup_markup(html)
            # Remove Unicode replacement characters
            html = string.replace(html, u"\uFFFD", "")
            self.set(name, html)
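Putting the pieces of the class together, a typical round trip looks roughly like the following sketch. The file paths are placeholders, and the hyphenation step is omitted because hyphenate() needs an external hyphenator object exposing an inserted(word, hyphen=...) method, as __hyphenate_node expects:

# Unpack the EPUB into a temporary directory and parse its OPF.
container = Container('book.epub')

# DRM-encumbered books cannot be modified, so bail out early.
if container.is_drm_encumbered:
    raise SystemExit('Cannot modify a DRM-encumbered book')

# Typographic clean-up of every HTML content file, then repackage
# the container into a new EPUB.
container.smarten_punctuation()
container.write('book-out.epub')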