def main(basedir=None, query=None): from calibre import prints from calibre.utils.terminal import ColoredStream if basedir is None: try: basedir = input_unicode('Enter directory to scan [%s]: ' % getcwd() ).strip() or getcwd() except (EOFError, KeyboardInterrupt): return m = FilesystemMatcher(basedir) emph = ColoredStream(sys.stdout, fg='red', bold=True) while True: if query is None: try: query = input_unicode('Enter query: ') except (EOFError, KeyboardInterrupt): break if not query: break for path, positions in islice(iteritems(m(query)), 0, 10): positions = list(positions) p = 0 while positions: pos = positions.pop(0) if pos == -1: continue prints(path[p:pos], end='') ch = get_char(path, pos) with emph: prints(ch, end='') p = pos + len(ch) prints(path[p:]) query = None
def get_images(self, stream, tdir, top_level=False): images = [] imgs = [] if top_level: imgs = glob.glob(os.path.join(tdir, '*.png')) # Images not in top level try bookname_img directory because # that's where Dropbook likes to see them. if not imgs: if hasattr(stream, 'name'): imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png')) # No images in Dropbook location try generic images directory if not imgs: imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png')) if imgs: os.makedirs(os.path.join(getcwd(), 'images')) for img in imgs: pimg_name = os.path.basename(img) pimg_path = os.path.join(getcwd(), 'images', pimg_name) images.append('images/' + pimg_name) shutil.copy(img, pimg_path) return images
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.utils.zipfile import ZipFile self.options = options self.log = log pages, images = [], [] toc = TOC() if file_ext == 'pmlz': log.debug('De-compressing content to temporary directory...') with TemporaryDirectory('_unpmlz') as tdir: zf = ZipFile(stream) zf.extractall(tdir) pmls = glob.glob(os.path.join(tdir, '*.pml')) for pml in pmls: html_name = os.path.splitext(os.path.basename(pml))[0]+'.html' html_path = os.path.join(getcwd(), html_name) pages.append(html_name) log.debug('Processing PML item %s...' % pml) ttoc = self.process_pml(pml, html_path) toc += ttoc images = self.get_images(stream, tdir, True) else: toc = self.process_pml(stream, 'index.html') pages.append('index.html') if hasattr(stream, 'name'): images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name))) # We want pages to be orded alphabetically. pages.sort() manifest_items = [] for item in pages+images: manifest_items.append((item, None)) from calibre.ebooks.metadata.meta import get_metadata log.debug('Reading metadata from input file...') mi = get_metadata(stream, 'pml') if 'images/cover.png' in images: mi.cover = 'images/cover.png' opf = OPFCreator(getcwd(), mi) log.debug('Generating manifest...') opf.create_manifest(manifest_items) opf.create_spine(pages) opf.set_toc(toc) with lopen('metadata.opf', 'wb') as opffile: with lopen('toc.ncx', 'wb') as tocfile: opf.render(opffile, tocfile, 'toc.ncx') return os.path.join(getcwd(), 'metadata.opf')
def convert_new(self, stream, accelerators): from calibre.ebooks.pdf.pdftohtml import pdftohtml from calibre.utils.cleantext import clean_ascii_chars from calibre.ebooks.pdf.reflow import PDFDocument pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True) with lopen('index.xml', 'rb') as f: xml = clean_ascii_chars(f.read()) PDFDocument(xml, self.opts, self.log) return os.path.join(getcwd(), 'metadata.opf')
def compile_pyj(data, filename='<stdin>', beautify=True, private_scope=True, libdir=None, omit_baselib=False): if isinstance(data, bytes): data = data.decode('utf-8') c = compiler() c.g.current_options = { 'beautify':beautify, 'private_scope':private_scope, 'omit_baselib': omit_baselib, 'libdir': libdir or default_lib_dir(), 'basedir': getcwd() if not filename or filename == '<stdin>' else os.path.dirname(filename), 'filename': filename, } c.g.rs_source_code = data ok, result = c.eval( ''' ans = [null, null]; try { ans = [true, exports["compile"](rs_source_code, %s, current_options)]; } catch(e) { ans = [false, e] } ans; ''' % json.dumps(filename)) if ok: return result presult = to_python(result) if 'message' in result: msg = presult['message'] if 'filename' in presult and 'line' in presult: msg = '%s:%s:%s' % (presult['filename'], presult['line'], msg) raise CompileFailure(msg) if result.stack: # Javascript error object instead of ParseError raise CompileFailure(result.stack) raise CompileFailure(repr(presult))
def get_metadata(stream): from calibre.ebooks.lit.reader import LitContainer from calibre.utils.logging import Log litfile = LitContainer(stream, Log()) src = litfile.get_metadata().encode('utf-8') litfile = litfile._litfile opf = OPF(io.BytesIO(src), getcwd()) mi = opf.to_book_metadata() covers = [] for item in opf.iterguide(): if 'cover' not in item.get('type', '').lower(): continue ctype = item.get('type') href = item.get('href', '') candidates = [href, href.replace('&', '%26')] for item in litfile.manifest.values(): if item.path in candidates: try: covers.append((litfile.get_file('/data/'+item.internal), ctype)) except: pass break covers.sort(key=lambda x: len(x[0]), reverse=True) idx = 0 if len(covers) > 1: if covers[1][1] == covers[0][1]+'-standard': idx = 1 mi.cover_data = ('jpg', covers[idx][0]) return mi
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) if len(args) < 2: parser.print_help() prints(_('No file specified'), file=sys.stderr) return 1 path = args[1] stream_type = os.path.splitext(path)[1].replace('.', '').lower() trying_to_set = False for pref in config().option_set.preferences: if pref.name in ('to_opf', 'get_cover'): continue if getattr(opts, pref.name) is not None: trying_to_set = True break with open(path, 'rb') as stream: mi = get_metadata(stream, stream_type, force_read_metadata=True) if trying_to_set: prints(_('Original metadata')+'::') metadata = unicode_type(mi) if trying_to_set: metadata = '\t'+'\n\t'.join(metadata.split('\n')) prints(metadata, safe_encode=True) if trying_to_set: with open(path, 'r+b') as stream: do_set_metadata(opts, mi, stream, stream_type) stream.seek(0) stream.flush() lrf = None if stream_type == 'lrf': if opts.lrf_bookid is not None: lrf = LRFMetaFile(stream) lrf.book_id = opts.lrf_bookid mi = get_metadata(stream, stream_type, force_read_metadata=True) prints('\n' + _('Changed metadata') + '::') metadata = unicode_type(mi) metadata = '\t'+'\n\t'.join(metadata.split('\n')) prints(metadata, safe_encode=True) if lrf is not None: prints('\tBookID:', lrf.book_id) if opts.to_opf is not None: from calibre.ebooks.metadata.opf2 import OPFCreator opf = OPFCreator(getcwd(), mi) with open(opts.to_opf, 'wb') as f: opf.render(f) prints(_('OPF created in'), opts.to_opf) if opts.get_cover is not None: if mi.cover_data and mi.cover_data[1]: with open(opts.get_cover, 'wb') as f: f.write(mi.cover_data[1]) prints(_('Cover saved to'), f.name) else: prints(_('No cover found'), file=sys.stderr) return 0
def convert(self, stream, opts, file_ext, log, accelerators): self._is_case_sensitive = None basedir = getcwd() self.opts = opts fname = None if hasattr(stream, 'name'): basedir = os.path.dirname(stream.name) fname = os.path.basename(stream.name) if file_ext != 'opf': if opts.dont_package: raise ValueError('The --dont-package option is not supported for an HTML input file') from calibre.ebooks.metadata.html import get_metadata mi = get_metadata(stream) if fname: from calibre.ebooks.metadata.meta import metadata_from_filename fmi = metadata_from_filename(fname) fmi.smart_update(mi) mi = fmi oeb = self.create_oebbook(stream.name, basedir, opts, log, mi) return oeb from calibre.ebooks.conversion.plumber import create_oebbook return create_oebbook(log, stream.name, opts, encoding=opts.input_encoding)
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.rb.reader import Reader reader = Reader(stream, log, options.input_encoding) opf = reader.extract_content(getcwd()) return opf
def serve(resources={}, port=8000, host='0.0.0.0'): Handler.special_resources = resources Handler.compiler = compile_coffeescript httpd = Server((host, port), Handler) print('serving %s at %s:%d with PID=%d'%(getcwd(), host, port, os.getpid())) try: httpd.serve_forever() except KeyboardInterrupt: raise SystemExit(0)
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.azw4.reader import Reader header = PdbHeaderReader(stream) reader = Reader(header, stream, log, options) opf = reader.extract_content(getcwd()) return opf
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.pdf.pdftohtml import pdftohtml log.debug('Converting file to html...') # The main html file will be named index.html self.opts, self.log = options, log if options.new_pdf_engine: return self.convert_new(stream, accelerators) pdftohtml(getcwd(), stream.name, options.no_images) from calibre.ebooks.metadata.meta import get_metadata log.debug('Retrieving document metadata...') mi = get_metadata(stream, 'pdf') opf = OPFCreator(getcwd(), mi) manifest = [('index.html', None)] images = os.listdir(getcwd()) images.remove('index.html') for i in images: manifest.append((i, None)) log.debug('Generating manifest...') opf.create_manifest(manifest) opf.create_spine(['index.html']) log.debug('Rendering manifest...') with lopen('metadata.opf', 'wb') as opffile: opf.render(opffile) if os.path.exists('toc.ncx'): ncxid = opf.manifest.id_for_path('toc.ncx') if ncxid: with lopen('metadata.opf', 'r+b') as f: raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid)) f.seek(0) f.write(raw) return os.path.join(getcwd(), 'metadata.opf')
def main(opts, args, dbctx): if len(args) < 1: raise SystemExit(_('You must specify an id')) book_id = int(args[0]) mi = dbctx.run('show_metadata', book_id) if mi is None: raise SystemExit('Id #%d is not present in database.' % id) if opts.as_opf: mi = OPFCreator(getcwd(), mi) mi.render(sys.stdout) else: prints(unicode_type(mi)) return 0
def add_simple_plugin(path_to_plugin): import tempfile, zipfile, shutil tdir = tempfile.mkdtemp() open(os.path.join(tdir, 'custom_plugin.py'), 'wb').write(open(path_to_plugin, 'rb').read()) odir = getcwd() os.chdir(tdir) zf = zipfile.ZipFile('plugin.zip', 'w') zf.write('custom_plugin.py') zf.close() from calibre.customize.ui import main main(['calibre-customize', '-a', 'plugin.zip']) os.chdir(odir) shutil.rmtree(tdir)
def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0, base_path=getcwd(), type='unknown', author=None, description=None, toc_thumbnail=None): self.href = href self.fragment = fragment if not self.fragment: self.fragment = None self.text = text self.parent = parent self.base_path = base_path self.play_order = play_order self.type = type self.author = author self.description = description self.toc_thumbnail = toc_thumbnail
def extractall(path_or_stream, path=None): f = path_or_stream close_at_end = False if not hasattr(f, 'read'): f = open(f, 'rb') close_at_end = True if path is None: path = getcwd() pos = f.tell() try: _extractall(f, path) finally: f.seek(pos) if close_at_end: f.close()
def __init__(self, stream, mode='r', root=None): if isinstance(stream, (LocalZipFile, ZipFile)): self.archive = stream else: try: self.archive = ZipFile(stream, mode=mode) except BadZipfile: raise EPubException("not a ZIP .epub OCF container") self.root = root if self.root is None: name = getattr(stream, 'name', False) if name: self.root = os.path.abspath(os.path.dirname(name)) else: self.root = getcwd() super(OCFZipReader, self).__init__()
def set_metadata(stream, mi, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True): stream.seek(0) reader = get_zip_reader(stream, root=getcwd()) new_cdata = None try: new_cdata = mi.cover_data[1] if not new_cdata: raise Exception('no cover') except Exception: try: with lopen(mi.cover, 'rb') as f: new_cdata = f.read() except Exception: pass opfbytes, ver, raster_cover = set_metadata_opf( reader.read_bytes(reader.opf_path), mi, cover_prefix=posixpath.dirname(reader.opf_path), cover_data=new_cdata, apply_null=apply_null, update_timestamp=update_timestamp, force_identifiers=force_identifiers, add_missing_cover=add_missing_cover) cpath = None replacements = {} if new_cdata and raster_cover: try: cpath = posixpath.join(posixpath.dirname(reader.opf_path), raster_cover) cover_replacable = not reader.encryption_meta.is_encrypted(cpath) and \ os.path.splitext(cpath)[1].lower() in ('.png', '.jpg', '.jpeg') if cover_replacable: replacements[cpath] = serialize_cover_data(new_cdata, cpath) except Exception: import traceback traceback.print_exc() if isinstance(reader.archive, LocalZipFile): reader.archive.safe_replace(reader.container[OPF.MIMETYPE], opfbytes, extra_replacements=replacements, add_missing=True) else: safe_replace(stream, reader.container[OPF.MIMETYPE], opfbytes, extra_replacements=replacements, add_missing=True) try: if cpath is not None: replacements[cpath].close() os.remove(replacements[cpath].name) except: pass
def build_toc(index_entries): ans = TOC(base_path=getcwd()) levels = {x['hlvl'] for x in index_entries} num_map = {-1: ans} level_map = {l:[x for x in index_entries if x['hlvl'] == l] for l in levels} for lvl in sorted(levels): for item in level_map[lvl]: parent = num_map[item['parent']] child = parent.add_item(item['href'], item['idtag'], replace_entities(item['text'], encoding=None)) num_map[item['num']] = child # Set play orders in depth first order for i, item in enumerate(ans.flat()): item.play_order = i return ans
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader header = PdbHeaderReader(stream) Reader = get_reader(header.ident) if Reader is None: raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' % (header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown')))) log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) reader = Reader(header, stream, log, options) opf = reader.extract_content(getcwd()) return opf
def _parse_toc(self, ul, basedir=getcwd()): toc = TOC(play_order=self._playorder, base_path=basedir, text='') self._playorder += 1 for li in ul('li', recursive=False): href = li.object('param', {'name': 'Local'})[0]['value'] if href.count('#'): href, frag = href.split('#') else: frag = None name = self._deentity(li.object('param', {'name': 'Name'})[0]['value']) # print "========>", name toc.add_item(href, frag, name, play_order=self._playorder) self._playorder += 1 if li.ul: child = self._parse_toc(li.ul) child.parent = toc toc.append(child) # print toc return toc
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False): self.docx = DOCX(path_or_stream, log=log) self.namespace = self.docx.namespace self.ms_pat = re.compile(r'\s{2,}') self.ws_pat = re.compile(r'[\n\r\t]') self.log = self.docx.log self.detect_cover = detect_cover self.notes_text = notes_text or _('Notes') self.notes_nopb = notes_nopb self.nosupsub = nosupsub self.dest_dir = dest_dir or getcwd() self.mi = self.docx.metadata self.body = BODY() self.theme = Theme(self.namespace) self.settings = Settings(self.namespace) self.tables = Tables(self.namespace) self.fields = Fields(self.namespace) self.styles = Styles(self.namespace, self.tables) self.images = Images(self.namespace, self.log) self.object_map = OrderedDict() self.html = HTML( HEAD( META(charset='utf-8'), TITLE(self.mi.title or _('Unknown')), LINK(rel='stylesheet', type='text/css', href='docx.css'), ), self.body ) self.html.text='\n\t' self.html[0].text='\n\t\t' self.html[0].tail='\n' for child in self.html[0]: child.tail = '\n\t\t' self.html[0][-1].tail = '\n\t' self.html[1].text = self.html[1].tail = '\n' lang = html_lang(self.mi.language) if lang: self.html.set('lang', lang) self.doc_lang = lang else: self.doc_lang = None
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.txt.processor import convert_basic stdout = BytesIO() from calibre.ebooks.djvu.djvu import DJVUFile x = DJVUFile(stream) x.get_text(stdout) html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace( b'\037', b'\n\n')) # Run the HTMLized text through the html processing plugin. from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(options, opt.option.name, opt.recommended_value) options.input_encoding = 'utf-8' base = getcwd() fname = os.path.join(base, 'index.html') c = 0 while os.path.exists(fname): c += 1 fname = os.path.join(base, 'index%d.html'%c) htmlfile = open(fname, 'wb') with htmlfile: htmlfile.write(html.encode('utf-8')) odi = options.debug_pipeline options.debug_pipeline = None # Generate oeb from html conversion. with open(htmlfile.name, 'rb') as f: oeb = html_input.convert(f, options, 'html', log, {}) options.debug_pipeline = odi os.remove(htmlfile.name) # Set metadata from file. from calibre.customize.ui import get_file_type_metadata from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata mi = get_file_type_metadata(stream, file_ext) meta_info_to_oeb_metadata(mi, oeb.metadata, log) return oeb
def link_to_local_path(self, link_, base=None): from calibre.ebooks.html.input import Link if not isinstance(link_, unicode_type): try: link_ = link_.decode('utf-8', 'error') except: self.log.warn('Failed to decode link %r. Ignoring'%link_) return None, None try: l = Link(link_, base if base else getcwd()) except: self.log.exception('Failed to process link: %r'%link_) return None, None if l.path is None: # Not a local resource return None, None link = l.path.replace('/', os.sep).strip() frag = l.fragment if not link: return None, None return link, frag
def opf_metadata(opfpath): if hasattr(opfpath, 'read'): f = opfpath opfpath = getattr(f, 'name', getcwd()) else: f = open(opfpath, 'rb') try: opf = OPF(f, os.path.dirname(opfpath)) if opf.application_id is not None: mi = opf.to_book_metadata() if hasattr(opf, 'cover') and opf.cover: cpath = os.path.join(os.path.dirname(opfpath), opf.cover) if os.access(cpath, os.R_OK): fmt = cpath.rpartition('.')[-1] data = open(cpath, 'rb').read() mi.cover_data = (fmt, data) return mi except: import traceback traceback.print_exc() pass
def find_opf(self): from lxml import etree def attr(n, attr): for k, v in n.attrib.items(): if k.endswith(attr): return v try: with lopen('META-INF/container.xml', 'rb') as f: root = etree.fromstring(f.read()) for r in root.xpath('//*[local-name()="rootfile"]'): if attr(r, 'media-type') != "application/oebps-package+xml": continue path = attr(r, 'full-path') if not path: continue path = os.path.join(getcwd(), *path.split('/')) if os.path.exists(path): return path except Exception: import traceback traceback.print_exc()
def zip_opf_metadata(opfpath, zf): from calibre.ebooks.metadata.opf2 import OPF if hasattr(opfpath, 'read'): f = opfpath opfpath = getattr(f, 'name', getcwd()) else: f = open(opfpath, 'rb') opf = OPF(f, os.path.dirname(opfpath)) mi = opf.to_book_metadata() # This is broken, in that it only works for # when both the OPF file and the cover file are in the root of the # zip file and the cover is an actual raster image, but I don't care # enough to make it more robust if getattr(mi, 'cover', None): covername = os.path.basename(mi.cover) mi.cover = None names = zf.namelist() if covername in names: fmt = covername.rpartition('.')[-1] data = zf.read(covername) mi.cover_data = (fmt, data) return mi
def run(self): try: if DEBUG_DIALOG: self.results = self.sample_results() else: res = fork_job( 'calibre.ebooks.metadata.sources.worker', 'single_identify', (self.title, self.authors, self.identifiers), no_output=True, abort=self.abort) self.results, covers, caches, log_dump = res['result'] self.results = [OPF(BytesIO(r), basedir=getcwd(), populate_spine=False).to_book_metadata() for r in self.results] for r, cov in zip(self.results, covers): r.has_cached_cover_url = cov self.caches.update(caches) self.log.load(log_dump) for i, result in enumerate(self.results): result.gui_rank = i except WorkerError as e: self.error = force_unicode(e.orig_tb) except: import traceback self.error = force_unicode(traceback.format_exc())
def convert(self, stream, opts, file_ext, log, accelerators): from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC self.opts, self.log = opts, log if file_ext == 'cbc': comics_ = self.get_comics_from_collection(stream) else: comics_ = [['Comic', os.path.abspath(stream.name)]] stream.close() comics = [] for i, x in enumerate(comics_): title, fname = x cdir = u'comic_%d' % (i + 1) if len(comics_) > 1 else u'.' cdir = os.path.abspath(cdir) if not os.path.exists(cdir): os.makedirs(cdir) pages = self.get_pages(fname, cdir) if not pages: continue wrappers = self.create_wrappers(pages) comics.append((title, pages, wrappers)) if not comics: raise ValueError('No comic pages found in %s' % stream.name) mi = MetaInformation( os.path.basename(stream.name).rpartition('.')[0], [_('Unknown')]) opf = OPFCreator(getcwd(), mi) entries = [] def href(x): if len(comics) == 1: return os.path.basename(x) return '/'.join(x.split(os.sep)[-2:]) for comic in comics: pages, wrappers = comic[1:] entries += [(w, None) for w in map(href, wrappers)] + \ [(x, None) for x in map(href, pages)] opf.create_manifest(entries) spine = [] for comic in comics: spine.extend(map(href, comic[2])) self._images = [] for comic in comics: self._images.extend(comic[1]) opf.create_spine(spine) toc = TOC() if len(comics) == 1: wrappers = comics[0][2] for i, x in enumerate(wrappers): toc.add_item(href(x), None, _('Page') + ' %d' % (i + 1), play_order=i) else: po = 0 for comic in comics: po += 1 wrappers = comic[2] stoc = toc.add_item(href(wrappers[0]), None, comic[0], play_order=po) if not opts.dont_add_comic_pages_to_toc: for i, x in enumerate(wrappers): stoc.add_item(href(x), None, _('Page') + ' %d' % (i + 1), play_order=po) po += 1 opf.set_toc(toc) m, n = open(u'metadata.opf', 'wb'), open('toc.ncx', 'wb') opf.render(m, n, u'toc.ncx') return os.path.abspath(u'metadata.opf')
def convert_epub3_nav(self, nav_path, opf, log, opts): from lxml import etree from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize from calibre.ebooks.oeb.polish.toc import first_child from calibre.utils.xml_parse import safe_xml_fromstring from tempfile import NamedTemporaryFile with lopen(nav_path, 'rb') as f: raw = f.read() raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0] root = parse(raw, log=log) ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>') navmap = ncx[0] et = '{%s}type' % EPUB_NS bn = os.path.basename(nav_path) def add_from_li(li, parent): href = text = None for x in li.iterchildren(XHTML('a'), XHTML('span')): text = etree.tostring( x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join( x.xpath('descendant-or-self::*/@title')).strip() href = x.get('href') if href: if href.startswith('#'): href = bn + href break np = parent.makeelement(NCX('navPoint')) parent.append(np) np.append(np.makeelement(NCX('navLabel'))) np[0].append(np.makeelement(NCX('text'))) np[0][0].text = text if href: np.append(np.makeelement(NCX('content'), attrib={'src':href})) return np def process_nav_node(node, toc_parent): for li in node.iterchildren(XHTML('li')): child = add_from_li(li, toc_parent) ol = first_child(li, XHTML('ol')) if child is not None and ol is not None: process_nav_node(ol, child) for nav in root.iterdescendants(XHTML('nav')): if nav.get(et) == 'toc': ol = first_child(nav, XHTML('ol')) if ol is not None: process_nav_node(ol, navmap) break else: return with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f: f.write(etree.tostring(ncx, encoding='utf-8')) ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/') ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id') for spine in opf.root.xpath('//*[local-name()="spine"]'): spine.set('toc', ncx_id) opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/')) opts.epub3_nav_parsed = root if getattr(self, 'removed_cover', None): changed = False base_path = os.path.dirname(nav_path) for elem in root.xpath('//*[@href]'): href, frag = elem.get('href').partition('#')[::2] link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path) abs_href = urlnormalize(link_path) if abs_href == self.removed_cover: changed = True elem.set('data-calibre-removed-titlepage', '1') if changed: with lopen(nav_path, 'wb') as f: f.write(serialize(root, 'application/xhtml+xml'))
def convert(self, stream, options, file_ext, log, accelerators): from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError from calibre.ebooks.metadata.opf2 import OPF try: zf = ZipFile(stream) zf.extractall(getcwd()) except: log.exception('EPUB appears to be invalid ZIP file, trying a' ' more forgiving ZIP parser') from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) opf = self.find_opf() if opf is None: for f in walk('.'): if f.lower().endswith('.opf') and '__MACOSX' not in f and \ not os.path.basename(f).startswith('.'): opf = os.path.abspath(f) break path = getattr(stream, 'name', 'stream') if opf is None: raise ValueError('%s is not a valid EPUB file (could not find opf)'%path) opf = os.path.relpath(opf, getcwd()) parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) self._encrypted_font_uris = [] if os.path.exists(encfile): if not self.process_encryption(encfile, opf, log): raise DRMError(os.path.basename(path)) self.encrypted_fonts = self._encrypted_font_uris if len(parts) > 1 and parts[0]: delta = '/'.join(parts[:-1])+'/' def normpath(x): return posixpath.normpath(delta + elem.get('href')) for elem in opf.itermanifest(): elem.set('href', normpath(elem.get('href'))) for elem in opf.iterguide(): elem.set('href', normpath(elem.get('href'))) f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2 self.removed_cover = f(opf, log) if self.removed_cover: self.removed_items_to_ignore = (self.removed_cover,) epub3_nav = opf.epub3_nav if epub3_nav is not None: self.convert_epub3_nav(epub3_nav, opf, log, options) for x in opf.itermanifest(): if x.get('media-type', '') == 'application/x-dtbook+xml': raise ValueError( 'EPUB files with DTBook markup are not supported') not_for_spine = set() for y in opf.itermanifest(): id_ = y.get('id', None) if id_: mt = y.get('media-type', None) if mt in { 'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml', 'application/adobe-page-template+xml', 'application/adobe.page-template+xml', 'application/text' }: not_for_spine.add(id_) ext = y.get('href', '').rpartition('.')[-1].lower() if mt == 'text/plain' and ext in {'otf', 'ttf'}: # some epub authoring software sets font mime types to # text/plain not_for_spine.add(id_) y.set('media-type', 'application/font') seen = set() for x in list(opf.iterspine()): ref = x.get('idref', None) if not ref or ref in not_for_spine or ref in seen: x.getparent().remove(x) continue seen.add(ref) if len(list(opf.iterspine())) == 0: raise ValueError('No valid entries in the spine of this EPUB') with lopen('content.opf', 'wb') as nopf: nopf.write(opf.render()) return os.path.abspath('content.opf')
def other5(): cache_dir.ans = getcwd() if not os.path.isdir(tdir_in_cache('t')): raise SystemExit(1)
def __init__(self, base_dirs=(), builtin_modules=None): self._ctx = Context_() self.g = self._ctx.g self.g.Duktape.load_file = partial(load_file, base_dirs or (getcwd(),), builtin_modules or {}) self.g.Duktape.pyreadfile = readfile self.g.Duktape.pywritefile = writefile self.g.Duktape.create_context = partial(create_context, base_dirs) self.g.Duktape.run_in_context = run_in_context self.g.Duktape.cwd = getcwd self.g.Duktape.sha1sum = sha1sum self.g.Duktape.dirname = os.path.dirname self.g.Duktape.errprint = lambda *args: print(*args, file=sys.stderr) self.eval(''' console = { log: function() { print(Array.prototype.join.call(arguments, ' ')); }, error: function() { Duktape.errprint(Array.prototype.join.call(arguments, ' ')); }, debug: function() { print(Array.prototype.join.call(arguments, ' ')); } }; Duktape.modSearch = function (id, require, exports, module) { var ans = Duktape.load_file(id); if (ans[0]) return ans[1]; throw ans[1]; } if (!String.prototype.trim) { (function() { // Make sure we trim BOM and NBSP var rtrim = /^[\\s\uFEFF\xA0]+|[\\s\uFEFF\xA0]+$/g; String.prototype.trim = function() { return this.replace(rtrim, ''); }; })(); }; if (!String.prototype.trimLeft) { (function() { // Make sure we trim BOM and NBSP var rtrim = /^[\\s\uFEFF\xA0]+/g; String.prototype.trimLeft = function() { return this.replace(rtrim, ''); }; })(); }; if (!String.prototype.trimRight) { (function() { // Make sure we trim BOM and NBSP var rtrim = /[\\s\uFEFF\xA0]+$/g; String.prototype.trimRight = function() { return this.replace(rtrim, ''); }; })(); }; if (!String.prototype.startsWith) { String.prototype.startsWith = function(searchString, position) { position = position || 0; return this.indexOf(searchString, position) === position; }; } if (!String.prototype.endsWith) { String.prototype.endsWith = function(searchString, position) { var subjectString = this.toString(); if (position === undefined || position > subjectString.length) { position = subjectString.length; } position -= searchString.length; var lastIndex = subjectString.indexOf(searchString, position); return lastIndex !== -1 && lastIndex === position; }; } Duktape.readfile = function(path, encoding) { var x = Duktape.pyreadfile(path, encoding); var data = x[0]; var errcode = x[1]; var errmsg = x[2]; if (errmsg !== null) throw {code:errcode, message:errmsg}; return data; } Duktape.writefile = function(path, data, encoding) { var x = Duktape.pywritefile(path, data, encoding); var errcode = x[0]; var errmsg = x[1]; if (errmsg !== null) throw {code:errcode, message:errmsg}; } process = { 'platform': 'duktape', 'env': {'HOME': _HOME_, 'TERM':_TERM_}, 'exit': function() {}, 'cwd':Duktape.cwd } '''.replace( '_HOME_', json.dumps(os.path.expanduser('~'))).replace('_TERM_', json.dumps(os.environ.get('TERM', ''))), '<init>')
def __call__(self, redirect_output=True, cwd=None, priority=None, pass_fds=()): ''' If redirect_output is True, output from the child is redirected to a file on disk and this method returns the path to that file. ''' exe = self.gui_executable if self.gui else self.executable env = self.env try: origwd = cwd or os.path.abspath(getcwd()) except EnvironmentError: # cwd no longer exists origwd = cwd or os.path.expanduser('~') env[native_string_type('ORIGWD')] = environ_item(as_hex_unicode(msgpack_dumps(origwd))) _cwd = cwd if priority is None: priority = prefs['worker_process_priority'] cmd = [exe] if isinstance(exe, string_or_bytes) else exe args = { 'env' : env, 'cwd' : _cwd, } if iswindows: priority = { 'high' : subprocess.HIGH_PRIORITY_CLASS, 'normal' : subprocess.NORMAL_PRIORITY_CLASS, 'low' : subprocess.IDLE_PRIORITY_CLASS}[priority] args['creationflags'] = subprocess.CREATE_NO_WINDOW|priority else: niceness = { 'normal' : 0, 'low' : 10, 'high' : 20, }[priority] args['env']['CALIBRE_WORKER_NICENESS'] = str(niceness) ret = None if redirect_output: self._file = PersistentTemporaryFile('_worker_redirect.log') args['stdout'] = self._file._fd args['stderr'] = subprocess.STDOUT if iswindows: args['stdin'] = subprocess.PIPE ret = self._file.name if iswindows and 'stdin' not in args: # On windows when using the pythonw interpreter, # stdout, stderr and stdin may not be valid args['stdin'] = subprocess.PIPE args['stdout'] = windows_null_file args['stderr'] = subprocess.STDOUT args['close_fds'] = True try: if pass_fds: if iswindows: for fd in pass_fds: os.set_handle_inheritable(fd, True) args['startupinfo'] = subprocess.STARTUPINFO(lpAttributeList={'handle_list':pass_fds}) else: args['pass_fds'] = pass_fds self.child = subprocess.Popen(cmd, **args) finally: if iswindows and pass_fds: for fd in pass_fds: os.set_handle_inheritable(fd, False) if 'stdin' in args: self.child.stdin.close() self.log_path = ret return ret
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect from calibre.utils.zipfile import ZipFile from calibre.ebooks.txt.processor import (convert_basic, convert_markdown_with_metadata, separate_paragraphs_single_line, separate_paragraphs_print_formatted, preserve_spaces, detect_paragraph_type, detect_formatting_type, normalize_line_endings, convert_textile, remove_indents, block_to_single_line, separate_hard_scene_breaks) self.log = log txt = b'' log.debug('Reading text from file...') length = 0 base_dir = getcwd() # Extract content from zip archive. if file_ext == 'txtz': zf = ZipFile(stream) zf.extractall('.') for x in walk('.'): if os.path.splitext(x)[1].lower() in ('.txt', '.text'): with open(x, 'rb') as tf: txt += tf.read() + b'\n\n' else: if getattr(stream, 'name', None): base_dir = os.path.dirname(stream.name) txt = stream.read() if file_ext in {'md', 'textile', 'markdown'}: options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext) log.info('File extension indicates particular formatting. ' 'Forcing formatting type to: %s'%options.formatting_type) options.paragraph_type = 'off' # Get the encoding of the document. if options.input_encoding: ienc = options.input_encoding log.debug('Using user specified input encoding of %s' % ienc) else: det_encoding = detect(txt[:4096]) det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence'] if det_encoding and det_encoding.lower().replace('_', '-').strip() in ( 'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn', 'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'): # Microsoft Word exports to HTML with encoding incorrectly set to # gb2312 instead of gbk. gbk is a superset of gb2312, anyway. det_encoding = 'gbk' ienc = det_encoding log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100)) if not ienc: ienc = 'utf-8' log.debug('No input encoding specified and could not auto detect using %s' % ienc) # Remove BOM from start of txt as its presence can confuse markdown import codecs for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): if txt.startswith(bom): txt = txt[len(bom):] break txt = txt.decode(ienc, 'replace') # Replace entities txt = _ent_pat.sub(xml_entity_to_unicode, txt) # Normalize line endings txt = normalize_line_endings(txt) # Determine the paragraph type of the document. if options.paragraph_type == 'auto': options.paragraph_type = detect_paragraph_type(txt) if options.paragraph_type == 'unknown': log.debug('Could not reliably determine paragraph type using block') options.paragraph_type = 'block' else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) # Detect formatting if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) log.debug('Auto detected formatting as %s' % options.formatting_type) if options.formatting_type == 'heuristic': setattr(options, 'enable_heuristics', True) setattr(options, 'unwrap_lines', False) setattr(options, 'smarten_punctuation', True) # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. if options.paragraph_type == 'single': txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_hard_scene_breaks(txt) txt = separate_paragraphs_print_formatted(txt) txt = block_to_single_line(txt) elif options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'block': txt = separate_hard_scene_breaks(txt) txt = block_to_single_line(txt) if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): docanalysis = DocAnalysis('txt', txt) if not length: length = docanalysis.line_length(.5) dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) # User requested transformation on the text. if options.txt_in_remove_indents: txt = remove_indents(txt) # Preserve spaces will replace multiple spaces to a space # followed by the entity. if options.preserve_spaces: txt = preserve_spaces(txt) # Process the text using the appropriate text processor. self.shifted_files = [] try: html = '' input_mi = None if options.formatting_type == 'markdown': log.debug('Running text through markdown conversion...') try: input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()]) except RuntimeError: raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax') html = self.fix_resources(html, base_dir) elif options.formatting_type == 'textile': log.debug('Running text through textile conversion...') html = convert_textile(txt) html = self.fix_resources(html, base_dir) else: log.debug('Running text through basic conversion...') flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) # Run the HTMLized text through the html processing plugin. from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(options, opt.option.name, opt.recommended_value) options.input_encoding = 'utf-8' htmlfile = self.shift_file(base_dir, 'index.html', html.encode('utf-8')) odi = options.debug_pipeline options.debug_pipeline = None # Generate oeb from html conversion. oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {}) options.debug_pipeline = odi finally: for x in self.shifted_files: os.remove(x) # Set metadata from file. if input_mi is None: from calibre.customize.ui import get_file_type_metadata input_mi = get_file_type_metadata(stream, file_ext) from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata meta_info_to_oeb_metadata(input_mi, oeb.metadata, log) self.html_postprocess_title = input_mi.title return oeb
def test(cls): from calibre.utils.matcher import get_items_from_dir items = get_items_from_dir(getcwd(), lambda x: not x.endswith('.pyc')) d = cls(items) d.exec_() print(d.selected_result)
def __call__(self, redirect_output=True, cwd=None, priority=None): ''' If redirect_output is True, output from the child is redirected to a file on disk and this method returns the path to that file. ''' exe = self.gui_executable if self.gui else self.executable env = self.env try: origwd = cwd or os.path.abspath(getcwd()) except EnvironmentError: # cwd no longer exists origwd = cwd or os.path.expanduser(u'~') env[native_string_type('ORIGWD')] = environ_item( as_hex_unicode(msgpack_dumps(origwd))) _cwd = cwd if priority is None: priority = prefs['worker_process_priority'] cmd = [exe] if isinstance(exe, string_or_bytes) else exe args = { 'env': env, 'cwd': _cwd, } if iswindows: priority = { 'high': win32process.HIGH_PRIORITY_CLASS, 'normal': win32process.NORMAL_PRIORITY_CLASS, 'low': win32process.IDLE_PRIORITY_CLASS }[priority] args['creationflags'] = win32process.CREATE_NO_WINDOW | priority else: niceness = { 'normal': 0, 'low': 10, 'high': 20, }[priority] args['preexec_fn'] = partial(renice, niceness) ret = None if redirect_output: self._file = PersistentTemporaryFile('_worker_redirect.log') args['stdout'] = self._file._fd args['stderr'] = subprocess.STDOUT if iswindows: args['stdin'] = subprocess.PIPE ret = self._file.name if iswindows and 'stdin' not in args: # On windows when using the pythonw interpreter, # stdout, stderr and stdin may not be valid args['stdin'] = subprocess.PIPE args['stdout'] = windows_null_file args['stderr'] = subprocess.STDOUT if not iswindows: # Close inherited file descriptors in worker # On windows, this is done in the worker process # itself args['close_fds'] = True self.child = subprocess.Popen(cmd, **args) if 'stdin' in args: self.child.stdin.close() self.log_path = ret return ret
def convert(self, stream, options, file_ext, log, accelerators): from lxml import etree from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException from calibre.ebooks.rtf.input import InlineClass from calibre.utils.xml_parse import safe_xml_fromstring self.opts = options self.log = log self.log('Converting RTF to XML...') try: xml = self.generate_xml(stream.name) except RtfInvalidCodeException as e: self.log.exception('Unable to parse RTF') raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: imap = {} try: imap = self.extract_images(d[0]) except: self.log.exception('Failed to extract images...') self.log('Parsing XML...') doc = safe_xml_fromstring(xml) border_styles = self.convert_borders(doc) for pict in doc.xpath('//rtf:pict[@num]', namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): num = int(pict.get('num')) name = imap.get(num, None) if name is not None: pict.set('num', name) self.log('Converting XML to HTML...') inline_class = InlineClass(self.log) styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False) extensions = {('calibre', 'inline-class') : inline_class} transform = etree.XSLT(styledoc, extensions=extensions) result = transform(doc) html = u'index.xhtml' with open(html, 'wb') as f: res = as_bytes(transform.tostring(result)) # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] # clean multiple \n res = re.sub(b'\n+', b'\n', res) # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # res = re.sub('\s*<body>', '<body>', res) # res = re.sub('(?<=\n)\n{2}', # u'<p>\u00a0</p>\n'.encode('utf-8'), res) f.write(res) self.write_inline_css(inline_class, border_styles) stream.seek(0) mi = get_metadata(stream, 'rtf') if not mi.title: mi.title = _('Unknown') if not mi.authors: mi.authors = [_('Unknown')] opf = OPFCreator(getcwd(), mi) opf.create_manifest([(u'index.xhtml', None)]) opf.create_spine([u'index.xhtml']) opf.render(open(u'metadata.opf', 'wb')) return os.path.abspath(u'metadata.opf')
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath, urlquote) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import \ meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist from calibre.ebooks.metadata import string_to_authors from calibre.utils.localization import canonicalize_lang import css_parser, logging css_parser.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: l = canonicalize_lang(getattr(opts, 'language', None)) if not l: oeb.logger.warn('Language not specified') l = get_lang().replace('_', '-') metadata.add('language', l) if not metadata.creator: a = getattr(opts, 'authors', None) if a: a = string_to_authors(a) if not a: oeb.logger.warn('Creator not specified') a = [self.oeb.translate(__('Unknown'))] for aut in a: metadata.add('creator', aut) if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate(__('Unknown'))) bookid = unicode_type(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') if path == htmlpath and '%' in path: bname = urlquote(bname) item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log('Normalizing filename cases') for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log('Rewriting HTML links') for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) href = htmlfile_map[path] try: item = oeb.manifest.hrefs[href] except KeyError: item = oeb.manifest.hrefs[urlnormalize(href)] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break css_parser.replaceUrls(item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = re.sub(r'\s+', ' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = re.sub(r'\s+', ' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in zip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True) return oeb
def main(opts, args, dbctx): if opts.list_fields: ans = get_fields(dbctx) prints('%-40s' % _('Title'), _('Field name'), '\n') for key, m in ans: prints('%-40s' % m['name'], key) return 0 def verify_int(x): try: int(x) return True except: return False if len(args) < 1 or not verify_int(args[0]): raise SystemExit( _('You must specify a record id as the ' 'first argument')) if len(args) < 2 and not opts.field: raise SystemExit(_('You must specify either a field or an OPF file')) book_id = int(args[0]) if len(args) > 1: opf = os.path.abspath(args[1]) if not os.path.exists(opf): raise SystemExit(_('The OPF file %s does not exist') % opf) with lopen(opf, 'rb') as stream: mi = get_metadata(stream)[0] if mi.cover: mi.cover = os.path.join(os.path.dirname(opf), os.path.relpath(mi.cover, getcwd())) final_mi = dbctx.run('set_metadata', 'opf', book_id, read_cover(mi)) if not final_mi: raise SystemExit( _('No book with id: %s in the database') % book_id) if opts.field: fields = {k: v for k, v in get_fields(dbctx)} fields['title_sort'] = fields['sort'] vals = {} for x in opts.field: field, val = x.partition(':')[::2] if field == 'sort': field = 'title_sort' if field not in fields: raise SystemExit(_('%s is not a known field' % field)) if field == 'cover': val = dbctx.path(os.path.abspath(os.path.expanduser(val))) else: val = field_from_string(field, val, fields[field]) vals[field] = val fvals = [] for field, val in sorted( # ensure series_index fields are set last iteritems(vals), key=lambda k: 1 if k[0].endswith('_index') else 0): if field.endswith('_index'): try: val = float(val) except Exception: raise SystemExit( 'The value %r is not a valid series index' % val) fvals.append((field, val)) final_mi = dbctx.run('set_metadata', 'fields', book_id, fvals) if not final_mi: raise SystemExit( _('No book with id: %s in the database') % book_id) prints(unicode_type(final_mi)) return 0
def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata.opf2 import OPF from calibre.utils.zipfile import ZipFile self.log = log html = u'' top_levels = [] # Extract content from zip archive. zf = ZipFile(stream) zf.extractall() # Find the HTML file in the archive. It needs to be # top level. index = u'' multiple_html = False # Get a list of all top level files in the archive. for x in os.listdir(u'.'): if os.path.isfile(x): top_levels.append(x) # Try to find an index. file. for x in top_levels: if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'): index = x break # Look for multiple HTML files in the archive. We look at the # top level files only as only they matter in HTMLZ. for x in top_levels: if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'): # Set index to the first HTML file found if it's not # called index. if not index: index = x else: multiple_html = True # Warn the user if there multiple HTML file in the archive. HTMLZ # supports a single HTML file. A conversion with a multiple HTML file # HTMLZ archive probably won't turn out as the user expects. With # Multiple HTML files ZIP input should be used in place of HTMLZ. if multiple_html: log.warn( _('Multiple HTML files found in the archive. Only %s will be used.' ) % index) if index: with open(index, 'rb') as tf: html = tf.read() else: raise Exception(_('No top level HTML file found.')) if not html: raise Exception(_('Top level HTML file %s is empty') % index) # Encoding if options.input_encoding: ienc = options.input_encoding else: ienc = xml_to_unicode(html[:4096])[-1] html = html.decode(ienc, 'replace') # Run the HTML through the html processing plugin. from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(options, opt.option.name, opt.recommended_value) options.input_encoding = 'utf-8' base = getcwd() fname = os.path.join(base, u'index.html') c = 0 while os.path.exists(fname): c += 1 fname = u'index%d.html' % c htmlfile = open(fname, 'wb') with htmlfile: htmlfile.write(html.encode('utf-8')) odi = options.debug_pipeline options.debug_pipeline = None # Generate oeb from html conversion. oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {}) options.debug_pipeline = odi os.remove(htmlfile.name) # Set metadata from file. from calibre.customize.ui import get_file_type_metadata from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata mi = get_file_type_metadata(stream, file_ext) meta_info_to_oeb_metadata(mi, oeb.metadata, log) # Get the cover path from the OPF. cover_path = None opf = None for x in top_levels: if os.path.splitext(x)[1].lower() == u'.opf': opf = x break if opf: opf = OPF(opf, basedir=getcwd()) cover_path = opf.raster_cover or opf.cover # Set the cover. if cover_path: cdata = None with open(os.path.join(getcwd(), cover_path), 'rb') as cf: cdata = cf.read() cover_name = os.path.basename(cover_path) id, href = oeb.manifest.generate('cover', cover_name) oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata) oeb.guide.add('cover', 'Cover', href) return oeb
def extractall(self, path=None): self.stream.seek(0) _extractall(self.stream, path=(path or getcwd()))
def convert(self, stream, opts, file_ext, log, accelerators): from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC self.opts, self.log = opts, log if file_ext == 'cbc': comics_ = self.get_comics_from_collection(stream) else: comics_ = [['Comic', os.path.abspath(stream.name)]] stream.close() comics = [] num_pages_per_comic = [] for i, x in enumerate(comics_): title, fname = x cdir = 'comic_%d' % (i + 1) if len(comics_) > 1 else '.' cdir = os.path.abspath(cdir) if not os.path.exists(cdir): os.makedirs(cdir) pages = self.get_pages(fname, cdir) if not pages: continue num_pages_per_comic.append(len(pages)) if self.for_viewer: comics.append( (title, pages, [self.create_viewer_wrapper(pages, cdir)])) else: wrappers = self.create_wrappers(pages) comics.append((title, pages, wrappers)) if not comics: raise ValueError('No comic pages found in %s' % stream.name) mi = MetaInformation( os.path.basename(stream.name).rpartition('.')[0], [_('Unknown')]) opf = OPFCreator(getcwd(), mi) entries = [] def href(x): if len(comics) == 1: return os.path.basename(x) return '/'.join(x.split(os.sep)[-2:]) cover_href = None for comic in comics: pages, wrappers = comic[1:] page_entries = [(x, None) for x in map(href, pages)] entries += [(w, None) for w in map(href, wrappers)] + page_entries if cover_href is None and page_entries: cover_href = page_entries[0][0] opf.create_manifest(entries) spine = [] for comic in comics: spine.extend(map(href, comic[2])) self._images = [] for comic in comics: self._images.extend(comic[1]) opf.create_spine(spine) if self.for_viewer and cover_href: if os.path.isabs(cover_href): cover_href = os.path.relpath(cover_href).replace(os.sep, '/') opf.guide.set_cover(cover_href) toc = TOC() if len(comics) == 1: wrappers = comics[0][2] if self.for_viewer: wrapper_page_href = href(wrappers[0]) for i in range(num_pages_per_comic[0]): toc.add_item('{}#page_{}'.format(wrapper_page_href, i + 1), None, _('Page') + ' %d' % (i + 1), play_order=i) else: for i, x in enumerate(wrappers): toc.add_item(href(x), None, _('Page') + ' %d' % (i + 1), play_order=i) else: po = 0 for num_pages, comic in zip(num_pages_per_comic, comics): po += 1 wrappers = comic[2] stoc = toc.add_item(href(wrappers[0]), None, comic[0], play_order=po) if not opts.dont_add_comic_pages_to_toc: if self.for_viewer: wrapper_page_href = href(wrappers[0]) for i in range(num_pages): stoc.add_item('{}#page_{}'.format( wrapper_page_href, i + 1), None, _('Page') + ' %d' % (i + 1), play_order=po) po += 1 else: for i, x in enumerate(wrappers): stoc.add_item(href(x), None, _('Page') + ' %d' % (i + 1), play_order=po) po += 1 opf.set_toc(toc) with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n: opf.render(m, n, 'toc.ncx') return os.path.abspath('metadata.opf')
def convert(self, stream, options, file_ext, log, accelerators): from lxml import etree from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER from calibre.ebooks.chardet import xml_to_unicode self.log = log log.debug('Parsing XML...') raw = get_fb2_data(stream)[0] raw = raw.replace(b'\0', b'') raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)[0] try: doc = etree.fromstring(raw) except etree.XMLSyntaxError: try: doc = etree.fromstring(raw, parser=RECOVER_PARSER) if doc is None: raise Exception('parse failed') except: doc = etree.fromstring(raw.replace('& ', '&'), parser=RECOVER_PARSER) if doc is None: raise ValueError('The FB2 file is not valid XML') doc = ensure_namespace(doc) try: fb_ns = doc.nsmap[doc.prefix] except Exception: fb_ns = FB2NS NAMESPACES = {'f': fb_ns, 'l': XLINK_NS} stylesheets = doc.xpath( '//*[local-name() = "stylesheet" and @type="text/css"]') css = '' for s in stylesheets: css += etree.tostring( s, encoding=unicode_type, method='text', with_tail=False) + '\n\n' if css: import css_parser, logging parser = css_parser.CSSParser(fetcher=None, log=logging.getLogger('calibre.css')) XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS text = XHTML_CSS_NAMESPACE + css log.debug('Parsing stylesheet...') stylesheet = parser.parseString(text) stylesheet.namespaces['h'] = XHTML_NS css = stylesheet.cssText if isinstance(css, bytes): css = css.decode('utf-8', 'replace') css = css.replace('h|style', 'h|span') css = re.sub(r'name\s*=\s*', 'class=', css) self.extract_embedded_content(doc) log.debug('Converting XML to HTML...') ss = open(P('templates/fb2.xsl'), 'rb').read() ss = ss.replace("__FB_NS__", fb_ns) if options.no_inline_fb2_toc: log('Disabling generation of inline FB2 TOC') ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->', re.DOTALL).sub('', ss) styledoc = etree.fromstring(ss) transform = etree.XSLT(styledoc) result = transform(doc) # Handle links of type note and cite notes = { a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#') } cites = { a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '') } all_ids = {x for x in result.xpath('//*/@id')} for cite, a in iteritems(cites): note = notes.get(cite, None) if note: c = 1 while 'cite%d' % c in all_ids: c += 1 if not note.get('id', None): note.set('id', 'cite%d' % c) all_ids.add(note.get('id')) a.set('href', '#%s' % note.get('id')) for x in result.xpath('//*[@link_note or @link_cite]'): x.attrib.pop('link_note', None) x.attrib.pop('link_cite', None) for img in result.xpath('//img[@src]'): src = img.get('src') img.set('src', self.binary_map.get(src, src)) index = transform.tostring(result) open(u'index.xhtml', 'wb').write(index) open(u'inline-styles.css', 'wb').write(css) stream.seek(0) mi = get_metadata(stream, 'fb2') if not mi.title: mi.title = _('Unknown') if not mi.authors: mi.authors = [_('Unknown')] cpath = None if mi.cover_data and mi.cover_data[1]: with open(u'fb2_cover_calibre_mi.jpg', 'wb') as f: f.write(mi.cover_data[1]) cpath = os.path.abspath(u'fb2_cover_calibre_mi.jpg') else: for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): href = img.get('{%s}href' % XLINK_NS, img.get('href', None)) if href is not None: if href.startswith('#'): href = href[1:] cpath = os.path.abspath(href) break opf = OPFCreator(getcwd(), mi) entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')] opf.create_manifest(entries) opf.create_spine([u'index.xhtml']) if cpath: opf.guide.set_cover(cpath) with open(u'metadata.opf', 'wb') as f: opf.render(f) return os.path.join(getcwd(), u'metadata.opf')
if p is not run[-1]: style.apply_between_border() if has_visible_border: border_style.margin_left, border_style.margin_right = max_left, max_right self.block_runs.append((border_style, run)) run = [] for p in paras: if run and self.frame_map.get(p) == self.frame_map.get(run[-1]): style = self.styles.resolve_paragraph(p) last_style = self.styles.resolve_paragraph(run[-1]) if style.has_identical_borders(last_style): run.append(p) continue if len(run) > 1: process_run(run) run = [p] if len(run) > 1: process_run(run) if __name__ == '__main__': import shutil from calibre.utils.logging import default_log default_log.filter_level = default_log.DEBUG dest_dir = os.path.join(getcwd(), 'docx_input') if os.path.exists(dest_dir): shutil.rmtree(dest_dir) os.mkdir(dest_dir) Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)()
def other4(): cache_dir.ans = getcwd() tdir_in_cache('t') time.sleep(30)
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False): html_files = set() for path in self.Contents(): fpath = path lpath = os.path.join(output_dir, fpath) self._ensure_dir(lpath) try: data = self.GetFile(path) except: self.log.exception('Failed to extract %s from CHM, ignoring' % path) continue if lpath.find(';') != -1: # fix file names with ";<junk>" at the end, see _reformat() lpath = lpath.split(';')[0] try: with open(lpath, 'wb') as f: f.write(data) try: if 'html' in guess_mimetype(path)[0]: html_files.add(lpath) except: pass except: if iswindows and len(lpath) > 250: self.log.warn('%r filename too long, skipping' % path) continue raise if debug_dump: import shutil shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump')) for lpath in html_files: with lopen(lpath, 'r+b') as f: data = f.read() data = self._reformat(data, lpath) if isinstance(data, unicode_type): data = data.encode('utf-8') f.seek(0) f.truncate() f.write(data) self._extracted = True files = [ y for y in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, y)) ] if self.hhc_path not in files: for f in files: if f.lower() == self.hhc_path.lower(): self.hhc_path = f break if self.hhc_path not in files and files: for f in files: if f.partition('.')[-1].lower() in { 'html', 'htm', 'xhtm', 'xhtml' }: self.hhc_path = f break if self.hhc_path == '.hhc' and self.hhc_path not in files: from calibre import walk for x in walk(output_dir): if os.path.basename(x).lower() in ('index.htm', 'index.html', 'contents.htm', 'contents.html'): self.hhc_path = os.path.relpath(x, output_dir) break if self.hhc_path not in files and files: self.hhc_path = files[0]
def extract_content(self, output_dir=getcwd(), debug_dump=False): self.ExtractFiles(output_dir=output_dir, debug_dump=debug_dump)
def __init__(self, base_dirs=(), builtin_modules=None): self._ctx = Context_() self.g = self._ctx.g self.g.Duktape.load_file = partial(load_file, base_dirs or (getcwd(), ), builtin_modules or {}) self.g.Duktape.pyreadfile = readfile self.g.Duktape.pywritefile = writefile self.g.Duktape.create_context = partial(create_context, base_dirs) self.g.Duktape.run_in_context = run_in_context self.g.Duktape.cwd = getcwd self.g.Duktape.sha1sum = sha1sum self.g.Duktape.dirname = os.path.dirname self.g.Duktape.errprint = lambda *args: print(*args, file=sys.stderr) self.eval( ''' console = { log: function() { print(Array.prototype.join.call(arguments, ' ')); }, error: function() { Duktape.errprint(Array.prototype.join.call(arguments, ' ')); }, debug: function() { print(Array.prototype.join.call(arguments, ' ')); } }; Duktape.modSearch = function (id, require, exports, module) { var ans = Duktape.load_file(id); if (ans[0]) return ans[1]; throw ans[1]; } if (!String.prototype.trim) { (function() { // Make sure we trim BOM and NBSP var rtrim = /^[\\s\uFEFF\xA0]+|[\\s\uFEFF\xA0]+$/g; String.prototype.trim = function() { return this.replace(rtrim, ''); }; })(); }; if (!String.prototype.trimLeft) { (function() { // Make sure we trim BOM and NBSP var rtrim = /^[\\s\uFEFF\xA0]+/g; String.prototype.trimLeft = function() { return this.replace(rtrim, ''); }; })(); }; if (!String.prototype.trimRight) { (function() { // Make sure we trim BOM and NBSP var rtrim = /[\\s\uFEFF\xA0]+$/g; String.prototype.trimRight = function() { return this.replace(rtrim, ''); }; })(); }; if (!String.prototype.startsWith) { String.prototype.startsWith = function(searchString, position) { position = position || 0; return this.indexOf(searchString, position) === position; }; } if (!String.prototype.endsWith) { String.prototype.endsWith = function(searchString, position) { var subjectString = this.toString(); if (position === undefined || position > subjectString.length) { position = subjectString.length; } position -= searchString.length; var lastIndex = subjectString.indexOf(searchString, position); return lastIndex !== -1 && lastIndex === position; }; } Duktape.readfile = function(path, encoding) { var x = Duktape.pyreadfile(path, encoding); var data = x[0]; var errcode = x[1]; var errmsg = x[2]; if (errmsg !== null) throw {code:errcode, message:errmsg}; return data; } Duktape.writefile = function(path, data, encoding) { var x = Duktape.pywritefile(path, data, encoding); var errcode = x[0]; var errmsg = x[1]; if (errmsg !== null) throw {code:errcode, message:errmsg}; } process = { 'platform': 'duktape', 'env': {'HOME': _HOME_, 'TERM':_TERM_}, 'exit': function() {}, 'cwd':Duktape.cwd } '''.replace('_HOME_', json.dumps(os.path.expanduser('~'))).replace( '_TERM_', json.dumps(os.environ.get('TERM', ''))), '<init>')
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath, urlquote) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import \ meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist from calibre.ebooks.metadata import string_to_authors from calibre.utils.localization import canonicalize_lang import css_parser, logging css_parser.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: l = canonicalize_lang(getattr(opts, 'language', None)) if not l: oeb.logger.warn(u'Language not specified') l = get_lang().replace('_', '-') metadata.add('language', l) if not metadata.creator: a = getattr(opts, 'authors', None) if a: a = string_to_authors(a) if not a: oeb.logger.warn('Creator not specified') a = [self.oeb.translate(__('Unknown'))] for aut in a: metadata.add('creator', aut) if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate(__('Unknown'))) bookid = str(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') if path == htmlpath and '%' in path: bname = urlquote(bname) item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log('Normalizing filename cases') for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log('Rewriting HTML links') for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) href = htmlfile_map[path] try: item = oeb.manifest.hrefs[href] except KeyError: item = oeb.manifest.hrefs[urlnormalize(href)] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break css_parser.replaceUrls(item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = re.sub(r'\s+', ' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = re.sub(r'\s+', ' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in zip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True) return oeb