def convert(self, stream, options, file_ext, log, accelerators):
    from ebook_converter.ebooks.metadata.toc import TOC
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator
    from ebook_converter.utils.zipfile import ZipFile

    self.options = options
    self.log = log
    pages, images = [], []
    toc = TOC()

    if file_ext == 'pmlz':
        log.debug('De-compressing content to temporary directory...')
        with TemporaryDirectory('_unpmlz') as tdir:
            zf = ZipFile(stream)
            zf.extractall(tdir)

            pmls = glob.glob(os.path.join(tdir, '*.pml'))
            for pml in pmls:
                html_name = os.path.splitext(
                    os.path.basename(pml))[0] + '.html'
                html_path = os.path.join(os.getcwd(), html_name)

                pages.append(html_name)
                log.debug('Processing PML item %s...', pml)
                ttoc = self.process_pml(pml, html_path)
                toc += ttoc
            images = self.get_images(stream, tdir, True)
    else:
        toc = self.process_pml(stream, 'index.html')
        pages.append('index.html')

        if hasattr(stream, 'name'):
            images = self.get_images(
                stream, os.path.abspath(os.path.dirname(stream.name)))

    # We want pages to be ordered alphabetically.
    pages.sort()

    manifest_items = []
    for item in pages + images:
        manifest_items.append((item, None))

    from ebook_converter.ebooks.metadata.meta import get_metadata
    log.debug('Reading metadata from input file...')
    mi = get_metadata(stream, 'pml')
    if 'images/cover.png' in images:
        mi.cover = 'images/cover.png'

    opf = OPFCreator(os.getcwd(), mi)
    log.debug('Generating manifest...')
    opf.create_manifest(manifest_items)
    opf.create_spine(pages)
    opf.set_toc(toc)
    with open('metadata.opf', 'wb') as opffile:
        with open('toc.ncx', 'wb') as tocfile:
            opf.render(opffile, tocfile, 'toc.ncx')

    return os.path.join(os.getcwd(), 'metadata.opf')
def run(self, archive):
    from ebook_converter.utils.zipfile import ZipFile
    is_rar = archive.lower().endswith('.rar')
    if is_rar:
        from ebook_converter.utils.unrar import extract_member, names
    else:
        zf = ZipFile(archive, 'r')

    if is_rar:
        fnames = list(names(archive))
    else:
        fnames = zf.namelist()

    def fname_ok(fname):
        bn = os.path.basename(fname).lower()
        if bn == 'thumbs.db':
            return False
        if '.' not in bn:
            return False
        if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
            return False
        if '__MACOSX' in fname.split('/'):
            return False
        return True

    fnames = list(filter(fname_ok, fnames))
    if is_comic(fnames):
        ext = '.cbr' if is_rar else '.cbz'
        of = self.temporary_file('_archive_extract' + ext)
        with open(archive, 'rb') as f:
            of.write(f.read())
        of.close()
        return of.name
    if len(fnames) > 1 or not fnames:
        return archive
    fname = fnames[0]
    ext = os.path.splitext(fname)[1][1:]
    if ext.lower() not in {
            'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
            'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'}:
        return archive

    of = self.temporary_file('_archive_extract.' + ext)
    with closing(of):
        if is_rar:
            data = extract_member(archive, match=None, name=fname)[1]
            of.write(data)
        else:
            of.write(zf.read(fname))
    return of.name
def get_fb2_data(stream):
    from ebook_converter.utils.zipfile import ZipFile, BadZipfile
    pos = stream.tell()
    try:
        zf = ZipFile(stream)
    except BadZipfile:
        stream.seek(pos)
        ans = stream.read()
        zip_file_name = None
    else:
        names = zf.namelist()
        names = [x for x in names if x.lower().endswith('.fb2')] or names
        zip_file_name = names[0]
        ans = zf.open(zip_file_name).read()
    return ans, zip_file_name
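# A minimal usage sketch for get_fb2_data(), assuming it is imported from its
# defining module (not shown in this excerpt); the file path is illustrative.
# It works for both bare .fb2 files and zipped .fb2.zip archives.
with open('book.fb2.zip', 'rb') as stream:  # hypothetical example path
    raw, zip_file_name = get_fb2_data(stream)

if zip_file_name is None:
    print('plain FB2, %d bytes' % len(raw))
else:
    print('FB2 read from zip member %s, %d bytes' % (zip_file_name, len(raw)))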
def extract(self, stream):
    self.tdir = PersistentTemporaryDirectory('docx_container')
    try:
        zf = ZipFile(stream)
        zf.extractall(self.tdir)
    except Exception:
        self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                           ' more forgiving ZIP parser')
        from ebook_converter.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream, self.tdir)
    self.names = {}
    for f in walk(self.tdir):
        name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
        self.names[name] = f
def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
    stream.seek(0)
    raw, zip_file_name = get_fb2_data(stream)
    root = _get_fbroot(raw)
    ctx = Context(root)
    desc = ctx.get_or_create(root, 'description')
    ti = ctx.get_or_create(desc, 'title-info')

    indent = ti.text

    _set_comments(ti, mi, ctx)
    _set_series(ti, mi, ctx)
    _set_tags(ti, mi, ctx)
    _set_authors(ti, mi, ctx)
    _set_title(ti, mi, ctx)
    _set_cover(ti, mi, ctx)

    for child in ti:
        child.tail = indent

    # Apparently there exists FB2 reading software that chokes on the use of
    # single quotes in xml declaration. Sigh. See
    # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
    raw = b'<?xml version="1.0" encoding="UTF-8"?>\n'
    raw += etree.tostring(root, method='xml', encoding='utf-8',
                          xml_declaration=False)

    stream.seek(0)
    stream.truncate()
    if zip_file_name:
        from ebook_converter.utils.zipfile import ZipFile
        with ZipFile(stream, 'w') as zf:
            zf.writestr(zip_file_name, raw)
    else:
        stream.write(raw)
def set_metadata(stream, mi):
    with ZipFile(stream) as zf:
        raw = _set_metadata(zf.open('meta.xml').read(), mi)
    # print(raw.decode('utf-8'))

    stream.seek(os.SEEK_SET)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))
def __enter__(self, *args):
    """
    Add this plugin to the python path so that its contents become
    directly importable. Useful when bundling large python libraries into
    the plugin. Use it like this::

        with plugin:
            import something
    """
    if self.plugin_path is not None:
        from ebook_converter.utils.zipfile import ZipFile
        zf = ZipFile(self.plugin_path)
        extensions = {x.rpartition('.')[-1].lower() for x in zf.namelist()}
        zip_safe = True
        for ext in ('pyd', 'so', 'dll', 'dylib'):
            if ext in extensions:
                zip_safe = False
                break
        if zip_safe:
            sys.path.insert(0, self.plugin_path)
            self.sys_insertion_path = self.plugin_path
        else:
            from ebook_converter.ptempfile import TemporaryDirectory
            self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
            self.sys_insertion_path = (self._sys_insertion_tdir.
                                       __enter__(*args))
            zf.extractall(self.sys_insertion_path)
            sys.path.insert(0, self.sys_insertion_path)
        zf.close()
def initialize_container(path_to_container, opf_name='metadata.opf',
                         extra_entries=[]):
    '''
    Create an empty EPUB document, with a default skeleton.
    '''
    rootfiles = ''
    for path, mimetype, _ in extra_entries:
        rootfiles += '<rootfile full-path="{0}" media-type="{1}"/>'.format(
            path, mimetype)
    CONTAINER = simple_container_xml(opf_name, rootfiles).encode('utf-8')
    zf = ZipFile(path_to_container, 'w')
    zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
    zf.writestr('META-INF/', b'', 0o755)
    zf.writestr('META-INF/container.xml', CONTAINER)
    for path, _, data in extra_entries:
        zf.writestr(path, data)
    return zf
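# A minimal usage sketch, assuming initialize_container() is imported from
# ebook_converter.ebooks.epub (as done elsewhere in this code base); the
# output path and OPF bytes are illustrative placeholders only.
from ebook_converter.ebooks.epub import initialize_container

opf_bytes = b'<?xml version="1.0"?><package/>'  # placeholder OPF content
with initialize_container('skeleton.epub', opf_name='content.opf') as epub:
    # The returned ZipFile already contains mimetype and META-INF entries;
    # callers add the OPF and the rest of the book content themselves.
    epub.writestr('content.opf', opf_bytes)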
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    from ebook_converter.ebooks.pml.pmlml import PMLMLizer
    from ebook_converter.utils.zipfile import ZipFile

    with TemporaryDirectory('_pmlz_output') as tdir:
        pmlmlizer = PMLMLizer(log)
        pml = str(pmlmlizer.extract_content(oeb_book, opts))
        with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
            out.write(pml.encode(opts.pml_output_encoding, 'replace'))

        img_path = os.path.join(tdir, 'index_img')
        if not os.path.exists(img_path):
            os.makedirs(img_path)
        self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs,
                          img_path, opts)

        log.debug('Compressing output...')
        pmlz = ZipFile(output_path, 'w')
        pmlz.add_dir(tdir)
def get_comic_metadata(stream, stream_type, series_index='volume'):
    comment = None
    if stream_type == 'cbz':
        from ebook_converter.utils.zipfile import ZipFile
        zf = ZipFile(stream)
        comment = zf.comment
    elif stream_type == 'cbr':
        from ebook_converter.utils.unrar import comment as get_comment
        comment = get_comment(stream)

    return parse_comic_comment(comment or b'{}', series_index=series_index)
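# A minimal usage sketch for get_comic_metadata(), assuming it is imported
# from its defining module and that parse_comic_comment() returns a
# calibre-style metadata object; the file path is illustrative only.
with open('issue-01.cbz', 'rb') as stream:  # hypothetical example path
    mi = get_comic_metadata(stream, 'cbz', series_index='volume')
print(mi.title, mi.series, mi.series_index)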
def run(self, archive):
    from ebook_converter.utils.zipfile import ZipFile
    with ZipFile(archive, 'r') as zf:
        fnames = zf.namelist()
        candidates = [x for x in fnames if x.lower().endswith('.docx')]
        if not candidates:
            return archive
        of = self.temporary_file('_kpf_extract.docx')
        with closing(of):
            of.write(zf.read(candidates[0]))
    return of.name
def safe_replace(self, name, datastream, extra_replacements={},
                 add_missing=False):
    from ebook_converter.utils.zipfile import ZipFile, ZipInfo
    replacements = {name: datastream}
    replacements.update(extra_replacements)
    names = frozenset(list(replacements.keys()))
    found = set()

    def rbytes(name):
        r = replacements[name]
        if not isinstance(r, bytes):
            r = r.read()
        return r

    with SpooledTemporaryFile(max_size=100 * 1024 * 1024) as temp:
        ztemp = ZipFile(temp, 'w')
        for offset, header in self.file_info.values():
            if header.filename in names:
                zi = ZipInfo(header.filename)
                zi.compress_type = header.compression_method
                ztemp.writestr(zi, rbytes(header.filename))
                found.add(header.filename)
            else:
                ztemp.writestr(header.filename,
                               self.read(header.filename, spool_size=0))
        if add_missing:
            for name in names - found:
                ztemp.writestr(name, rbytes(name))
        ztemp.close()
        zipstream = self.stream
        temp.seek(0)
        zipstream.seek(0)
        zipstream.truncate()
        shutil.copyfileobj(temp, zipstream)
        zipstream.flush()
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    from ebook_converter.ebooks.oeb.base import OEB_IMAGES
    from ebook_converter.utils.zipfile import ZipFile
    from lxml import etree

    with TemporaryDirectory('_txtz_output') as tdir:
        # TXT
        txt_name = 'index.txt'
        if opts.txt_output_formatting.lower() == 'textile':
            txt_name = 'index.text'
        with TemporaryFile(txt_name) as tf:
            TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
            shutil.copy(tf, os.path.join(tdir, txt_name))

        # Images
        for item in oeb_book.manifest:
            if item.media_type in OEB_IMAGES:
                if hasattr(self.writer, 'images'):
                    path = os.path.join(tdir, 'images')
                    if item.href in self.writer.images:
                        href = self.writer.images[item.href]
                    else:
                        continue
                else:
                    path = os.path.join(tdir, os.path.dirname(item.href))
                    href = os.path.basename(item.href)
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(os.path.join(path, href), 'wb') as imgf:
                    imgf.write(item.data)

        # Metadata
        with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
            mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

        txtz = ZipFile(output_path, 'w')
        txtz.add_dir(tdir)
def __call__(self, stream, odir, log):
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.metadata.odt import get_metadata
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator

    if not os.path.exists(odir):
        os.makedirs(odir)
    with directory.CurrentDir(odir):
        log.info('Extracting ODT file...')
        stream.seek(0)
        mi = get_metadata(stream, 'odt')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']

        self.filter_load(stream, mi, log)

        # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
        # which expects that all lines are strings.
        html = ''.join([str(l) for l in self.lines])

        # A blanket img specification like this causes problems
        # with EPUB output as the containing element often has
        # an absolute height and width set that is larger than
        # the available screen real estate
        html = html.replace('img { width: 100%; height: 100%; }', '')
        # odf2xhtml creates empty title tag
        html = html.replace('<title></title>',
                            '<title>%s</title>' % (mi.title, ))
        try:
            html = self.fix_markup(html, log)
        except:
            log.exception('Failed to filter CSS, conversion may be slow')
        with open('index.xhtml', 'wb') as f:
            f.write(polyglot.as_bytes(html))
        zf = ZipFile(stream, 'r')
        self.extract_pictures(zf)
        opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
        opf.create_manifest([(os.path.abspath(os.path.join(r, f2)), None)
                             for r, _, fnames in os.walk(os.getcwd())
                             for f2 in fnames])
        opf.create_spine([os.path.abspath('index.xhtml')])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')
def get_metadata(stream, extract_cover=True):
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(
                    tostring(ans[0], method='text', encoding='unicode',
                             with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        data = {}
        for tag in root.xpath(
                '//ns0:user-defined',
                namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                # Do not let an error reading the cover prevent reading
                # other data
                pass

    return mi
def do_dump(path, dest):
    if os.path.exists(dest):
        shutil.rmtree(dest)
    with ZipFile(path) as zf:
        zf.extractall(dest)
    pretty_all_xml_in_dir(dest)
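# A minimal usage sketch for do_dump(), assuming pretty_all_xml_in_dir() is
# available in the same module; both paths are illustrative placeholders.
# It unpacks a zip-based book (DOCX/EPUB-style) and pretty-prints its XML
# parts so the result is easier to read and diff.
do_dump('book.docx', 'book_dump')  # hypothetical example paths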
class DOCX(object):

    def __init__(self, path_or_stream, log=None, extract=True):
        self.docx_is_transitional = True
        stream = path_or_stream
        if not hasattr(path_or_stream, 'read'):
            stream = open(path_or_stream, 'rb')
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        if extract:
            self.extract(stream)
        else:
            self.init_zipfile(stream)
        self.read_content_types()
        self.read_package_relationships()
        self.namespace = DOCXNamespace(self.docx_is_transitional)

    def init_zipfile(self, stream):
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                               ' more forgiving ZIP parser')
            from ebook_converter.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)
        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        return name in self.names

    def read(self, name):
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no '
                              '[Content_Types].xml' % self.name)
        root = etree.fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()='
                               '"Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = \
                item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()='
                               '"Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return mimetypes.guess_type(name)[0]

    def read_package_relationships(self):
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels'
                              % self.name)
        root = etree.fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/'
                               '*[local-name()="Relationship" and @Type '
                               'and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            if target == 'word/document.xml':
                self.docx_is_transitional = (typ != 'http://purl.oclc.org/'
                                                    'ooxml/officeDocument/'
                                                    'relationships/'
                                                    'officeDocument')
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
        if name is None:
            names = tuple(
                n for n in self.names
                if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main '
                                  'document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        return etree.fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = etree.fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*'
                                   '[local-name()="Relationship" and @Type '
                                   'and @Target]'):
                target = item.get('Target')
                if (item.get('TargetMode', None) != 'External'
                        and not target.startswith('#')):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target

        return by_id, by_type

    def get_document_properties_names(self):
        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names
                          if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        yield name
        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names
                          if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        yield name

    @property
    def metadata(self):
        mi = Metadata('Unknown')
        dp_name, ap_name = self.get_document_properties_names()
        if dp_name:
            try:
                raw = self.read(dp_name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi, self.namespace.XPath)
        if mi.is_null('language'):
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi, self.namespace.XPath)

        ap_name = self.relationships.get(self.namespace.names['APPPROPS'],
                                         None)
        if ap_name:
            try:
                raw = self.read(ap_name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
def convert(self, stream, options, file_ext, log, accelerators):
    from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from ebook_converter.ebooks.chardet import detect
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line,
        separate_paragraphs_print_formatted, preserve_spaces,
        detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = b''
    log.debug('Reading text from file...')
    length = 0
    base_dir = self.output_dir = os.getcwd()

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')
        for root, _, fnames in os.walk('.'):
            for x in fnames:
                x = os.path.join(root, x)
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
    else:
        if getattr(stream, 'name', None):
            base_dir = os.path.dirname(stream.name)
        txt = stream.read()

    if file_ext in {'md', 'textile', 'markdown'}:
        options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
        log.info('File extension indicates particular formatting. '
                 'Forcing formatting type to: %s', options.formatting_type)
        options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s', ienc)
    else:
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = (det_encoding['encoding'],
                                    det_encoding['confidence'])
        if det_encoding and det_encoding.lower().replace(
                '_', '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                      'euc-cn', 'euccn', 'eucgb2312-cn',
                                      'gb2312-1980', 'gb2312-80',
                                      'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of '
                  '%s%%', ienc, confidence * 100)
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect '
                  'using %s', ienc)
    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using '
                      'block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s',
                      options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s', options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options,
                                          log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(
            options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    self.shifted_files = []
    try:
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[
                        x.strip()
                        for x in options.markdown_extensions.split(',')
                        if x.strip()])
            except RuntimeError:
                raise ValueError(
                    'This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See '
                    'https://daringfireball.net/projects/markdown/syntax')
            html = self.fix_resources(html, base_dir)
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
            html = self.fix_resources(html, base_dir)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        htmlfile = self.shift_file('index.html', html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html',
                                 log, {})
        options.debug_pipeline = odi
    finally:
        for x in self.shifted_files:
            os.remove(x)

    # Set metadata from file.
    if input_mi is None:
        from ebook_converter.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    from ebook_converter.ebooks.chardet import xml_to_unicode
    from ebook_converter.ebooks.metadata.opf2 import OPF
    from ebook_converter.utils.zipfile import ZipFile

    self.log = log
    html = u''
    top_levels = []

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    for x in os.listdir(u'.'):
        if os.path.isfile(x):
            top_levels.append(x)
    # Try to find an index file.
    for x in top_levels:
        if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            else:
                multiple_html = True
    # Warn the user if there are multiple HTML files in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn('Multiple HTML files found in the archive. Only %s will '
                 'be used.' % index)

    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception('No top level HTML file found.')
    if not html:
        raise Exception('Top level HTML file %s is empty' % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwd()
    htmlfile = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(htmlfile):
        c += 1
        htmlfile = u'index%d.html' % c
    with open(htmlfile, 'wb') as f:
        f.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile)

    # Set metadata from file.
    from ebook_converter.customize.ui import get_file_type_metadata
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=os.getcwd())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover.
    if cover_path:
        cdata = None
        with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(id, href, mimetypes.guess_type(cover_name)[0],
                         data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def init_zipfile(self, stream):
    self.zipf = ZipFile(stream)
    self.names = frozenset(self.zipf.namelist())
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    from lxml import etree
    from ebook_converter.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from ebook_converter.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == 'inline':
        from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory(u'_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = u'index'
        if opts.htmlz_title_filename:
            from ebook_converter.utils.filenames import shorten_components_to
            fname = shorten_components_to(
                100, (ascii_filename(str(oeb_book.metadata.title[0])),))[0]
        with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
            if isinstance(html, str):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if (opts.htmlz_css_type == 'class'
                and opts.htmlz_class_style == 'external'):
            with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                tf.write(htmlizer.get_css(oeb_book))

        # Images
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, u'images')):
                os.makedirs(os.path.join(tdir, u'images'))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = etree.tostring(item.data, encoding='unicode')
                    else:
                        data = item.data
                    fname = os.path.join(tdir, u'images', images[item.href])
                    with open(fname, 'wb') as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from ebook_converter.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, u'cover.jpg')
                with open(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except:
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
            opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(),
                                                encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = u'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        htmlz = ZipFile(output_path, 'w')
        htmlz.add_dir(tdir)
def run(self, path_to_output, opts, db, notification=DummyReporter()):
    from ebook_converter.library.catalogs.epub_mobi_builder import CatalogBuilder
    from ebook_converter.utils.logging import default_log as log
    from ebook_converter.utils.config import JSONConfig

    # If preset specified from the cli, insert stored options from JSON file
    if hasattr(opts, 'preset') and opts.preset:
        available_presets = JSONConfig("catalog_presets")
        if opts.preset not in available_presets:
            if available_presets:
                print('Error: Preset "%s" not found.' % opts.preset)
                print('Stored presets: %s' % ', '.join(
                    [p for p in sorted(available_presets.keys())]))
            else:
                print('Error: No stored presets.')
            return 1

        # Copy the relevant preset values to the opts object
        for item in available_presets[opts.preset]:
            if item not in ['exclusion_rules_tw', 'format',
                            'prefix_rules_tw']:
                setattr(opts, item, available_presets[opts.preset][item])

        # Provide an unconnected device
        opts.connected_device = {
            'is_device_connected': False,
            'kind': None,
            'name': None,
            'save_template': None,
            'serial': None,
            'storage': None,
        }

        # Convert prefix_rules and exclusion_rules from JSON lists to tuples
        prs = []
        for rule in opts.prefix_rules:
            prs.append(tuple(rule))
        opts.prefix_rules = tuple(prs)

        ers = []
        for rule in opts.exclusion_rules:
            ers.append(tuple(rule))
        opts.exclusion_rules = tuple(ers)

    opts.log = log
    opts.fmt = self.fmt = path_to_output.rpartition('.')[2]

    # Add local options
    opts.creator = '%s, %s %s, %s' % (date.strftime('%A'),
                                      date.strftime('%B'),
                                      date.strftime('%d').lstrip('0'),
                                      date.strftime('%Y'))
    opts.creator_sort_as = '%s %s' % ('calibre', date.strftime('%Y-%m-%d'))
    opts.connected_kindle = False

    # Finalize output_profile
    op = opts.output_profile
    if op is None:
        op = 'default'

    if opts.connected_device['name'] and \
            'kindle' in opts.connected_device['name'].lower():
        opts.connected_kindle = True
        if opts.connected_device['serial'] and \
                opts.connected_device['serial'][:4] in ['B004', 'B005']:
            op = "kindle_dx"
        else:
            op = "kindle"

    opts.description_clip = 380 if op.endswith('dx') or \
        'kindle' not in op else 100
    opts.author_clip = 100 if op.endswith('dx') or 'kindle' not in op else 60
    opts.output_profile = op

    opts.basename = "Catalog"
    opts.cli_environment = not hasattr(opts, 'sync')

    # Hard-wired to always sort descriptions by author, with series after
    # non-series
    opts.sort_descriptions_by_author = True

    build_log = []

    build_log.append(
        "%s('%s'): Generating %s %sin %s environment, locale: '%s'" %
        (self.name, current_library_name(), self.fmt,
         'for %s ' % opts.output_profile if opts.output_profile else '',
         'CLI' if opts.cli_environment else 'GUI',
         langcode_to_name(canonicalize_lang(get_lang()), localize=False)))

    # If exclude_genre is blank, assume user wants all tags as genres
    if opts.exclude_genre.strip() == '':
        # opts.exclude_genre = '\[^.\]'
        # build_log.append(" converting empty exclude_genre to '\[^.\]'")
        opts.exclude_genre = 'a^'
        build_log.append(" converting empty exclude_genre to 'a^'")
    if opts.connected_device['is_device_connected'] and \
            opts.connected_device['kind'] == 'device':
        if opts.connected_device['serial']:
            build_log.append(" connected_device: '%s' #%s%s " % (
                opts.connected_device['name'],
                opts.connected_device['serial'][0:4],
                'x' * (len(opts.connected_device['serial']) - 4)))
            for storage in opts.connected_device['storage']:
                if storage:
                    build_log.append(" mount point: %s" % storage)
        else:
            build_log.append(" connected_device: '%s'" %
                             opts.connected_device['name'])
            try:
                for storage in opts.connected_device['storage']:
                    if storage:
                        build_log.append(" mount point: %s" % storage)
            except:
                build_log.append(" (no mount points)")
    else:
        build_log.append(" connected_device: '%s'" %
                         opts.connected_device['name'])

    opts_dict = vars(opts)
    if opts_dict['ids']:
        build_log.append(" book count: %d" % len(opts_dict['ids']))

    sections_list = []
    if opts.generate_authors:
        sections_list.append('Authors')
    if opts.generate_titles:
        sections_list.append('Titles')
    if opts.generate_series:
        sections_list.append('Series')
    if opts.generate_genres:
        sections_list.append('Genres')
    if opts.generate_recently_added:
        sections_list.append('Recently Added')
    if opts.generate_descriptions:
        sections_list.append('Descriptions')

    if not sections_list:
        if opts.cli_environment:
            opts.log.warn('*** No Section switches specified, enabling all '
                          'Sections ***')
            opts.generate_authors = True
            opts.generate_titles = True
            opts.generate_series = True
            opts.generate_genres = True
            opts.generate_recently_added = True
            opts.generate_descriptions = True
            sections_list = ['Authors', 'Titles', 'Series', 'Genres',
                             'Recently Added', 'Descriptions']
        else:
            opts.log.warn('\n*** No enabled Sections, terminating catalog '
                          'generation ***')
            return ["No Included Sections",
                    "No enabled Sections.\nCheck E-book options tab\n"
                    "'Included sections'\n"]
    if opts.fmt == 'mobi' and sections_list == ['Descriptions']:
        warning = ("\n*** Adding 'By authors' section required for MOBI "
                   "output ***")
        opts.log.warn(warning)
        sections_list.insert(0, 'Authors')
        opts.generate_authors = True

    opts.log(" Sections: %s" % ', '.join(sections_list))
    opts.section_list = sections_list

    # Limit thumb_width to 1.0" - 2.0"
    try:
        if float(opts.thumb_width) < float(self.THUMB_SMALLEST):
            log.warning("coercing thumb_width from '%s' to '%s'" %
                        (opts.thumb_width, self.THUMB_SMALLEST))
            opts.thumb_width = self.THUMB_SMALLEST
        if float(opts.thumb_width) > float(self.THUMB_LARGEST):
            log.warning("coercing thumb_width from '%s' to '%s'" %
                        (opts.thumb_width, self.THUMB_LARGEST))
            opts.thumb_width = self.THUMB_LARGEST
        opts.thumb_width = "%.2f" % float(opts.thumb_width)
    except:
        log.error("coercing thumb_width from '%s' to '%s'" %
                  (opts.thumb_width, self.THUMB_SMALLEST))
        opts.thumb_width = "1.0"

    # eval prefix_rules if passed from command line
    if type(opts.prefix_rules) is not tuple:
        try:
            opts.prefix_rules = eval(opts.prefix_rules)
        except:
            log.error("malformed --prefix-rules: %s" % opts.prefix_rules)
            raise
        for rule in opts.prefix_rules:
            if len(rule) != 4:
                log.error("incorrect number of args for --prefix-rules: %s" %
                          repr(rule))

    # eval exclusion_rules if passed from command line
    if type(opts.exclusion_rules) is not tuple:
        try:
            opts.exclusion_rules = eval(opts.exclusion_rules)
        except:
            log.error("malformed --exclusion-rules: %s" %
                      opts.exclusion_rules)
            raise
        for rule in opts.exclusion_rules:
            if len(rule) != 3:
                log.error("incorrect number of args for --exclusion-rules: "
                          "%s" % repr(rule))

    # Display opts
    keys = sorted(opts_dict.keys())
    build_log.append(" opts:")
    for key in keys:
        if key in ['catalog_title', 'author_clip', 'connected_kindle',
                   'creator', 'cross_reference_authors', 'description_clip',
                   'exclude_book_marker', 'exclude_genre', 'exclude_tags',
                   'exclusion_rules', 'fmt', 'genre_source_field',
                   'header_note_source_field', 'merge_comments_rule',
                   'output_profile', 'prefix_rules', 'preset',
                   'read_book_marker', 'search_text', 'sort_by',
                   'sort_descriptions_by_author', 'sync', 'thumb_width',
                   'use_existing_cover', 'wishlist_tag']:
            build_log.append(" %s: %s" % (key, repr(opts_dict[key])))

    if opts.verbose:
        log('\n'.join(line for line in build_log))

    # Capture start_time
    opts.start_time = time.time()

    self.opts = opts

    if opts.verbose:
        log.info(" Begin catalog source generation (%s)" %
                 str(datetime.timedelta(
                     seconds=int(time.time() - opts.start_time))))

    # Launch the Catalog builder
    catalog = CatalogBuilder(db, opts, self, report_progress=notification)

    try:
        catalog.build_sources()
        if opts.verbose:
            log.info(" Completed catalog source generation (%s)\n" %
                     str(datetime.timedelta(
                         seconds=int(time.time() - opts.start_time))))
    except (AuthorSortMismatchException, EmptyCatalogException) as e:
        log.error(" *** Terminated catalog generation: %s ***" % e)
    except:
        log.error(" unhandled exception in catalog generator")
        raise
    else:
        recommendations = []
        recommendations.append(('remove_fake_margins', False,
                                OptionRecommendation.HIGH))
        recommendations.append(('comments', '', OptionRecommendation.HIGH))

        """
        >>> Use to debug generated catalog code before pipeline conversion <<<
        """
        GENERATE_DEBUG_EPUB = False
        if GENERATE_DEBUG_EPUB:
            catalog_debug_path = os.path.join(os.path.expanduser('~'),
                                              'Desktop', 'Catalog debug')
            setattr(opts, 'debug_pipeline',
                    os.path.expanduser(catalog_debug_path))

        dp = getattr(opts, 'debug_pipeline', None)
        if dp is not None:
            recommendations.append(('debug_pipeline', dp,
                                    OptionRecommendation.HIGH))

        if opts.output_profile and opts.output_profile.startswith("kindle"):
            recommendations.append(('output_profile', opts.output_profile,
                                    OptionRecommendation.HIGH))
            recommendations.append(('book_producer', opts.output_profile,
                                    OptionRecommendation.HIGH))
            if opts.fmt == 'mobi':
                recommendations.append(('no_inline_toc', True,
                                        OptionRecommendation.HIGH))

        recommendations.append(('verbose', 2, OptionRecommendation.HIGH))

        # Use existing cover or generate new cover
        cpath = None
        existing_cover = False
        try:
            search_text = 'title:"%s" author:%s' % (
                opts.catalog_title.replace('"', '\\"'), 'calibre')
            matches = db.search(search_text, return_matches=True,
                                sort_results=False)
            if matches:
                cpath = db.cover(matches[0], index_is_id=True, as_path=True)
                if cpath and os.path.exists(cpath):
                    existing_cover = True
        except:
            pass

        if self.opts.use_existing_cover and not existing_cover:
            log.warning("no existing catalog cover found")

        if self.opts.use_existing_cover and existing_cover:
            recommendations.append(('cover', cpath,
                                    OptionRecommendation.HIGH))
            log.info("using existing catalog cover")
        else:
            # TODO(gryf): feature: generating cover with pillow.
            pass
            # from ebook_converter.ebooks.covers import calibre_cover2
            # log.info("replacing catalog cover")
            # new_cover_path = PersistentTemporaryFile(suffix='.jpg')
            # # new_cover = calibre_cover2(opts.catalog_title, 'calibre')
            # new_cover_path.write('')
            # new_cover_path.close()
            # recommendations.append(('cover', new_cover_path.name, OptionRecommendation.HIGH))

        # Run ebook-convert
        from ebook_converter.ebooks.conversion.plumber import Plumber
        plumber = Plumber(os.path.join(catalog.catalog_path,
                                       opts.basename + '.opf'),
                          path_to_output, log,
                          report_progress=notification,
                          abort_after_input_dump=False)
        plumber.merge_ui_recommendations(recommendations)
        plumber.run()

        try:
            os.remove(cpath)
        except:
            pass

        if GENERATE_DEBUG_EPUB:
            from ebook_converter.ebooks.epub import initialize_container
            from ebook_converter.ebooks.tweak import zip_rebuilder
            from ebook_converter.utils.zipfile import ZipFile
            input_path = os.path.join(catalog_debug_path, 'input')
            epub_shell = os.path.join(catalog_debug_path, 'epub_shell.zip')
            initialize_container(epub_shell, opf_name='content.opf')
            with ZipFile(epub_shell, 'r') as zf:
                zf.extractall(path=input_path)
            os.remove(epub_shell)
            zip_rebuilder(input_path, os.path.join(catalog_debug_path,
                                                   'input.epub'))

        if opts.verbose:
            log.info(" Catalog creation complete (%s)\n" %
                     str(datetime.timedelta(
                         seconds=int(time.time() - opts.start_time))))

    # returns to gui2.actions.catalog:catalog_generated()
    return catalog.error
def convert(self, oeb, output_path, input_plugin, opts, log):
    self.log, self.opts, self.oeb = log, opts, oeb

    if self.opts.epub_inline_toc:
        from ebook_converter.ebooks.mobi.writer8.toc import TOCAdder
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True,
                 ignore_existing_toc=True)

    if self.opts.epub_flatten:
        from ebook_converter.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from ebook_converter.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()

    from ebook_converter.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    from ebook_converter.ebooks.oeb.transforms.split import Split
    split = Split(not self.opts.dont_split_on_page_breaks,
                  max_flow_size=self.opts.flow_size * 1024)
    split(self.oeb, self.opts)

    from ebook_converter.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
        no_default_cover=self.opts.no_default_epub_cover,
        no_svg_cover=self.opts.no_svg_cover,
        preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    if self.oeb.toc.count() == 0:
        self.log.warn('This EPUB file has no Table of Contents. '
                      'Creating a default TOC')
        first = next(iter(self.oeb.spine))
        self.oeb.toc.add('Start', first.href)

    identifiers = oeb.metadata['identifier']
    _uuid = None
    for x in identifiers:
        if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid'
                or str(x).startswith('urn:uuid:')):
            _uuid = str(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if _uuid is None:
        self.log.warn('No UUID identifier found')
        _uuid = str(uuid.uuid4())
        oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)

    if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if str(x) == _uuid:
                x.content = 'urn:uuid:' + _uuid

    with TemporaryDirectory('_epub_output') as tdir:
        from ebook_converter.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                from ebook_converter.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [('atom.xml', 'application/atom+xml',
                                  atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                           if x.endswith('.ncx')][0])
        if self.opts.epub_version == '3':
            self.upgrade_to_epub3(tdir, opf)
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)

        from ebook_converter.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                                  extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml',
                              as_bytes(encryption))
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                              metadata_xml.encode('utf-8'))
        if opts.extract_to is not None:
            from ebook_converter.utils.zipfile import ZipFile
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)
def convert(self, recipe_or_file, opts, file_ext, log, accelerators):
    from ebook_converter.web.feeds.recipes import compile_recipe
    opts.output_profile.flow_size = 0
    if file_ext == 'downloaded_recipe':
        from ebook_converter.utils.zipfile import ZipFile
        zf = ZipFile(recipe_or_file, 'r')
        zf.extractall()
        zf.close()
        with open('download.recipe', 'rb') as f:
            self.recipe_source = f.read()
        recipe = compile_recipe(self.recipe_source)
        recipe.needs_subscription = False
        self.recipe_object = recipe(opts, log, self.report_progress)
    else:
        if os.environ.get('CALIBRE_RECIPE_URN'):
            from ebook_converter.web.feeds.recipes.collection import (
                get_custom_recipe, get_builtin_recipe_by_id)
            urn = os.environ['CALIBRE_RECIPE_URN']
            log('Downloading recipe urn: ' + urn)
            rtype, recipe_id = urn.partition(':')[::2]
            if not recipe_id:
                raise ValueError('Invalid recipe urn: ' + urn)
            if rtype == 'custom':
                self.recipe_source = get_custom_recipe(recipe_id)
            else:
                self.recipe_source = get_builtin_recipe_by_id(
                    urn, log=log, download_recipe=True)
            if not self.recipe_source:
                raise ValueError('Could not find recipe with urn: ' + urn)
            if not isinstance(self.recipe_source, bytes):
                self.recipe_source = self.recipe_source.encode('utf-8')
            recipe = compile_recipe(self.recipe_source)
        elif os.access(recipe_or_file, os.R_OK):
            with open(recipe_or_file, 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            log('Using custom recipe')
        else:
            from ebook_converter.web.feeds.recipes.collection import (
                get_builtin_recipe_by_title, get_builtin_recipe_titles)
            title = getattr(opts, 'original_recipe_input_arg',
                            recipe_or_file)
            title = os.path.basename(title).rpartition('.')[0]
            titles = frozenset(get_builtin_recipe_titles())
            if title not in titles:
                title = getattr(opts, 'original_recipe_input_arg',
                                recipe_or_file)
                title = title.rpartition('.')[0]

            raw = get_builtin_recipe_by_title(
                title, log=log,
                download_recipe=not opts.dont_download_recipe)
            builtin = False
            try:
                recipe = compile_recipe(raw)
                self.recipe_source = raw
                if recipe.requires_version > numeric_version:
                    log.warn('Downloaded recipe needs calibre version at '
                             'least: %s' %
                             ('.'.join(recipe.requires_version)))
                    builtin = True
            except:
                log.exception('Failed to compile downloaded recipe. Falling '
                              'back to builtin one')
                builtin = True
            if builtin:
                log('Using bundled builtin recipe')
                raw = get_builtin_recipe_by_title(title, log=log,
                                                  download_recipe=False)
                if raw is None:
                    raise ValueError('Failed to find builtin recipe: ' +
                                     title)
                recipe = compile_recipe(raw)
                self.recipe_source = raw
            else:
                log('Using downloaded builtin recipe')

        if recipe is None:
            raise ValueError('%r is not a valid recipe file or builtin '
                             'recipe' % recipe_or_file)

        disabled = getattr(recipe, 'recipe_disabled', None)
        if disabled is not None:
            raise RecipeDisabled(disabled)
        ro = recipe(opts, log, self.report_progress)
        ro.download()
        self.recipe_object = ro

    for key, val in self.recipe_object.conversion_options.items():
        setattr(opts, key, val)

    for f in os.listdir('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)

    for f in walk('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)
def convert(self, stream, options, file_ext, log, accelerators):
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks import DRMError

    _path_or_stream = getattr(stream, 'name', 'stream')
    try:
        zf = ZipFile(stream)
        zf.extractall(os.getcwd())
    except Exception:
        log.exception('EPUB appears to be invalid ZIP file, trying a '
                      'more forgiving ZIP parser')
        from ebook_converter.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        for root, _, fnames in os.walk('.'):
            for f in fnames:
                f = os.path.join(root, f)
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
    if opf is None:
        raise ValueError('%s is not a valid EPUB file (could not find '
                         'opf)' % _path_or_stream)

    opf = os.path.relpath(opf, os.getcwd())
    parts = os.path.split(opf)
    opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(_path_or_stream))
    self.encrypted_fonts = self._encrypted_font_uris

    # NOTE(gryf): check if opf is nested on the directory(ies), if so,
    # update the links for guide and manifest.
    if len(parts) > 1 and parts[0]:
        path = os.path.join(parts[0])

        for elem in opf.itermanifest():
            elem.set('href', os.path.join(path, elem.get('href')))
        for elem in opf.iterguide():
            elem.set('href', os.path.join(path, elem.get('href')))

    if opf.package_version >= 3.0:
        f = self.rationalize_cover3
    else:
        f = self.rationalize_cover2
    self.removed_cover = f(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover, )
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'}:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError('No valid entries in the spine of this EPUB')

    with open('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath('content.opf')