def explode(path, dest, question=lambda x:True):
    '''
    Check that the file at ``path`` is a tweakable MOBI file and, if so, run
    the ``do_explode`` job from ``calibre.ebooks.mobi.tweak`` on it.

    :param path: Path to the file to inspect; it is opened in binary mode.
    :param dest: Passed through unchanged to the ``do_explode`` job.
    :param question: Callable that receives a confirmation message and
        returns a truthy value to proceed. It is only called for files whose
        header reports a ``kf8_type`` of ``'joint'``. The default always
        agrees.
    :return: ``None`` if ``question`` declined, otherwise the ``'result'``
        entry of the value returned by ``fork_job``.
    :raises BadFormat: If the file starts with ``TPZ``, its header cannot be
        parsed (``MobiError``), or the header has no KF8 type.
    :raises DRMError: If the header reports a non-zero encryption type.
    '''
    with open(path, 'rb') as stream:
        # Sniff the leading magic bytes, then rewind so the header parser
        # sees the stream from the start.
        magic = stream.read(3)
        stream.seek(0)
        if magic == b'TPZ':
            raise BadFormat(_('This is not a MOBI file. It is a Topaz file.'))

        try:
            header = MetadataHeader(stream, default_log)
        except MobiError:
            raise BadFormat(_('This is not a MOBI file.'))

        if header.encryption_type != 0:
            raise DRMError(_('This file is locked with DRM. It cannot be tweaked.'))

        kf8_type = header.kf8_type
        if kf8_type is None:
            raise BadFormat(_('This MOBI file does not contain a KF8 format '
                              'book. KF8 is the new format from Amazon. calibre can '
                              'only tweak MOBI files that contain KF8 books. Older '
                              'MOBI files without KF8 are not tweakable.'))

        # Only joint files need confirmation; the message is built lazily so
        # that nothing is translated unless the question is actually asked.
        if kf8_type == 'joint' and not question(_(
                'This MOBI file contains both KF8 and '
                'older Mobi6 data. Tweaking it will remove the Mobi6 data, which '
                'means the file will not be usable on older Kindles. Are you '
                'sure?')):
            return None

    job = fork_job('calibre.ebooks.mobi.tweak', 'do_explode',
                   args=(path, dest), no_output=True)
    return job['result']
def check_for_drm(self):
    '''
    Raise ``DRMError`` if the book header reports a non-zero encryption type.

    The error is raised with the title found at
    ``self.book_header.exth.mi.title``. If that lookup fails, or yields an
    empty value, ``self.name`` is used instead.

    :return: ``None`` when the book is not encrypted.
    :raises DRMError: When ``self.book_header.encryption_type`` is not 0.
    '''
    if self.book_header.encryption_type == 0:
        return

    try:
        name = self.book_header.exth.mi.title
    except Exception:
        # Any part of the exth/mi chain may be missing. Fall back to the
        # plain name, but let KeyboardInterrupt/SystemExit propagate.
        name = self.name

    raise DRMError(name or self.name)
def read_drm(self):
    '''
    Work out the DRM level of the book from ``self.entries`` and, where
    possible, recover the book key.

    ``self.drmlevel`` is set to 5, 3 or 1 according to the first DRM storage
    entry found (checked in that order), or left at 0 when none is present,
    in which case the method returns immediately.

    For levels below 5 the ``/DRMStorage/DRMSealed`` file is run through
    ``msdes`` and bytes 1-8 of the result are stored in ``self.bookkey``.

    :raises LitError: If the first element of the decrypted data is not
        ``'\\000'``.
    :raises DRMError: If the DRM level is 5.
    '''
    # (entry name, DRM level) pairs; the first entry that is present decides.
    markers = (
        ('/DRMStorage/Licenses/EUL', 5),
        ('/DRMStorage/DRMBookplate', 3),
        ('/DRMStorage/DRMSealed', 1),
    )

    self.drmlevel = 0
    for entry_name, level in markers:
        if entry_name in self.entries:
            self.drmlevel = level
            break
    else:
        # No DRM storage entry at all: nothing to decrypt.
        return

    if self.drmlevel >= 5:
        raise DRMError("Cannot access DRM-protected book")

    msdes.deskey(self.calculate_deskey(), msdes.DE1)
    sealed = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
    # NOTE(review): this compares against a text literal; if msdes.des
    # returns bytes on Python 3, sealed[0] is an int and the check always
    # fails -- confirm against msdes.
    if sealed[0] != '\000':
        raise LitError('Unable to decrypt title key!')
    self.bookkey = sealed[1:9]
def __init__(self, header, stream, log, options):
    '''
    Load all sections of a 132 byte header eReader book and read its metadata.

    :param header: Object exposing ``num_sections`` and ``section_data(i)``.
    :param stream: Stream handed to ``get_metadata`` to populate ``self.mi``.
    :param log: Logger; stored as ``self.log``.
    :param options: Conversion options; ``input_encoding`` is stored as
        ``self.encoding``.
    :raises DRMError: If the header record reports compression 260 or 272.
    :raises EreaderError: If the compression is any other value besides
        2 and 10.
    '''
    self.log = log
    self.encoding = options.input_encoding

    self.log.debug('132 byte header version found.')

    self.sections = [
        header.section_data(idx) for idx in range(header.num_sections)
    ]

    self.header_record = HeaderRecord(self.section_data(0))

    compression = self.header_record.compression
    if compression not in (2, 10):
        if compression in (260, 272):
            raise DRMError('eReader DRM is not supported.')
        raise EreaderError('Unknown book compression %i.' % compression)

    # Imported here, after the compression check, exactly as before.
    from calibre.ebooks.metadata.pdb import get_metadata
    self.mi = get_metadata(stream, False)
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: Directory that receives a temporary copy of the PDF
        (``src.pdf``, removed again at the end) and the generated
        ``index.html`` / ``index.xml``.
    :param pdf_path: Path of the PDF file to convert.
    :param no_images: If true, ``-i`` is passed to pdftohtml.
    :param as_xml: If true, ``-xml`` is passed and ``index.xml`` is written;
        the HTML post-processing and outline extraction are skipped.
    :raises ConversionError: If the pdftohtml executable cannot be found or
        it exits with a non-zero return code.
    :raises DRMError: If no index file was produced or it is smaller than
        100 bytes.
    '''
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        # pdftohtml is run from inside output_dir, so only base names are
        # passed on the command line.
        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [
            exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
            '-nodrm', a(pdfsrc), a(index)
        ]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the captured output back through a context manager so the
        # handle is closed deterministically.
        with lopen(logf.name, 'rb') as logfile:
            out = logfile.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags,
                # this breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric anchors into ids prefixed with "p" and point
                # the links that targeted index.html#N at them.
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run, restricted to page 1 and written to stdout as XML;
            # its output is handed to parse_outline.
            cmd = [
                exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
                '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
                a(pdfsrc)
            ]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        try:
            os.remove(pdfsrc)
        except OSError:
            # Best effort: a leftover copy of the source PDF is harmless.
            pass
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a KePub file into a structure calibre can process.

    The archive is extracted into the current working directory, the OPF is
    located and normalised, and a cleaned ``content.opf`` is written there.

    Returns the absolute path of the written ``content.opf``.

    Raises ``ValueError`` if no OPF can be found, if the manifest contains
    DTBook markup, or if the spine ends up empty. Raises ``DRMError`` if a
    ``rights.xml`` file is present after extraction.
    """
    log("KEPUBInput::convert - start")
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF

    try:
        archive = ZipFile(stream)
        cwd = os.getcwdu() if sys.version_info.major == 2 else os.getcwd()
        archive.extractall(cwd)
    except Exception:
        log.exception("KEPUB appears to be invalid ZIP file, trying a "
                      "more forgiving ZIP parser")
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)

    opf = self.find_opf()
    if opf is None:
        # Fall back to the first plausible .opf file anywhere in the tree.
        opf = next(
            (os.path.abspath(candidate) for candidate in walk(".")
             if candidate.lower().endswith(".opf")
             and "__MACOSX" not in candidate
             and not os.path.basename(candidate).startswith(".")),
            None)

    path = getattr(stream, "name", "stream")
    if opf is None:
        raise ValueError(
            _("{0} is not a valid KEPUB file (could not find opf)")  # noqa: F821
            .format(path))

    if os.path.exists(os.path.abspath("rights.xml")):
        raise DRMError(os.path.basename(path))

    cwd = os.getcwdu() if sys.version_info.major == 2 else os.getcwd()
    opf = os.path.relpath(opf, cwd)
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self.encrypted_fonts = []

    if len(parts) > 1 and parts[0]:
        # The OPF lives in a sub-directory: prefix every href with it.
        delta = "/".join(parts[:-1]) + "/"
        for iterate in (opf.itermanifest, opf.iterguide):
            for elem in iterate():
                elem.set("href", delta + elem.get("href"))

    if opf.package_version >= 3.0:
        rationalize = self.rationalize_cover3
    else:
        rationalize = self.rationalize_cover2
    self.removed_cover = rationalize(opf, log)

    self.optimize_opf_parsing = opf

    if any(item.get("media-type", "") == "application/x-dtbook+xml"
           for item in opf.itermanifest()):
        raise ValueError(
            _("EPUB files with DTBook markup are not supported")  # noqa: F821
        )

    # Manifest media types that must never be referenced from the spine.
    non_spine_types = {
        "application/vnd.adobe-page-template+xml",
        "application/vnd.adobe.page-template+xml",
        "application/adobe-page-template+xml",
        "application/adobe.page-template+xml",
        "application/text",
    }
    not_for_spine = set()
    for item in opf.itermanifest():
        item_id = item.get("id", None)
        if item_id and item.get("media-type", None) in non_spine_types:
            not_for_spine.add(item_id)

    # Drop spine entries that are empty, excluded or duplicated.
    seen = set()
    for itemref in list(opf.iterspine()):
        ref = itemref.get("idref", None)
        if ref and ref not in not_for_spine and ref not in seen:
            seen.add(ref)
        else:
            itemref.getparent().remove(itemref)

    if not list(opf.iterspine()):
        raise ValueError(
            _("No valid entries in the spine of this EPUB")  # noqa: F821
        )

    with open("content.opf", "wb") as nopf:
        nopf.write(opf.render())

    return os.path.abspath("content.opf")
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Extract an EPUB into the current working directory, normalise its OPF
    and write the result as ``content.opf``.

    :param stream: File-like object containing the EPUB; its ``name``
        attribute (if any) is used in error messages.
    :param options: Conversion options, forwarded to ``convert_epub3_nav``.
    :param file_ext: Not used by this method.
    :param log: Logger used for diagnostics.
    :param accelerators: Not used by this method.
    :return: Absolute path of the written ``content.opf``.
    :raises ValueError: If no OPF is found, the manifest contains DTBook
        markup, or no valid spine entries remain.
    :raises DRMError: If ``META-INF/encryption.xml`` exists and
        ``process_encryption`` reports failure.
    '''
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF

    # os.getcwdu only exists on Python 2; fall back to os.getcwd elsewhere.
    getcwd = getattr(os, 'getcwdu', os.getcwd)

    try:
        zf = ZipFile(stream)
        zf.extractall(getcwd())
    except Exception:
        # Any extraction failure triggers the tolerant parser, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        log.exception('EPUB appears to be invalid ZIP file, trying a'
                      ' more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)

    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        for f in walk(u'.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError(
            '%s is not a valid EPUB file (could not find opf)' % path)

    opf = os.path.relpath(opf, getcwd())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(path))
    self.encrypted_fonts = self._encrypted_font_uris

    if len(parts) > 1 and parts[0]:
        # The OPF lives in a sub-directory: prefix every href with it.
        delta = '/'.join(parts[:-1]) + '/'

        def normpath(href):
            # Use the argument rather than reaching for the loop variable
            # through the closure.
            return posixpath.normpath(delta + href)

        for elem in opf.itermanifest():
            elem.set('href', normpath(elem.get('href')))
        for elem in opf.iterguide():
            elem.set('href', normpath(elem.get('href')))

    f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
    self.removed_cover = f(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover, )
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    # Manifest media types that must never be referenced from the spine.
    page_template_types = {
        'application/vnd.adobe-page-template+xml',
        'application/vnd.adobe.page-template+xml',
        'application/adobe-page-template+xml',
        'application/adobe.page-template+xml',
        'application/text'
    }
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in page_template_types:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    # Drop spine entries that are empty, excluded or duplicated.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if not list(opf.iterspine()):
        raise ValueError('No valid entries in the spine of this EPUB')

    with lopen('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath(u'content.opf')
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: Directory that receives a temporary copy of the PDF
        (``src.pdf``, removed again at the end) and the generated
        ``index.html`` / ``index.xml``.
    :param pdf_path: Path of the PDF file to convert.
    :param no_images: If true, ``-i`` is passed to pdftohtml.
    :param as_xml: If true, ``-xml`` is passed and ``index.xml`` is written;
        the HTML post-processing and outline extraction are skipped.
    :raises ConversionError: If the pdftohtml executable cannot be found or
        it exits with a non-zero return code.
    :raises DRMError: If no index file was produced or it is smaller than
        100 bytes.
    '''
    pdfsrc = os.path.join(output_dir, u'src.pdf')
    index = os.path.join(output_dir, u'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on Windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are
        # bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(
            PDFTOHTML, unicode) else PDFTOHTML

        cmd = [
            exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
            b'-nodrm', b'-q', a(pdfsrc), a(index)
        ]

        if isbsd:
            cmd.remove(b'-nodrm')
        if no_images:
            cmd.append(b'-i')
        if as_xml:
            cmd.append(b'-xml')

        logf = PersistentTemporaryFile(u'pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise

        # Retry the wait if it is interrupted by a signal.
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the captured output back through a context manager so the
        # handle is closed deterministically.
        with open(logf.name, 'rb') as logfile:
            out = logfile.read().strip()
        if ret != 0:
            raise ConversionError(b'return code: %d\n%s' % (ret, out))
        if out:
            # Single-argument parenthesised form prints the same text whether
            # print is a statement or a function.
            print("pdftohtml log:")
            print(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read()
                raw = flip_images(raw)
                raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags,
                # this breaks the pdf heuristics regexps, so replace them
                raw = raw.replace(b'<br/>', b'<br>')
                raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
                i.write(raw)

            # Second run, restricted to page 1 and written to stdout as XML;
            # its output is handed to parse_outline.
            cmd = [
                exe, b'-f', b'1', b'-l', b'1', b'-xml', b'-i', b'-enc',
                b'UTF-8', b'-noframes', b'-p', b'-nomerge', b'-nodrm', b'-q',
                b'-stdout', a(pdfsrc)
            ]
            # Must happen before the command is launched, otherwise the BSD
            # build is still invoked with -nodrm.
            if isbsd:
                cmd.remove(b'-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        try:
            os.remove(pdfsrc)
        except OSError:
            # Best effort: a leftover copy of the source PDF is harmless.
            pass
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Extract a KEPUB into the current working directory, normalise its OPF
    and write the result as ``content.opf``.

    :param stream: File-like object containing the KEPUB; its ``name``
        attribute (if any) is used in error messages.
    :param options: Not used by this method.
    :param file_ext: Not used by this method.
    :param log: Logger used for diagnostics.
    :param accelerators: Not used by this method.
    :return: Absolute path of the written ``content.opf``.
    :raises ValueError: If no OPF is found, the manifest contains DTBook
        markup, or no valid spine entries remain.
    :raises DRMError: If a ``rights.xml`` file is present after extraction.
    '''
    log("KEPUBInput::convert - start")
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF

    # os.getcwdu only exists on Python 2; fall back to os.getcwd elsewhere.
    getcwd = getattr(os, 'getcwdu', os.getcwd)

    try:
        zf = ZipFile(stream)
        zf.extractall(getcwd())
    except Exception:
        # Any extraction failure triggers the tolerant parser, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        log.exception('KEPUB appears to be invalid ZIP file, trying a '
                      'more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)

    opf = self.find_opf()
    if opf is None:
        for f in walk(u'.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break

    path = getattr(stream, 'name', 'stream')
    if opf is None:
        raise ValueError(
            _('%s is not a valid KEPUB file (could not find opf)') % path)

    encfile = os.path.abspath('rights.xml')
    if os.path.exists(encfile):
        raise DRMError(os.path.basename(path))

    opf = os.path.relpath(opf, getcwd())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self.encrypted_fonts = []

    if len(parts) > 1 and parts[0]:
        # The OPF lives in a sub-directory: prefix every href with it.
        delta = '/'.join(parts[:-1]) + '/'
        for elem in opf.itermanifest():
            elem.set('href', delta + elem.get('href'))
        for elem in opf.iterguide():
            elem.set('href', delta + elem.get('href'))

    f = self.rationalize_cover3 if opf.package_version >= 3.0 else \
        self.rationalize_cover2
    self.removed_cover = f(opf, log)

    self.optimize_opf_parsing = opf
    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                _('EPUB files with DTBook markup are not supported'))

    # Manifest media types that must never be referenced from the spine.
    page_template_types = {
        'application/vnd.adobe-page-template+xml',
        'application/vnd.adobe.page-template+xml',
        'application/adobe-page-template+xml',
        'application/adobe.page-template+xml',
        'application/text'
    }
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_ and y.get('media-type', None) in page_template_types:
            not_for_spine.add(id_)

    # Drop spine entries that are empty, excluded or duplicated.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if not list(opf.iterspine()):
        raise ValueError(_('No valid entries in the spine of this EPUB'))

    with open('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath(u'content.opf')
def call_convert_cmd(log, output_dir, pdf_name, first=None, last=None):
    '''
    Convert the pdf into xml/txt using the pdftohtml/text app.
    This will write the output as index.xml/.txt into output_dir.
    pdftotext is often better than pdftohtml.

    :param log: Callable used to report the converter's output.
    :param output_dir: Directory containing ``pdf_name``; the index file is
        written here. An existing index file is removed first.
    :param pdf_name: File name of the PDF inside ``output_dir``.
    :param first: Optional first page, passed as ``-f``.
    :param last: Optional last page, passed as ``-l``.
    :return: The extracted text. With pdftotext this is the content of
        ``index.txt``; otherwise it is the concatenated text of all
        ``text`` elements found in ``index.xml``.
    :raises ConversionError: If the executable cannot be found or it exits
        with a non-zero return code.
    :raises DRMError: If no index file was produced.
    '''
    from calibre.ebooks.pdf.pdftohtml import popen
    pdfsrc = os.path.join(output_dir, pdf_name)
    if USE_PDFTOTEXT:
        EXE = 'pdftotext'
        index_file = os.path.join(output_dir, 'index.txt')
    else:
        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML as EXE
        index_file = os.path.join(output_dir, 'index.xml')

    if os.path.exists(index_file):
        os.remove(index_file)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on Windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = EXE.encode(filesystem_encoding) if isinstance(EXE, str) else EXE

        if USE_PDFTOTEXT:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-nopgbrk', b'-q',
                a(pdfsrc), a(index_file)
            ]
        else:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
                b'-nodrm', b'-q', a(pdfsrc), a(index_file), b'-xml', b'-i'
            ]

        # The pdftotext command line has no -nodrm flag, so only remove it
        # when it is actually there (list.remove raises ValueError otherwise).
        if isbsd and b'-nodrm' in cmd:
            cmd.remove(b'-nodrm')

        # Page numbers are encoded so every argument stays a bytestring.
        if first is not None:
            cmd.append(b'-f')
            cmd.append(str(first).encode('ascii'))
        if last is not None:
            cmd.append(b'-l')
            cmd.append(str(last).encode('ascii'))

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find %s, check it is in your PATH') % EXE)
            else:
                raise

        # Retry the wait if it is interrupted by a signal.
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the captured output back through a context manager so the
        # handle is closed deterministically.
        with open(logf.name, 'rb') as logfile:
            out = logfile.read().strip()
        if ret != 0:
            raise ConversionError(out)
        if out:
            log('%s log:' % EXE)
            log(out)
        if not os.path.exists(index_file):
            raise DRMError()

    if USE_PDFTOTEXT:
        with open(index_file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
    else:
        with open(index_file, 'r', encoding='utf-8', errors='ignore') as f:
            # avoid encoding problems
            content = f.read().encode('utf-8')
        parser = etree.XMLParser(recover=True)
        tree = etree.fromstring(clean_ascii_chars(content), parser)
        text = ''.join(e.text or '' for e in tree.iter('text'))
    return text