예제 #1
0
def explode(path, dest, question=lambda x:True):
    '''
    Validate the MOBI file at ``path`` and explode its KF8 book into ``dest``.

    The file is rejected with BadFormat when it is a Topaz file, when its
    header cannot be parsed, or when it holds no KF8 book, and with DRMError
    when the header reports encryption. For a joint Mobi6/KF8 file the
    ``question`` callable is invoked with a confirmation prompt; a falsy
    answer makes this function return None without doing anything.

    Otherwise ``do_explode`` from ``calibre.ebooks.mobi.tweak`` is run through
    ``fork_job`` and the ``'result'`` entry of what it returns is handed back.
    '''
    with open(path, 'rb') as mobi_file:
        magic = mobi_file.read(3)
        # Rewind so the header parser sees the file from the start
        mobi_file.seek(0)
        if magic == b'TPZ':
            raise BadFormat(_('This is not a MOBI file. It is a Topaz file.'))

        try:
            mobi_header = MetadataHeader(mobi_file, default_log)
        except MobiError:
            raise BadFormat(_('This is not a MOBI file.'))

        if mobi_header.encryption_type != 0:
            raise DRMError(_('This file is locked with DRM. It cannot be tweaked.'))

        book_kind = mobi_header.kf8_type

        if book_kind is None:
            raise BadFormat(_('This MOBI file does not contain a KF8 format '
                    'book. KF8 is the new format from Amazon. calibre can '
                    'only tweak MOBI files that contain KF8 books. Older '
                    'MOBI files without KF8 are not tweakable.'))

        # Only joint files need confirmation, because the Mobi6 half is lost
        if book_kind == 'joint' and not question(_(
                'This MOBI file contains both KF8 and '
                'older Mobi6 data. Tweaking it will remove the Mobi6 data, which '
                'means the file will not be usable on older Kindles. Are you '
                'sure?')):
            return None

    job = fork_job('calibre.ebooks.mobi.tweak', 'do_explode',
            args=(path, dest), no_output=True)
    return job['result']
예제 #2
0
파일: mobi6.py 프로젝트: wh0197m/calibre
 def check_for_drm(self):
     '''
     Raise DRMError when the book header reports any encryption.

     The error is raised with the title read from the EXTH metadata when
     that lookup succeeds and yields a non-empty value; in every other case
     ``self.name`` is used instead. Returns None for unencrypted books.
     '''
     if self.book_header.encryption_type == 0:
         return
     try:
         name = self.book_header.exth.mi.title
     except Exception:
         # The metadata lookup is best-effort. Catch Exception rather than
         # using a bare except so that KeyboardInterrupt and SystemExit are
         # not silently converted into a DRMError.
         name = None
     raise DRMError(name or self.name)
예제 #3
0
파일: reader.py 프로젝트: pkuhzx/calibre
 def read_drm(self):
     '''
     Work out the DRM level from ``self.entries`` and load the book key.

     ``self.drmlevel`` is set to 5, 3 or 1 depending on which
     ``/DRMStorage`` entry is found first (checked in that order), or left
     at 0 when none is present, in which case the method returns
     immediately.

     For levels below 5 the sealed key is decrypted with ``msdes`` and bytes
     1..8 of the result are stored in ``self.bookkey``.

     Raises LitError when the decrypted key does not start with a NUL byte,
     and DRMError for level 5 books.
     '''
     self.drmlevel = 0
     if '/DRMStorage/Licenses/EUL' in self.entries:
         self.drmlevel = 5
     elif '/DRMStorage/DRMBookplate' in self.entries:
         self.drmlevel = 3
     elif '/DRMStorage/DRMSealed' in self.entries:
         self.drmlevel = 1
     else:
         return
     if self.drmlevel < 5:
         msdes.deskey(self.calculate_deskey(), msdes.DE1)
         bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
         # Compare a one-byte slice, not bookkey[0]: indexing bytes on
         # Python 3 yields an int, which never equals a string, so the
         # check would reject every key. The slice also copes with empty
         # output instead of raising IndexError.
         # NOTE(review): assumes msdes.des returns a byte string - confirm.
         if bookkey[:1] != b'\x00':
             raise LitError('Unable to decrypt title key!')
         self.bookkey = bookkey[1:9]
     else:
         raise DRMError("Cannot access DRM-protected book")
예제 #4
0
    def __init__(self, header, stream, log, options):
        '''
        Load every section of the book and validate its compression type.

        All ``header.num_sections`` sections are read through
        ``header.section_data`` and section 0 is parsed into a HeaderRecord.
        Compression values 2 and 10 are accepted; 260 and 272 raise DRMError
        and anything else raises EreaderError. Finally the metadata is read
        from ``stream`` and kept in ``self.mi``.
        '''
        self.log = log
        self.encoding = options.input_encoding

        self.log.debug('132 byte header version found.')

        self.sections = [
            header.section_data(idx) for idx in range(header.num_sections)
        ]

        self.header_record = HeaderRecord(self.section_data(0))

        compression = self.header_record.compression
        if compression in (260, 272):
            raise DRMError('eReader DRM is not supported.')
        if compression not in (2, 10):
            raise EreaderError('Unknown book compression %i.' % compression)

        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)
예제 #5
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: directory that receives ``index.html`` (or
        ``index.xml`` when ``as_xml`` is true) and a temporary ``src.pdf``
        copy of the input.
    :param pdf_path: path of the PDF to convert.
    :param no_images: when true, ``-i`` is passed to pdftohtml.
    :param as_xml: when true, ``-xml`` is passed and the HTML
        post-processing and outline extraction are skipped.
    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code.
    :raises DRMError: if the index file is missing or smaller than 100
        bytes after a successful run.
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        # Only base names are passed on the command line, since the
        # subprocess is started from inside output_dir
        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [
            exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm',
            a(pdfsrc),
            a(index)
        ]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the captured output through a context manager so the handle
        # is closed deterministically instead of being left to the garbage
        # collector
        with lopen(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' %
                                  (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        # A missing or nearly empty index is reported as DRM
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head',
                    '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric <a name=N> anchors into <a id="pN"> and point
                # the index.html#N links at them
                raw = re.sub(r'<a\s+name=(\d+)',
                             r'<a id="\1"',
                             raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"',
                             raw,
                             flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: first page only, XML on stdout, handed to
            # parse_outline
            cmd = [
                exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
                '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
                a(pdfsrc)
            ]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Removing the temporary copy is best-effort: ignore filesystem
        # errors only, not interrupts or programming errors
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
예제 #6
0
    def convert(self, stream, options, file_ext, log, accelerators):
        """Unpack a KePub archive into the current directory and normalise its OPF.

        The archive in ``stream`` is extracted (with a fallback to the more
        forgiving ``localunzip`` parser), the OPF is located, manifest and
        guide hrefs are prefixed with the OPF's directory, the cover is
        rationalised and unusable spine entries are dropped. The resulting
        OPF is written to ``content.opf`` and its absolute path is returned.

        Raises ValueError when no OPF is found, when the manifest contains
        DTBook markup or when the spine ends up empty, and DRMError when a
        ``rights.xml`` file exists in the current directory after extraction.
        """
        log("KEPUBInput::convert - start")
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF

        py2 = sys.version_info.major == 2

        try:
            archive = ZipFile(stream)
            archive.extractall(os.getcwdu() if py2 else os.getcwd())
        except Exception:
            log.exception("KEPUB appears to be invalid ZIP file, trying a "
                          "more forgiving ZIP parser")
            from calibre.utils.localunzip import extractall

            stream.seek(0)
            extractall(stream)

        opf_path = self.find_opf()
        if opf_path is None:
            # Fall back to the first visible .opf outside any __MACOSX folder
            opf_path = next(
                (os.path.abspath(entry) for entry in walk(".")
                 if entry.lower().endswith(".opf")
                 and "__MACOSX" not in entry
                 and not os.path.basename(entry).startswith(".")),
                None)
        source_name = getattr(stream, "name", "stream")

        if opf_path is None:
            raise ValueError(
                _(  # noqa: F821
                    "{0} is not a valid KEPUB file (could not find opf)").
                format(source_name))

        if os.path.exists(os.path.abspath("rights.xml")):
            raise DRMError(os.path.basename(source_name))

        relative_opf = os.path.relpath(
            opf_path, os.getcwdu() if py2 else os.getcwd())
        head_tail = os.path.split(relative_opf)
        opf = OPF(relative_opf,
                  os.path.dirname(os.path.abspath(relative_opf)))

        self.encrypted_fonts = []

        if len(head_tail) > 1 and head_tail[0]:
            # Prefix hrefs with the OPF's directory, since content.opf is
            # written to the current directory
            prefix = "/".join(head_tail[:-1]) + "/"
            for iter_items in (opf.itermanifest, opf.iterguide):
                for elem in iter_items():
                    elem.set("href", prefix + elem.get("href"))

        if opf.package_version >= 3.0:
            rationalize_cover = self.rationalize_cover3
        else:
            rationalize_cover = self.rationalize_cover2
        self.removed_cover = rationalize_cover(opf, log)

        self.optimize_opf_parsing = opf
        if any(item.get("media-type", "") == "application/x-dtbook+xml"
               for item in opf.itermanifest()):
            raise ValueError(
                _("EPUB files with DTBook markup are not supported"
                  )  # noqa: F821
            )

        # Manifest ids with these media types must not stay in the spine
        unspinable_types = frozenset((
            "application/vnd.adobe-page-template+xml",
            "application/vnd.adobe.page-template+xml",
            "application/adobe-page-template+xml",
            "application/adobe.page-template+xml",
            "application/text",
        ))
        not_for_spine = set(
            item.get("id", None) for item in opf.itermanifest()
            if item.get("id", None)
            and item.get("media-type", None) in unspinable_types)

        # Keep only the first spine reference to each allowed manifest id
        seen = set()
        for itemref in tuple(opf.iterspine()):
            ref = itemref.get("idref", None)
            if ref and ref not in not_for_spine and ref not in seen:
                seen.add(ref)
            else:
                itemref.getparent().remove(itemref)

        if not list(opf.iterspine()):
            raise ValueError(
                _("No valid entries in the spine of this EPUB")  # noqa: F821
            )

        with open("content.opf", "wb") as opf_file:
            opf_file.write(opf.render())

        return os.path.abspath("content.opf")
예제 #7
0
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Unpack an EPUB into the current directory and normalise its OPF.

        The archive in ``stream`` is extracted (falling back to the more
        forgiving ``localunzip`` parser), the OPF is located, encryption
        information is processed, manifest and guide hrefs are rewritten
        relative to the current directory, the cover is rationalised, an
        EPUB 3 nav document is converted when present and unusable spine
        entries are dropped. The resulting OPF is written to ``content.opf``
        and its absolute path is returned.

        Raises ValueError when no OPF is found, when the manifest contains
        DTBook markup or when the spine ends up empty, and DRMError when
        ``process_encryption`` reports failure for an existing
        ``META-INF/encryption.xml``.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        try:
            zf = ZipFile(stream)
            zf.extractall(os.getcwdu())
        except Exception:
            # Exception, not a bare except: an interrupt must abort the
            # conversion instead of triggering the fallback parser
            log.exception('EPUB appears to be invalid ZIP file, trying a'
                          ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = self.find_opf()
        if opf is None:
            # Fall back to the first visible .opf outside any __MACOSX folder
            for f in walk(u'.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError(
                '%s is not a valid EPUB file (could not find opf)' % path)

        opf = os.path.relpath(opf, os.getcwdu())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))
        self.encrypted_fonts = self._encrypted_font_uris

        if len(parts) > 1 and parts[0]:
            delta = '/'.join(parts[:-1]) + '/'

            def normpath(href):
                # Use the argument; the previous version ignored it and read
                # the enclosing loop variable through the closure, which
                # only worked because of how the loops below happen to call it
                return posixpath.normpath(delta + href)

            for elem in opf.itermanifest():
                elem.set('href', normpath(elem.get('href')))
            for elem in opf.iterguide():
                elem.set('href', normpath(elem.get('href')))

        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
        self.removed_cover = f(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover, )
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        # Collect manifest ids that must not stay in the spine
        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        # Drop spine entries without an idref, with an excluded idref, or
        # that repeat an idref already seen
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if len(list(opf.iterspine())) == 0:
            raise ValueError('No valid entries in the spine of this EPUB')

        with lopen('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath(u'content.opf')
예제 #8
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: directory that receives ``index.html`` (or
        ``index.xml`` when ``as_xml`` is true) and a temporary ``src.pdf``
        copy of the input.
    :param pdf_path: path of the PDF to convert.
    :param no_images: when true, ``-i`` is passed to pdftohtml.
    :param as_xml: when true, ``-xml`` is passed and the HTML
        post-processing and outline extraction are skipped.
    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code.
    :raises DRMError: if the index file is missing or smaller than 100
        bytes after a successful run.
    '''

    pdfsrc = os.path.join(output_dir, u'src.pdf')
    index = os.path.join(output_dir, u'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are
        # bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(
            PDFTOHTML, unicode) else PDFTOHTML

        cmd = [
            exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
            b'-nodrm', b'-q',
            a(pdfsrc),
            a(index)
        ]

        if isbsd:
            cmd.remove(b'-nodrm')
        if no_images:
            cmd.append(b'-i')
        if as_xml:
            cmd.append(b'-xml')

        logf = PersistentTemporaryFile(u'pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise

        # Retry the wait if it is interrupted by a signal
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the captured output through a context manager so the handle
        # is closed deterministically
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().strip()
        if ret != 0:
            raise ConversionError(b'return code: %d\n%s' % (ret, out))
        if out:
            # Single-argument parenthesised form prints identically under
            # the Python 2 print statement
            print("pdftohtml log:")
            print(out)
        # A missing or nearly empty index is reported as DRM
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read()
                raw = flip_images(raw)
                raw = b'<!-- created by calibre\'s pdftohtml -->\n' + raw
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace(b'<br/>', b'<br>')
                raw = re.sub(br'<a\s+name=(\d+)',
                             br'<a id="\1"',
                             raw,
                             flags=re.I)
                i.write(raw)

            # Second run: first page only, XML on stdout, handed to
            # parse_outline
            cmd = [
                exe, b'-f', b'1', b'-l', b'1', b'-xml', b'-i', b'-enc',
                b'UTF-8', b'-noframes', b'-p', b'-nomerge', b'-nodrm', b'-q',
                b'-stdout',
                a(pdfsrc)
            ]
            # The flag must be dropped before the process is launched;
            # removing it afterwards (as was done previously) has no effect
            if isbsd:
                cmd.remove(b'-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Removing the temporary copy is best-effort: ignore filesystem
        # errors only, not interrupts or programming errors
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
예제 #9
0
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Unpack a KePub archive into the current directory and normalise its OPF.

        The archive in ``stream`` is extracted (falling back to the more
        forgiving ``localunzip`` parser), the OPF is located, manifest and
        guide hrefs are prefixed with the OPF's directory, the cover is
        rationalised and unusable spine entries are dropped. The resulting
        OPF is written to ``content.opf`` and its absolute path is returned.

        Raises ValueError when no OPF is found, when the manifest contains
        DTBook markup or when the spine ends up empty, and DRMError when a
        ``rights.xml`` file exists in the current directory after extraction.
        '''
        log("KEPUBInput::convert - start")
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        try:
            zf = ZipFile(stream)
            zf.extractall(os.getcwdu())
        except Exception:
            # Exception, not a bare except: an interrupt must abort the
            # conversion instead of triggering the fallback parser
            log.exception('KEPUB appears to be invalid ZIP file, trying a '
                          'more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        opf = self.find_opf()
        if opf is None:
            # Fall back to the first visible .opf outside any __MACOSX folder
            for f in walk(u'.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError(
                _('%s is not a valid KEPUB file (could not find opf)') % path)

        encfile = os.path.abspath('rights.xml')
        if os.path.exists(encfile):
            raise DRMError(os.path.basename(path))

        opf = os.path.relpath(opf, os.getcwdu())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self.encrypted_fonts = []

        if len(parts) > 1 and parts[0]:
            # Prefix hrefs with the OPF's directory, since content.opf is
            # written to the current directory
            delta = '/'.join(parts[:-1]) + '/'
            for elem in opf.itermanifest():
                elem.set('href', delta + elem.get('href'))
            for elem in opf.iterguide():
                elem.set('href', delta + elem.get('href'))

        rationalize_cover = self.rationalize_cover3 \
            if opf.package_version >= 3.0 else self.rationalize_cover2
        self.removed_cover = rationalize_cover(opf, log)

        self.optimize_opf_parsing = opf
        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    _('EPUB files with DTBook markup are not supported'))

        # Collect manifest ids that must not stay in the spine
        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_ and y.get('media-type', None) in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml', 'application/text'
            }:
                not_for_spine.add(id_)

        # Drop spine entries without an idref, with an excluded idref, or
        # that repeat an idref already seen
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if not list(opf.iterspine()):
            raise ValueError(_('No valid entries in the spine of this EPUB'))

        with open('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath(u'content.opf')
예제 #10
0
def call_convert_cmd(log, output_dir, pdf_name, first=None, last=None):
    '''
    Convert the pdf into xml/txt using the pdftohtml/text app.
    This will write the output as index.xml/.txt into output_dir.

    pdftotext is often better than pdftohtml.

    :param log: callable used to report the converter's output.
    :param output_dir: directory containing ``pdf_name``; the index file is
        written here and any previous index file is removed first.
    :param pdf_name: file name of the PDF inside ``output_dir``.
    :param first: optional first page, passed to the converter as ``-f``.
    :param last: optional last page, passed to the converter as ``-l``.
    :return: the extracted text. With pdftohtml this is the concatenated
        text of every ``text`` element of the generated XML.
    :raises ConversionError: if the executable cannot be found or exits
        with a non-zero return code.
    :raises DRMError: if no index file exists after a successful run.
    '''
    from calibre.ebooks.pdf.pdftohtml import popen

    pdfsrc = os.path.join(output_dir, pdf_name)
    if USE_PDFTOTEXT:
        EXE = 'pdftotext'
        index_file = os.path.join(output_dir, 'index.txt')
    else:
        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML as EXE
        index_file = os.path.join(output_dir, 'index.xml')

    if os.path.exists(index_file):
        os.remove(index_file)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = EXE.encode(filesystem_encoding) if isinstance(EXE, str) else EXE
        if USE_PDFTOTEXT:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-nopgbrk', b'-q',
                a(pdfsrc),
                a(index_file)
            ]
        else:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
                b'-nodrm', b'-q',
                a(pdfsrc),
                a(index_file), b'-xml', b'-i'
            ]
            if isbsd:
                cmd.remove(b'-nodrm')

        # Encode the page numbers so that every argument is a bytestring,
        # like the rest of the command line
        if first is not None:
            cmd.append(b'-f')
            cmd.append(str(first).encode('ascii'))
        if last is not None:
            cmd.append(b'-l')
            cmd.append(str(last).encode('ascii'))

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find %s, check it is in your PATH') % EXE)
            else:
                raise

        # Retry the wait if it is interrupted by a signal
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Close the log deterministically and decode it, so the error
        # message and the log output are text rather than a bytes object
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError(out)
        if out:
            log('%s log:' % EXE)
            log(out)
        if not os.path.exists(index_file):
            raise DRMError()

        with open(index_file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        if not USE_PDFTOTEXT:
            # Round-trip through utf-8 (undecodable bytes were dropped on
            # read) to avoid encoding problems in the XML parser
            content = text.encode('utf-8')
            parser = etree.XMLParser(recover=True)
            tree = etree.fromstring(clean_ascii_chars(content), parser)
            text = ''.join(e.text or '' for e in tree.iter('text'))
        return text