Ejemplo n.º 1
0
def explode(path, dest, question=lambda x: True):
    """Check that a MOBI file can be tweaked, then explode it in a job.

    The file is inspected before any work is forked. Topaz files, files
    whose metadata header cannot be parsed, DRM-encrypted files and files
    without KF8 data are rejected with an exception.

    :param path: Path of the MOBI file to inspect.
    :param dest: Forwarded together with ``path`` as the arguments of the
        forked ``do_explode`` job.
    :param question: Callable that receives a confirmation prompt. It is
        only called for files whose KF8 type is ``'joint'``; a falsy answer
        aborts the operation.
    :returns: ``None`` when ``question`` declines, otherwise the
        ``'result'`` entry of the value returned by ``fork_job``.
    :raises BadFormat: For Topaz files, unparsable headers and files
        without KF8 data.
    :raises DRMError: When the header reports a non-zero encryption type.
    """
    with open(path, 'rb') as fobj:
        # Peek at the leading bytes, then rewind so the header parser sees
        # the file from its beginning.
        magic = fobj.read(3)
        fobj.seek(0)
        if magic == b'TPZ':
            raise BadFormat('This is not a MOBI file. It is a Topaz file.')

        try:
            mobi_header = MetadataHeader(fobj, default_log)
        except MobiError:
            raise BadFormat('This is not a MOBI file.')

        if mobi_header.encryption_type != 0:
            raise DRMError('This file is locked with DRM. It cannot be '
                           'tweaked.')

        kind = mobi_header.kf8_type
        if kind is None:
            raise BadFormat('This MOBI file does not contain a KF8 format '
                            'book. KF8 is the new format from Amazon. calibre '
                            'can only tweak MOBI files that contain KF8 '
                            'books. Older MOBI files without KF8 are not '
                            'tweakable.')

        # Joint files lose their Mobi6 half, so ask the caller first.
        if kind == 'joint' and not question(
                'This MOBI file contains both KF8 and older Mobi6 '
                'data. Tweaking it will remove the Mobi6 data, '
                'which means the file will not be usable on older '
                'Kindles. Are you sure?'):
            return None

    job = fork_job('ebook_converter.ebooks.mobi.tweak', 'do_explode',
                   args=(path, dest), no_output=True)
    return job['result']
Ejemplo n.º 2
0
 def check_for_drm(self):
     """Raise DRMError if the book header reports encryption.

     The error carries the title from the EXTH metadata when it can be read
     and is non-empty; otherwise it carries ``self.name``.
     """
     if self.book_header.encryption_type == 0:
         return
     try:
         title = self.book_header.exth.mi.title
     except Exception:
         # Any failure reading the metadata falls back to self.name below.
         title = None
     raise DRMError(title or self.name)
Ejemplo n.º 3
0
    def __init__(self, header, stream, log, options):
        """Read all sections, validate the compression and load metadata.

        :param header: Object providing ``num_sections`` and
            ``section_data(index)``.
        :param stream: Passed to ``get_metadata`` to read the book metadata.
        :param log: Logger; stored as ``self.log``.
        :param options: Object whose ``input_encoding`` is stored as
            ``self.encoding``.
        :raises DRMError: When the header record compression is 260 or 272.
        :raises EreaderError: For any other compression besides 2 and 10.
        """
        self.log = log
        self.encoding = options.input_encoding

        self.log.debug('132 byte header version found.')

        self.sections = [
            header.section_data(idx) for idx in range(header.num_sections)
        ]

        self.header_record = HeaderRecord(self.section_data(0))

        compression = self.header_record.compression
        if compression in (260, 272):
            raise DRMError('eReader DRM is not supported.')
        if compression not in (2, 10):
            raise EreaderError('Unknown book compression %i.' % compression)

        from ebook_converter.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)
Ejemplo n.º 4
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: Directory that receives a temporary copy of the pdf
        (``src.pdf``) and the generated ``index.html`` / ``index.xml``.
    :param pdf_path: Path of the pdf to convert.
    :param no_images: When true, ``-i`` is passed to pdftohtml.
    :param as_xml: When true, ``-xml`` is passed and ``index.xml`` is
        produced; the HTML post-processing and outline parsing are skipped.
    :raises ConversionError: If the pdftohtml executable cannot be found or
        exits with a non-zero return code.
    :raises DRMError: If the index file is missing or smaller than 100 bytes
        after a successful run.
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            # The command is given bare file names, not full paths.
            return os.path.basename(x)

        cmd = [
            'pdftohtml', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
            '-nodrm',
            a(pdfsrc),
            a(index)
        ]

        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        # Both output streams of the child go to one temporary log file.
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError('Could not find pdftohtml, check it is '
                                      'in your PATH') from err
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is
        # closed deterministically instead of being leaked.
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: '
                                  '%d\n%s' % (ret, out))
        if out:
            print("pdftohtml log:")
            print(out)
        # A missing or near-empty index is reported as DRM.
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head', '<!-- created by ebook-converter\'s'
                    ' pdftohtml -->\n  <head', 1)
                # Rewind and empty the file so the rewritten markup replaces
                # the original content in place.
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags,
                # this breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric <a name=N> anchors into id="pN" and point the
                # matching index.html#N links at them.
                raw = re.sub(r'<a\s+name=(\d+)',
                             r'<a id="\1"',
                             raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"',
                             raw,
                             flags=re.I)
                raw = xml_replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: XML for the first page only, written to stdout,
            # which is handed to parse_outline.
            cmd = [
                'pdftohtml', '-f', '1', '-l', '1', '-xml', '-i', '-enc',
                'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q',
                '-stdout',
                a(pdfsrc)
            ]
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort removal of the temporary pdf copy; a failure to delete
        # it must not fail the conversion.
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
Ejemplo n.º 5
0
    def convert(self, stream, options, file_ext, log, accelerators):
        """Unpack an EPUB into the current directory and write content.opf.

        The archive is extracted into the current working directory, the OPF
        is located and loaded, encryption information is processed, the
        cover and EPUB 3 navigation are handled, and spine entries that are
        empty, duplicated or refer to non-content items are dropped. The
        resulting OPF is written to ``content.opf``.

        :param stream: File-like object holding the EPUB; its ``name``
            attribute, when present, is used in error messages.
        :param options: Forwarded to ``convert_epub3_nav``.
        :param file_ext: Not used by this method.
        :param log: Logger used for reporting.
        :param accelerators: Not used by this method.
        :returns: Absolute path of the written ``content.opf``.
        :raises ValueError: If no OPF is found, the book uses DTBook markup,
            or the spine has no valid entries left.
        :raises DRMError: If ``process_encryption`` returns a falsy value
            for an existing ``META-INF/encryption.xml``.
        """
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks import DRMError

        _path_or_stream = getattr(stream, 'name', 'stream')
        try:
            zf = ZipFile(stream)
            zf.extractall(os.getcwd())
        except Exception:
            log.exception('EPUB appears to be invalid ZIP file, trying a '
                          'more forgiving ZIP parser')
            from ebook_converter.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf_path = self.find_opf()
        if opf_path is None:
            # Fall back to scanning the extracted tree for an .opf file,
            # skipping hidden files and anything under __MACOSX.
            for root, _, fnames in os.walk('.'):
                for fname in fnames:
                    if fname.startswith('.'):
                        continue
                    candidate = os.path.join(root, fname)
                    if (candidate.lower().endswith('.opf')
                            and '__MACOSX' not in candidate):
                        opf_path = os.path.abspath(candidate)
                        break
                # Stop the walk at the first match; otherwise a later
                # directory would silently override it.
                if opf_path is not None:
                    break

        if opf_path is None:
            raise ValueError('%s is not a valid EPUB file (could not find '
                             'opf)' % _path_or_stream)

        opf_path = os.path.relpath(opf_path, os.getcwd())
        opf_dir = os.path.dirname(opf_path)
        opf = opf_meta.OPF(opf_path,
                           os.path.dirname(os.path.abspath(opf_path)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(_path_or_stream))
        self.encrypted_fonts = self._encrypted_font_uris

        # NOTE(gryf): check if opf is nested on the directory(ies), if so,
        # update the links for guide and manifest.
        if opf_dir:
            for elem in opf.itermanifest():
                elem.set('href', os.path.join(opf_dir, elem.get('href')))
            for elem in opf.iterguide():
                elem.set('href', os.path.join(opf_dir, elem.get('href')))

        if opf.package_version >= 3.0:
            rationalize_cover = self.rationalize_cover3
        else:
            rationalize_cover = self.rationalize_cover2
        self.removed_cover = rationalize_cover(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover, )
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        # Collect manifest ids that must not appear in the spine.
        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        # Drop spine items without an idref, excluded above, or repeated.
        # Iterate over a snapshot because items are removed on the way.
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if not list(opf.iterspine()):
            raise ValueError('No valid entries in the spine of this EPUB')

        with open('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')