Exemple #1
0
 def __enter__(self, *args):
     """
     Add this plugin to the python path so that it's contents become
     directly importable.  Useful when bundling large python libraries into
     the plugin. Use it like this::
         with plugin:
             import something
     """
     if self.plugin_path is not None:
         from ebook_converter.utils.zipfile import ZipFile
         zf = ZipFile(self.plugin_path)
         extensions = {x.rpartition('.')[-1].lower() for x in
                       zf.namelist()}
         zip_safe = True
         for ext in ('pyd', 'so', 'dll', 'dylib'):
             if ext in extensions:
                 zip_safe = False
                 break
         if zip_safe:
             sys.path.insert(0, self.plugin_path)
             self.sys_insertion_path = self.plugin_path
         else:
             from ebook_converter.ptempfile import TemporaryDirectory
             self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
             self.sys_insertion_path = (self._sys_insertion_tdir.
                                        __enter__(*args))
             zf.extractall(self.sys_insertion_path)
             sys.path.insert(0, self.sys_insertion_path)
         zf.close()
Exemple #2
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.metadata.toc import TOC
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.utils.zipfile import ZipFile

        self.options = options
        self.log = log
        pages, images = [], []
        toc = TOC()

        if file_ext == 'pmlz':
            log.debug('De-compressing content to temporary directory...')
            with TemporaryDirectory('_unpmlz') as tdir:
                zf = ZipFile(stream)
                zf.extractall(tdir)

                pmls = glob.glob(os.path.join(tdir, '*.pml'))
                for pml in pmls:
                    html_name = os.path.splitext(
                        os.path.basename(pml))[0] + '.html'
                    html_path = os.path.join(os.getcwd(), html_name)

                    pages.append(html_name)
                    log.debug('Processing PML item %s...', pml)
                    ttoc = self.process_pml(pml, html_path)
                    toc += ttoc
                images = self.get_images(stream, tdir, True)
        else:
            toc = self.process_pml(stream, 'index.html')
            pages.append('index.html')

            if hasattr(stream, 'name'):
                images = self.get_images(
                    stream, os.path.abspath(os.path.dirname(stream.name)))

        # We want pages to be orded alphabetically.
        pages.sort()

        manifest_items = []
        for item in pages + images:
            manifest_items.append((item, None))

        from ebook_converter.ebooks.metadata.meta import get_metadata
        log.debug('Reading metadata from input file...')
        mi = get_metadata(stream, 'pml')
        if 'images/cover.png' in images:
            mi.cover = 'images/cover.png'
        opf = OPFCreator(os.getcwd(), mi)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest_items)
        opf.create_spine(pages)
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as opffile:
            with open('toc.ncx', 'wb') as tocfile:
                opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(os.getcwd(), 'metadata.opf')
    def extract(self, stream):
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                               ' more forgiving ZIP parser')
            from ebook_converter.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f
Exemple #4
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.chardet import xml_to_unicode
        from ebook_converter.ebooks.metadata.opf2 import OPF
        from ebook_converter.utils.zipfile import ZipFile

        self.log = log
        html = u''
        top_levels = []

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        for x in os.listdir(u'.'):
            if os.path.isfile(x):
                top_levels.append(x)
        # Try to find an index. file.
        for x in top_levels:
            if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                else:
                    multiple_html = True
        # Warn the user if there multiple HTML file in the archive. HTMLZ
        # supports a single HTML file. A conversion with a multiple HTML file
        # HTMLZ archive probably won't turn out as the user expects. With
        # Multiple HTML files ZIP input should be used in place of HTMLZ.
        if multiple_html:
            log.warn('Multiple HTML files found in the archive. Only %s will '
                     'be used.' % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception('No top level HTML file found.')

        if not html:
            raise Exception('Top level HTML file %s is empty' % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwd()
        htmlfile = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = u'index%d.html' % c
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from ebook_converter.customize.ui import get_file_type_metadata
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=os.getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            cdata = None
            with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(id,
                             href,
                             mimetypes.guess_type(cover_name)[0],
                             data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
Exemple #5
0
    def convert(self, recipe_or_file, opts, file_ext, log,
            accelerators):
        from ebook_converter.web.feeds.recipes import compile_recipe
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            from ebook_converter.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            zf.extractall()
            zf.close()
            with open('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                from ebook_converter.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                with open(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                from ebook_converter.web.feeds.recipes.collection import (
                        get_builtin_recipe_by_title, get_builtin_recipe_titles)
                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                    title = title.rpartition('.')[0]

                raw = get_builtin_recipe_by_title(title, log=log,
                        download_recipe=not opts.dont_download_recipe)
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        log.warn(
                        'Downloaded recipe needs calibre version at least: %s' %
                        ('.'.join(recipe.requires_version)))
                        builtin = True
                except:
                    log.exception('Failed to compile downloaded recipe. Falling '
                            'back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(title, log=log,
                            download_recipe=False)
                    if raw is None:
                        raise ValueError('Failed to find builtin recipe: '+title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')

            if recipe is None:
                raise ValueError('%r is not a valid recipe file or builtin recipe' %
                        recipe_or_file)

            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro

        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)
Exemple #6
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks import DRMError

        _path_or_stream = getattr(stream, 'name', 'stream')
        try:
            zf = ZipFile(stream)
            zf.extractall(os.getcwd())
        except Exception:
            log.exception('EPUB appears to be invalid ZIP file, trying a '
                          'more forgiving ZIP parser')
            from ebook_converter.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = self.find_opf()
        if opf is None:
            for root, _, fnames in os.walk('.'):
                for f in fnames:
                    f = os.path.join(root, f)
                    if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                            not os.path.basename(f).startswith('.'):
                        opf = os.path.abspath(f)
                        break

        if opf is None:
            raise ValueError('%s is not a valid EPUB file (could not find '
                             'opf)' % _path_or_stream)

        opf = os.path.relpath(opf, os.getcwd())
        parts = os.path.split(opf)
        opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(_path_or_stream))
        self.encrypted_fonts = self._encrypted_font_uris

        # NOTE(gryf): check if opf is nested on the directory(ies), if so,
        # update the links for guide and manifest.
        if len(parts) > 1 and parts[0]:
            path = os.path.join(parts[0])

            for elem in opf.itermanifest():
                elem.set('href', os.path.join(path, elem.get('href')))
            for elem in opf.iterguide():
                elem.set('href', os.path.join(path, elem.get('href')))

        if opf.package_version >= 3.0:
            f = self.rationalize_cover3
        else:
            f = self.rationalize_cover2
        self.removed_cover = f(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover, )
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if len(list(opf.iterspine())) == 0:
            raise ValueError('No valid entries in the spine of this EPUB')

        with open('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')
Exemple #7
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from ebook_converter.ebooks.chardet import detect
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = os.getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for root, _, fnames in os.walk('.'):
                for x in fnames:
                    x = os.path.join(root, x)
                    if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                        with open(x, 'rb') as tf:
                            txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info(
                    'File extension indicates particular formatting. '
                    'Forcing formatting type to: %s', options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s', ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding[
                'confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_',
                    '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                     'euc-cn', 'euccn', 'eucgb2312-cn',
                                     'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of '
                '%s%%', ienc, confidence * 100)
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect '
                'using %s', ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using '
                          'block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s',
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s',
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options,
                                              log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the   entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(
                        txt,
                        extensions=[
                            x.strip()
                            for x in options.markdown_extensions.split(',')
                            if x.strip()
                        ])
                except RuntimeError:
                    raise ValueError(
                        'This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                    )
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from ebook_converter.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            odi = options.debug_pipeline
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html',
                                     log, {})
            options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from ebook_converter.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb