Example #1
0
def bulk_defaults_for_input_format(fmt):
    plugin = plugin_for_input_format(fmt)
    if plugin is not None:
        w = config_widget_for_input_plugin(plugin)
        if w is not None:
            return load_defaults(w.COMMIT_NAME)
    return {}
Example #2
0
    def queue_convert_jobs(self, jobs, changed, bad, rows, previous,
            converted_func, extra_job_args=[], rows_are_ids=False):
        for func, args, desc, fmt, id, temp_files in jobs:
            func, _, parts = func.partition(':')
            parts = {x for x in parts.split(';')}
            input_file = args[0]
            input_fmt = os.path.splitext(input_file)[1]
            core_usage = 1
            if input_fmt:
                input_fmt = input_fmt[1:]
                plugin = plugin_for_input_format(input_fmt)
                if plugin is not None:
                    core_usage = plugin.core_usage

            if id not in bad:
                job = self.gui.job_manager.run_job(Dispatcher(converted_func),
                                            func, args=args, description=desc,
                                            core_usage=core_usage)
                job.conversion_of_same_fmt = 'same_fmt' in parts
                job.manually_fine_tune_toc = 'manually_fine_tune_toc' in parts
                args = [temp_files, fmt, id]+extra_job_args
                self.conversion_jobs[job] = tuple(args)

        if changed:
            m = self.gui.library_view.model()
            if rows_are_ids:
                m.refresh_ids(rows)
            else:
                m.refresh_rows(rows)
            current = self.gui.library_view.currentIndex()
            self.gui.library_view.model().current_changed(current, previous)
Example #3
0
def book_manifest(ctx, rd, book_id, fmt):
    db, library_id = get_library_data(ctx, rd)[:2]
    force_reload = rd.query.get('force_reload') == '1'
    if plugin_for_input_format(fmt) is None:
        raise HTTPNotFound('The format %s cannot be viewed' % fmt.upper())
    if book_id not in ctx.allowed_book_ids(rd, db):
        raise HTTPNotFound('No book with id: %s in library: %s' % (book_id, library_id))
    with db.safe_read_lock:
        fm = db.format_metadata(book_id, fmt)
        if not fm:
            raise HTTPNotFound('No %s format for the book (id:%s) in the library: %s' % (fmt, book_id, library_id))
        size, mtime = map(int, (fm['size'], time.mktime(fm['mtime'].utctimetuple())*10))
        bhash = book_hash(db.library_id, book_id, fmt, size, mtime)
        with cache_lock:
            mpath = abspath(os.path.join(books_cache_dir(), 'f', bhash, 'calibre-book-manifest.json'))
            if force_reload:
                safe_remove(mpath, True)
            try:
                os.utime(mpath, None)
                with lopen(mpath, 'rb') as f:
                    ans = jsonlib.load(f)
                ans['metadata'] = book_as_json(db, book_id)
                return ans
            except EnvironmentError as e:
                if e.errno != errno.ENOENT:
                    raise
            x = failed_jobs.pop(bhash, None)
            if x is not None:
                return {'aborted':x[0], 'traceback':x[1], 'job_status':'finished'}
            job_id = queued_jobs.get(bhash)
            if job_id is None:
                job_id = queue_job(ctx, partial(db.copy_format_to, book_id, fmt), bhash, fmt, book_id, size, mtime)
    status, result, tb, aborted = ctx.job_status(job_id)
    return {'aborted': aborted, 'traceback':tb, 'job_status':status, 'job_id':job_id}
Example #4
0
    def queue_convert_jobs(self, jobs, changed, bad, rows, previous,
            converted_func, extra_job_args=[], rows_are_ids=False):
        for func, args, desc, fmt, id, temp_files in jobs:
            func, _, parts = func.partition(':')
            parts = {x for x in parts.split(';')}
            input_file = args[0]
            input_fmt = os.path.splitext(input_file)[1]
            core_usage = 1
            if input_fmt:
                input_fmt = input_fmt[1:]
                plugin = plugin_for_input_format(input_fmt)
                if plugin is not None:
                    core_usage = plugin.core_usage

            if id not in bad:
                job = self.gui.job_manager.run_job(Dispatcher(converted_func),
                                            func, args=args, description=desc,
                                            core_usage=core_usage)
                job.conversion_of_same_fmt = 'same_fmt' in parts
                job.manually_fine_tune_toc = 'manually_fine_tune_toc' in parts
                args = [temp_files, fmt, id]+extra_job_args
                self.conversion_jobs[job] = tuple(args)

        if changed:
            m = self.gui.library_view.model()
            if rows_are_ids:
                m.refresh_ids(rows)
            else:
                m.refresh_rows(rows)
            current = self.gui.library_view.currentIndex()
            self.gui.library_view.model().current_changed(current, previous)
Example #5
0
    def write_mobi(self, input_plugin, output_path, kf8, resources):
        from calibre.ebooks.mobi.mobiml import MobiMLizer
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.customize.ui import plugin_for_input_format

        opts, oeb = self.opts, self.oeb
        if not opts.no_inline_toc:
            tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
                    opts.mobi_toc_at_start else 'end')
            tocadder(oeb, opts)
        mangler = CaseMangler()
        mangler(oeb, opts)
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb, opts)
        except Unavailable:
            self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
        else:
            # Add rasterized SVG images
            resources.add_extra_images()
        if hasattr(self.oeb, 'inserted_metadata_jacket'):
            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
        mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
        mobimlizer(oeb, opts)
        write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
        from calibre.ebooks.mobi.writer2.main import MobiWriter
        writer = MobiWriter(opts, resources, kf8,
                        write_page_breaks_after_item=write_page_breaks_after_item)
        writer(oeb, output_path)
        extract_mobi(output_path, opts)
Example #6
0
    def write_mobi(self, input_plugin, output_path, kf8, resources):
        from calibre.ebooks.mobi.mobiml import MobiMLizer
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.customize.ui import plugin_for_input_format

        opts, oeb = self.opts, self.oeb
        if not opts.no_inline_toc:
            tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
                    opts.mobi_toc_at_start else 'end')
            tocadder(oeb, opts)
        mangler = CaseMangler()
        mangler(oeb, opts)
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb, opts)
        except Unavailable:
            self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
        else:
            # Add rasterized SVG images
            resources.add_extra_images()
        if hasattr(self.oeb, 'inserted_metadata_jacket'):
            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
        mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
        mobimlizer(oeb, opts)
        write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
        from calibre.ebooks.mobi.writer2.main import MobiWriter
        writer = MobiWriter(opts, resources, kf8,
                        write_page_breaks_after_item=write_page_breaks_after_item)
        writer(oeb, output_path)
        extract_mobi(output_path, opts)
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = StringIO()
        ppdjvu = True
        # using djvutxt is MUCH faster, should make it an option
        if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
            from calibre.ptempfile import PersistentTemporaryFile
            try:
                fp = PersistentTemporaryFile(suffix='.djvu',
                                             prefix='djv_input')
                filename = fp._name
                fp.write(stream.read())
                fp.close()
                cmd = ['djvutxt', filename]
                stdout.write(
                    Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
                os.remove(filename)
                ppdjvu = False
            except:
                stream.seek(0)  # retry with the pure python converter
        if ppdjvu:
            from calibre.ebooks.djvu.djvu import DJVUFile
            x = DJVUFile(stream)
            x.get_text(stdout)

        html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html' % c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile.name, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
Example #8
0
def book_manifest(ctx, rd, book_id, fmt):
    db, library_id = get_library_data(ctx, rd)[:2]
    if plugin_for_input_format(fmt) is None:
        raise HTTPNotFound('The format %s cannot be viewed' % fmt.upper())
    if book_id not in ctx.allowed_book_ids(rd, db):
        raise HTTPNotFound('No book with id: %s in library: %s' % (book_id, library_id))
    with db.safe_read_lock:
        fm = db.format_metadata(book_id, fmt)
        if not fm:
            raise HTTPNotFound('No %s format for the book (id:%s) in the library: %s' % (fmt, book_id, library_id))
        size, mtime = map(int, (fm['size'], time.mktime(fm['mtime'].utctimetuple())*10))
        bhash = book_hash(db.library_id, book_id, fmt, size, mtime)
        with cache_lock:
            mpath = abspath(os.path.join(books_cache_dir(), 'f', bhash, 'calibre-book-manifest.json'))
            try:
                os.utime(mpath, None)
                with lopen(mpath, 'rb') as f:
                    ans = jsonlib.load(f)
                ans['metadata'] = book_as_json(db, book_id)
                return ans
            except EnvironmentError as e:
                if e.errno != errno.ENOENT:
                    raise
            x = failed_jobs.pop(bhash, None)
            if x is not None:
                return {'aborted':x[0], 'traceback':x[1], 'job_status':'finished'}
            job_id = queued_jobs.get(bhash)
            if job_id is None:
                job_id = queue_job(ctx, partial(db.copy_format_to, book_id, fmt), bhash, fmt, book_id, size, mtime)
    status, result, tb, aborted = ctx.job_status(job_id)
    return {'aborted': aborted, 'traceback':tb, 'job_status':status, 'job_id':job_id}
Example #9
0
def do_print():
    from calibre.customize.ui import plugin_for_input_format
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    data = msgpack_loads(stdin.read())
    ext = data['input'].lower().rpartition('.')[-1]
    input_plugin = plugin_for_input_format(ext)
    if input_plugin is None:
        raise ValueError('Not a supported file type: {}'.format(ext.upper()))
    args = [
        'ebook-convert', data['input'], data['output'], '--paper-size',
        data['paper_size'], '--pdf-add-toc', '--disable-remove-fake-margins',
        '--chapter-mark', 'none', '-vv'
    ]
    if input_plugin.is_image_collection:
        args.append('--no-process')
    else:
        args.append('--disable-font-rescaling')
        args.append('--page-breaks-before=/')
    if data['page_numbers']:
        args.append('--pdf-page-numbers')
    for edge in 'left top right bottom'.split():
        args.append('--pdf-page-margin-' + edge), args.append(
            '%.1f' % (data['margin_' + edge] * 72))
    from calibre.ebooks.conversion.cli import main
    main(args)
Example #10
0
def bulk_defaults_for_input_format(fmt):
    plugin = plugin_for_input_format(fmt)
    if plugin is not None:
        w = config_widget_for_input_plugin(plugin)
        if w is not None:
            return load_defaults(w.COMMIT_NAME)
    return {}
Example #11
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = StringIO()
        ppdjvu = True
        # using djvutxt is MUCH faster, should make it an option
        if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
            from calibre.ptempfile import PersistentTemporaryFile
            try:
                fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
                filename = fp._name
                fp.write(stream.read())
                fp.close()
                cmd = ['djvutxt', filename]
                stdout.write(Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
                os.remove(filename)
                ppdjvu = False
            except:
                stream.seek(0) # retry with the pure python converter
        if ppdjvu:
            from calibre.ebooks.djvu.djvu import DJVUFile
            x = DJVUFile(stream)
            x.get_text(stdout)

        html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html'%c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile.name, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
Example #12
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, unicode_type):
                tdir = tdir.decode(filesystem_encoding)
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            no_images = False  # options.no_images
            chm_name = stream.name
            # chm_data = stream.read()

            # closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            debug_dump = False
            odi = options.debug_pipeline
            if odi:
                debug_dump = os.path.join(odi, 'input')
            mainname = self._chmtohtml(tdir,
                                       chm_name,
                                       no_images,
                                       log,
                                       debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            try:
                metadata = get_metadata_from_reader(self._chm_reader)
            except Exception:
                log.exception('Failed to read metadata, using filename')
                from calibre.ebooks.metadata.book.base import Metadata
                metadata = Metadata(os.path.basename(chm_name))
            encoding = self._chm_reader.get_encoding(
            ) or options.input_encoding or 'cp1252'
            self._chm_reader.CloseCHM()
            # print((tdir, mainpath))
            # from calibre import ipython
            # ipython()

            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
            uenc = encoding
            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
                uenc = 'utf-8'
            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log,
                                            metadata)
            options.debug_pipeline = odi
            if toc.count() > 1:
                oeb.toc = self.parse_html_toc(oeb.spine[0])
                oeb.manifest.remove(oeb.spine[0])
                oeb.auto_generated_toc = False
        return oeb
Example #13
0
def options_for_input_fmt(fmt):
    from calibre.customize.ui import plugin_for_input_format
    fmt = fmt.lower()
    plugin = plugin_for_input_format(fmt)
    if plugin is None:
        return None, ()
    full_name = plugin.name.lower().replace(' ', '_')
    name = full_name.rpartition('_')[0]
    return full_name, OPTIONS['input'].get(name, ())
Example #14
0
def options_for_input_fmt(fmt):
    from calibre.customize.ui import plugin_for_input_format
    fmt = fmt.lower()
    plugin = plugin_for_input_format(fmt)
    if plugin is None:
        return None, ()
    full_name = plugin.name.lower().replace(' ', '_')
    name = full_name.rpartition('_')[0]
    return full_name, OPTIONS['input'].get(name, ())
Example #15
0
def opf_to_azw3(opf, outpath, log):
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    plumber = Plumber(opf, outpath, log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')
    plumber.opts.mobi_passthrough = True
    oeb = create_oebbook(log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, outpath, inp, plumber.opts, log)
Example #16
0
def do_rebuild(opf, dest_path):
    plumber = Plumber(opf, dest_path, default_log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')

    plumber.opts.mobi_passthrough = True
    oeb = create_oebbook(default_log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, dest_path, inp, plumber.opts, default_log)
Example #17
0
def opf_to_azw3(opf, outpath, log):
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    plumber = Plumber(opf, outpath, log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')
    plumber.opts.mobi_passthrough = True
    oeb = create_oebbook(log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, outpath, inp, plumber.opts, log)
Example #18
0
def do_rebuild(opf, dest_path):
    plumber = Plumber(opf, dest_path, default_log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')

    plumber.opts.mobi_passthrough = True
    oeb = create_oebbook(default_log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, dest_path, inp, plumber.opts, default_log)
Example #19
0
def book_manifest(ctx, rd, book_id, fmt):
    db, library_id = get_library_data(ctx, rd)[:2]
    force_reload = rd.query.get('force_reload') == '1'
    if plugin_for_input_format(fmt) is None:
        raise HTTPNotFound('The format %s cannot be viewed' % fmt.upper())
    if not ctx.has_id(rd, db, book_id):
        raise BookNotFound(book_id, db)
    with db.safe_read_lock:
        fm = db.format_metadata(book_id, fmt, allow_cache=False)
        if not fm:
            raise HTTPNotFound(
                'No %s format for the book (id:%s) in the library: %s' %
                (fmt, book_id, library_id))
        size, mtime = map(
            int, (fm['size'], time.mktime(fm['mtime'].utctimetuple()) * 10))
        bhash = book_hash(db.library_id, book_id, fmt, size, mtime)
        with cache_lock:
            mpath = abspath(
                os.path.join(books_cache_dir(), 'f', bhash,
                             'calibre-book-manifest.json'))
            if force_reload:
                safe_remove(mpath, True)
            try:
                os.utime(mpath, None)
                with lopen(mpath, 'rb') as f:
                    ans = jsonlib.load(f)
                ans['metadata'] = book_as_json(db, book_id)
                user = rd.username or None
                ans['last_read_positions'] = db.get_last_read_positions(
                    book_id, fmt, user) if user else []
                ans['annotations_map'] = db.annotations_map_for_book(
                    book_id, fmt, user_type='web', user=user or '*')
                return ans
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
            x = failed_jobs.pop(bhash, None)
            if x is not None:
                return {
                    'aborted': x[0],
                    'traceback': x[1],
                    'job_status': 'finished'
                }
            job_id = queued_jobs.get(bhash)
            if job_id is None:
                job_id = queue_job(ctx, partial(db.copy_format_to, book_id,
                                                fmt), bhash, fmt, book_id,
                                   size, mtime)
    status, result, tb, aborted = ctx.job_status(job_id)
    return {
        'aborted': aborted,
        'traceback': tb,
        'job_status': status,
        'job_id': job_id
    }
Example #20
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, unicode):
                tdir = tdir.decode(filesystem_encoding)
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            no_images = False  # options.no_images
            chm_name = stream.name
            # chm_data = stream.read()

            # closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            debug_dump = False
            odi = options.debug_pipeline
            if odi:
                debug_dump = os.path.join(odi, 'input')
            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                    debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            try:
                metadata = get_metadata_from_reader(self._chm_reader)
            except Exception:
                log.exception('Failed to read metadata, using filename')
                from calibre.ebooks.metadata.book.base import Metadata
                metadata = Metadata(os.path.basename(chm_name))
            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
            self._chm_reader.CloseCHM()
            # print tdir, mainpath
            # from calibre import ipython
            # ipython()

            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
            uenc = encoding
            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
                uenc = 'utf-8'
            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
            if toc.count() > 1:
                oeb.toc = self.parse_html_toc(oeb.spine[0])
                oeb.manifest.remove(oeb.spine[0])
                oeb.auto_generated_toc = False
        return oeb
Example #21
0
 def commit(self, outpath=None, keep_parsed=False):
     super(AZW3Container, self).commit(keep_parsed=keep_parsed)
     if outpath is None:
         outpath = self.pathtoazw3
     from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
     opf = self.name_path_map[self.opf_name]
     plumber = Plumber(opf, outpath, self.log)
     plumber.setup_options()
     inp = plugin_for_input_format('azw3')
     outp = plugin_for_output_format('azw3')
     plumber.opts.mobi_passthrough = True
     oeb = create_oebbook(default_log, opf, plumber.opts)
     set_cover(oeb)
     outp.convert(oeb, outpath, inp, plumber.opts, default_log)
Example #22
0
 def commit(self, outpath=None, keep_parsed=False):
     super(AZW3Container, self).commit(keep_parsed=keep_parsed)
     if outpath is None:
         outpath = self.pathtoazw3
     from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
     opf = self.name_path_map[self.opf_name]
     plumber = Plumber(opf, outpath, self.log)
     plumber.setup_options()
     inp = plugin_for_input_format('azw3')
     outp = plugin_for_output_format('azw3')
     plumber.opts.mobi_passthrough = True
     oeb = create_oebbook(default_log, opf, plumber.opts)
     set_cover(oeb)
     outp.convert(oeb, outpath, inp, plumber.opts, default_log)
Example #23
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, unicode):
                tdir = tdir.decode(filesystem_encoding)
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            no_images = False  #options.no_images
            chm_name = stream.name
            #chm_data = stream.read()

            #closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            debug_dump = False
            odi = options.debug_pipeline
            if odi:
                debug_dump = os.path.join(odi, 'input')
            mainname = self._chmtohtml(tdir,
                                       chm_name,
                                       no_images,
                                       log,
                                       debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            metadata = get_metadata_from_reader(self._chm_reader)
            self._chm_reader.CloseCHM()
            #print tdir
            #from calibre import ipython
            #ipython()

            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
            # try a custom conversion:
            #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
            # try using html converter:
            htmlpath = self._create_html_root(mainpath, log)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log,
                                            metadata)
            options.debug_pipeline = odi
            #log.debug('DEBUG: Not removing tempdir %s' % tdir)
        return oeb
Example #24
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError(
                'The DJVU file contains no text, only images, probably page scans.'
                ' calibre only supports conversion of DJVU files with actual text in them.'
            )

        html = convert_basic(
            raw_text.replace(b"\n", b' ').replace(b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwd()
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html' % c)
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
Example #25
0
    def extract_content(self, output_dir):
        self.log.info('Extracting PDF...')

        pdf = PersistentTemporaryFile('.pdf')
        pdf.close()
        pdf = open(pdf, 'wb')
        for x in xrange(self.header.section_count()):
            pdf.write(self.header.section_data(x))
        pdf.close()

        from calibre.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
        for opt in pdf_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        return pdf_plugin.convert(open(pdf, 'rb'), self.options, 'pdf', self.log, {})
Example #26
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.compression.tcr import decompress

        log.info('Decompressing text...')
        raw_txt = decompress(stream)

        log.info('Converting text to OEB...')
        stream = BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, options, 'txt', log, accelerators)
Example #27
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, unicode):
                tdir = tdir.decode(filesystem_encoding)
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            no_images = False #options.no_images
            chm_name = stream.name
            #chm_data = stream.read()

            #closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            debug_dump = False
            odi = options.debug_pipeline
            if odi:
                debug_dump = os.path.join(odi, 'input')
            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                    debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            metadata = get_metadata_from_reader(self._chm_reader)
            self._chm_reader.CloseCHM()
            #print tdir
            #from calibre import ipython
            #ipython()

            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
            # try a custom conversion:
            #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
            # try using html converter:
            htmlpath = self._create_html_root(mainpath, log)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
            #log.debug('DEBUG: Not removing tempdir %s' % tdir)
        return oeb
Example #28
0
def do_print():
    from calibre.customize.ui import plugin_for_input_format
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    data = msgpack_loads(stdin.read())
    ext = data['input'].lower().rpartition('.')[-1]
    input_plugin = plugin_for_input_format(ext)
    args = ['ebook-convert', data['input'], data['output'], '--paper-size', data['paper_size'], '--pdf-add-toc',
            '--disable-remove-fake-margins', '--chapter-mark', 'none', '-vv']
    if input_plugin.is_image_collection:
        args.append('--no-process')
    else:
        args.append('--disable-font-rescaling')
        args.append('--page-breaks-before=/')
    if data['page_numbers']:
        args.append('--pdf-page-numbers')
    for edge in 'left top right bottom'.split():
        args.append('--pdf-page-margin-' + edge), args.append('%.1f' % (data['margin_' + edge] * 72))
    from calibre.ebooks.conversion.cli import main
    main(args)
Example #29
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.compression.tcr import decompress

        log.info('Decompressing text...')
        raw_txt = decompress(stream)

        log.info('Converting text to OEB...')
        stream = BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, options,
                'txt', log, accelerators)
Example #30
0
    def extract_content(self, output_dir):
        raw_txt = b''

        self.log.info('Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            raw_txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
        stream = io.BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
Example #31
0
    def extract_content(self, output_dir):
        raw_txt = b''

        self.log.info('Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            raw_txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
        stream = io.BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
Example #32
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)

        html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = os.path.join(base, 'index%d.html'%c)
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile.name, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
Example #33
0
    def extract_content(self, output_dir):
        self.log.info('Extracting PDF from AZW4 Container...')

        self.stream.seek(0)
        raw_data = self.stream.read()
        data = ''
        mo = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL)
        if mo:
            data = mo.group()

        pdf_n = os.path.join(os.getcwdu(), 'tmp.pdf')
        with open(pdf_n, 'wb') as pdf:
            pdf.write(data)
        from calibre.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
        for opt in pdf_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        return pdf_plugin.convert(open(pdf_n, 'rb'), self.options, 'pdf', self.log, {})
Example #34
0
    def extract_content(self, output_dir):
        self.log.info('Extracting PDF from AZW4 Container...')

        self.stream.seek(0)
        raw_data = self.stream.read()
        data = b''
        mo = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL)
        if mo:
            data = mo.group()

        pdf_n = os.path.join(os.getcwd(), 'tmp.pdf')
        with open(pdf_n, 'wb') as pdf:
            pdf.write(data)
        from calibre.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
        for opt in pdf_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        return pdf_plugin.convert(open(pdf_n, 'rb'), self.options, 'pdf', self.log, {})
Example #35
0
    def convert_vtt_files(self):
        main_lang = self.main_lang_combo.currentData()
        if main_lang == '-':
            QMessageBox.about(self, 'Information',
                              'Select the main language before conversion.')
            return

        sub_lang = str(self.sub_lang_combo.currentData())
        # if user does not select sub_lang, set it to main_lang, so that when converting, it won't generate sub language.
        if sub_lang == '-':
            sub_lang = main_lang

        # convert to html
        if hasattr(self, 'cover_file_path'):
            cover_file_path = self.cover_file_path
        else:
            cover_file_path = None

        self.book_id = self.convert_to_html_add_to_library(
            self.vtt_dir, main_lang, sub_lang, cover_file_path)

        # add html to epub conversion job
        self.jobs, changed, bad = convert_single_ebook(
            self.gui,
            self.gui.library_view.model().db, [self.book_id], True,
            self.outputFmt)
        func, args, desc, fmt, id, temp_files = self.jobs[0]

        core_usage = 1
        plugin = plugin_for_input_format('html')
        if plugin is not None:
            core_usage = plugin.core_usage

        self.gui.job_manager.run_job(Dispatcher(self.converted_func),
                                     func,
                                     args=args,
                                     description=desc,
                                     core_usage=core_usage)

        self.close()
Example #36
0
    def __init__(self, input, output, log, report_progress=DummyReporter(),
            dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
            override_input_metadata=False):
        '''
        :param input: Path to input file.
        :param output: Path to output file/directory
        '''
        if isbytestring(input):
            input = input.decode(filesystem_encoding)
        if isbytestring(output):
            output = output.decode(filesystem_encoding)
        self.original_input_arg = input
        self.input = os.path.abspath(input)
        self.output = os.path.abspath(output)
        self.log = log
        self.ui_reporter = report_progress
        self.abort_after_input_dump = abort_after_input_dump
        self.override_input_metadata = override_input_metadata

        # Pipeline options {{{
        # Initialize the conversion options that are independent of input and
        # output formats. The input and output plugins can still disable these
        # options via recommendations.
        self.pipeline_options = [

OptionRecommendation(name='verbose',
            recommended_value=0, level=OptionRecommendation.LOW,
            short_switch='v',
            help=_('Level of verbosity. Specify multiple times for greater '
                   'verbosity.')
        ),

OptionRecommendation(name='debug_pipeline',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='d',
            help=_('Save the output from different stages of the conversion '
                   'pipeline to the specified '
                   'directory. Useful if you are unsure at which stage '
                   'of the conversion process a bug is occurring.')
        ),

OptionRecommendation(name='input_profile',
            recommended_value='default', level=OptionRecommendation.LOW,
            choices=[x.short_name for x in input_profiles()],
            help=_('Specify the input profile. The input profile gives the '
                   'conversion system information on how to interpret '
                   'various information in the input document. For '
                   'example resolution dependent lengths (i.e. lengths in '
                   'pixels). Choices are:')+\
                        ', '.join([x.short_name for x in input_profiles()])
        ),

OptionRecommendation(name='output_profile',
            recommended_value='default', level=OptionRecommendation.LOW,
            choices=[x.short_name for x in output_profiles()],
            help=_('Specify the output profile. The output profile '
                   'tells the conversion system how to optimize the '
                   'created document for the specified device. In some cases, '
                   'an output profile is required to produce documents that '
                   'will work on a device. For example EPUB on the SONY reader. '
                   'Choices are:') + \
                           ', '.join([x.short_name for x in output_profiles()])
        ),

OptionRecommendation(name='base_font_size',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The base font size in pts. All font sizes in the produced book '
                   'will be rescaled based on this size. By choosing a larger '
                   'size you can make the fonts in the output bigger and vice '
                   'versa. By default, the base font size is chosen based on '
                   'the output profile you chose.'
                   )
        ),

OptionRecommendation(name='font_size_mapping',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Mapping from CSS font names to font sizes in pts. '
                   'An example setting is 12,12,14,16,18,20,22,24. '
                   'These are the mappings for the sizes xx-small to xx-large, '
                   'with the final size being for huge fonts. The font '
                   'rescaling algorithm uses these sizes to intelligently '
                   'rescale fonts. The default is to use a mapping based on '
                   'the output profile you chose.'
                   )
        ),

OptionRecommendation(name='disable_font_rescaling',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Disable all rescaling of font sizes.'
                   )
        ),

OptionRecommendation(name='minimum_line_height',
            recommended_value=120.0, level=OptionRecommendation.LOW,
            help=_(
            'The minimum line height, as a percentage of the element\'s '
            'calculated font size. calibre will ensure that every element '
            'has a line height of at least this setting, irrespective of '
            'what the input document specifies. Set to zero to disable. '
            'Default is 120%. Use this setting in preference to '
            'the direct line height specification, unless you know what '
            'you are doing. For example, you can achieve "double spaced" '
            'text by setting this to 240.'
            )
        ),


OptionRecommendation(name='line_height',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_(
            'The line height in pts. Controls spacing between consecutive '
            'lines of text. Only applies to elements that do not define '
            'their own line height. In most cases, the minimum line height '
            'option is more useful. '
            'By default no line height manipulation is performed.'
            )
        ),

OptionRecommendation(name='linearize_tables',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Some badly designed documents use tables to control the '
                'layout of text on the page. When converted these documents '
                'often have text that runs off the page and other artifacts. '
                'This option will extract the content from the tables and '
                'present it in a linear fashion.'
                )
        ),

OptionRecommendation(name='level1_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('XPath expression that specifies all tags that '
            'should be added to the Table of Contents at level one. If '
            'this is specified, it takes precedence over other forms '
            'of auto-detection.'
            ' See the XPath Tutorial in the calibre User Manual for examples.'
                )
        ),

OptionRecommendation(name='level2_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('XPath expression that specifies all tags that should be '
            'added to the Table of Contents at level two. Each entry is added '
            'under the previous level one entry.'
            ' See the XPath Tutorial in the calibre User Manual for examples.'
                )
        ),

OptionRecommendation(name='level3_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('XPath expression that specifies all tags that should be '
                'added to the Table of Contents at level three. Each entry '
                'is added under the previous level two entry.'
            ' See the XPath Tutorial in the calibre User Manual for examples.'
                )
        ),

OptionRecommendation(name='use_auto_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally, if the source file already has a Table of '
            'Contents, it is used in preference to the auto-generated one. '
            'With this option, the auto-generated one is always used.'
                )
        ),

OptionRecommendation(name='no_chapters_in_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_("Don't add auto-detected chapters to the Table of "
            'Contents.'
                )
        ),

OptionRecommendation(name='toc_threshold',
            recommended_value=6, level=OptionRecommendation.LOW,
            help=_(
        'If fewer than this number of chapters is detected, then links '
        'are added to the Table of Contents. Default: %default')
        ),

OptionRecommendation(name='max_toc_links',
            recommended_value=50, level=OptionRecommendation.LOW,
            help=_('Maximum number of links to insert into the TOC. Set to 0 '
               'to disable. Default is: %default. Links are only added to the '
            'TOC if less than the threshold number of chapters were detected.'
                )
        ),

OptionRecommendation(name='toc_filter',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Remove entries from the Table of Contents whose titles '
            'match the specified regular expression. Matching entries and all '
            'their children are removed.'
                )
        ),

OptionRecommendation(name='duplicate_links_in_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('When creating a TOC from links in the input document, '
                'allow duplicate entries, i.e. allow more than one entry '
                'with the same text, provided that they point to a '
                'different location.')
        ),


OptionRecommendation(name='chapter',
        recommended_value="//*[((name()='h1' or name()='h2') and "
              r"re:test(., '\s*((chapter|book|section|part)\s+)|((prolog|prologue|epilogue)(\s+|$))', 'i')) or @class "
              "= 'chapter']", level=OptionRecommendation.LOW,
            help=_('An XPath expression to detect chapter titles. The default '
                'is to consider <h1> or <h2> tags that contain the words '
                '"chapter","book","section", "prologue", "epilogue", or "part" as chapter titles as '
                'well as any tags that have class="chapter". The expression '
                'used must evaluate to a list of elements. To disable chapter '
                'detection, use the expression "/". See the XPath Tutorial '
                'in the calibre User Manual for further help on using this '
                'feature.'
                )
        ),

OptionRecommendation(name='chapter_mark',
            recommended_value='pagebreak', level=OptionRecommendation.LOW,
            choices=['pagebreak', 'rule', 'both', 'none'],
            help=_('Specify how to mark detected chapters. A value of '
                    '"pagebreak" will insert page breaks before chapters. '
                    'A value of "rule" will insert a line before chapters. '
                    'A value of "none" will disable chapter marking and a '
                    'value of "both" will use both page breaks and lines '
                    'to mark chapters.')
        ),

OptionRecommendation(name='extra_css',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Either the path to a CSS stylesheet or raw CSS. '
                'This CSS will be appended to the style rules from '
                'the source file, so it can be used to override those '
                'rules.')
        ),

OptionRecommendation(name='filter_css',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('A comma separated list of CSS properties that '
                'will be removed from all CSS style rules. This is useful '
                'if the presence of some style information prevents it '
                'from being overridden on your device. '
                'For example: '
                'font-family,color,margin-left,margin-right')
        ),

OptionRecommendation(name='page_breaks_before',
            recommended_value="//*[name()='h1' or name()='h2']",
            level=OptionRecommendation.LOW,
            help=_('An XPath expression. Page breaks are inserted '
            'before the specified elements.')
        ),

OptionRecommendation(name='remove_fake_margins',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Some documents specify page margins by '
                'specifying a left and right margin on each individual '
                'paragraph. calibre will try to detect and remove these '
                'margins. Sometimes, this can cause the removal of '
                'margins that should not have been removed. In this '
                'case you can disable the removal.')
        ),


OptionRecommendation(name='margin_top',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the top margin in pts. Default is %default. '
            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

OptionRecommendation(name='margin_bottom',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the bottom margin in pts. Default is %default. '
            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

OptionRecommendation(name='margin_left',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the left margin in pts. Default is %default. '
            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

OptionRecommendation(name='margin_right',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the right margin in pts. Default is %default. '
            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

OptionRecommendation(name='change_justification',
        recommended_value='original', level=OptionRecommendation.LOW,
        choices=['left','justify','original'],
        help=_('Change text justification. A value of "left" converts all'
            ' justified text in the source to left aligned (i.e. '
            'unjustified) text. A value of "justify" converts all '
            'unjustified text to justified. A value of "original" '
            '(the default) does not change justification in the '
            'source file. Note that only some output formats support '
            'justification.')),

OptionRecommendation(name='remove_paragraph_spacing',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Remove spacing between paragraphs. Also sets an indent on '
        'paragraphs of 1.5em. Spacing removal will not work '
        'if the source file does not use paragraphs (<p> or <div> tags).')
        ),

OptionRecommendation(name='remove_paragraph_spacing_indent_size',
        recommended_value=1.5, level=OptionRecommendation.LOW,
        help=_('When calibre removes blank lines between paragraphs, it automatically '
            'sets a paragraph indent, to ensure that paragraphs can be easily '
            'distinguished. This option controls the width of that indent (in em). '
            'If you set this value negative, then the indent specified in the input '
            'document is used, that is, calibre does not change the indentation.')
        ),

OptionRecommendation(name='prefer_metadata_cover',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Use the cover detected from the source file in preference '
        'to the specified cover.')
        ),

OptionRecommendation(name='insert_blank_line',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Insert a blank line between paragraphs. Will not work '
            'if the source file does not use paragraphs (<p> or <div> tags).'
            )
        ),

OptionRecommendation(name='insert_blank_line_size',
        recommended_value=0.5, level=OptionRecommendation.LOW,
        help=_('Set the height of the inserted blank lines (in em).'
            ' The height of the lines between paragraphs will be twice the value'
            ' set here.')
        ),

OptionRecommendation(name='remove_first_image',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Remove the first image from the input ebook. Useful if the '
        'input document has a cover image that is not identified as a cover. '
        'In this case, if you set a cover in calibre, the output document will '
        'end up with two cover images if you do not specify this option.'
            )
        ),

OptionRecommendation(name='insert_metadata',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Insert the book metadata at the start of '
            'the book. This is useful if your ebook reader does not support '
            'displaying/searching metadata directly.'
            )
        ),

OptionRecommendation(name='smarten_punctuation',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Convert plain quotes, dashes and ellipsis to their '
            'typographically correct equivalents. For details, see '
            'http://daringfireball.net/projects/smartypants'
            )
        ),

OptionRecommendation(name='unsmarten_punctuation',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Convert fancy quotes, dashes and ellipsis to their '
               'plain equivalents.'
            )
        ),

OptionRecommendation(name='read_metadata_from_opf',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='m',
            help=_('Read metadata from the specified OPF file. Metadata read '
                   'from this file will override any metadata in the source '
                   'file.')
        ),

OptionRecommendation(name='asciiize',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=(_('Transliterate unicode characters to an ASCII '
            'representation. Use with care because this will replace '
            'unicode characters with ASCII. For instance it will replace "%s" '
            'with "Mikhail Gorbachiov". Also, note that in '
            'cases where there are multiple representations of a character '
            '(characters shared by Chinese and Japanese for instance) the '
            'representation based on the current calibre interface language will be '
            'used.')%\
            u'\u041c\u0438\u0445\u0430\u0438\u043b '
            u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
)
        ),

OptionRecommendation(name='keep_ligatures',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Preserve ligatures present in the input document. '
                'A ligature is a special rendering of a pair of '
                'characters like ff, fi, fl et cetera. '
                'Most readers do not have support for '
                'ligatures in their default fonts, so they are '
                'unlikely to render correctly. By default, calibre '
                'will turn a ligature into the corresponding pair of normal '
                'characters. This option will preserve them instead.')
        ),

OptionRecommendation(name='title',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the title.')),

OptionRecommendation(name='authors',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the authors. Multiple authors should be separated by '
    'ampersands.')),

OptionRecommendation(name='title_sort',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('The version of the title to be used for sorting. ')),

OptionRecommendation(name='author_sort',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('String to be used when sorting by author. ')),

OptionRecommendation(name='cover',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the cover to the specified file or URL')),

OptionRecommendation(name='comments',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the ebook description.')),

OptionRecommendation(name='publisher',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the ebook publisher.')),

OptionRecommendation(name='series',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the series this ebook belongs to.')),

OptionRecommendation(name='series_index',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the index of the book in this series.')),

OptionRecommendation(name='rating',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the rating. Should be a number between 1 and 5.')),

OptionRecommendation(name='isbn',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the ISBN of the book.')),

OptionRecommendation(name='tags',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the tags for the book. Should be a comma separated list.')),

OptionRecommendation(name='book_producer',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the book producer.')),

OptionRecommendation(name='language',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the language.')),

OptionRecommendation(name='pubdate',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the publication date.')),

OptionRecommendation(name='timestamp',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the book timestamp (no longer used anywhere)')),

OptionRecommendation(name='enable_heuristics',
    recommended_value=False, level=OptionRecommendation.LOW,
    help=_('Enable heuristic processing. This option must be set for any '
           'heuristic processing to take place.')),

OptionRecommendation(name='markup_chapter_headings',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Detect unformatted chapter headings and sub headings. Change '
           'them to h2 and h3 tags.  This setting will not create a TOC, '
           'but can be used in conjunction with structure detection to create '
           'one.')),

OptionRecommendation(name='italicize_common_cases',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Look for common words and patterns that denote '
           'italics and italicize them.')),

OptionRecommendation(name='fix_indents',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Turn indentation created from multiple non-breaking space entities '
           'into CSS indents.')),

OptionRecommendation(name='html_unwrap_factor',
    recommended_value=0.40, level=OptionRecommendation.LOW,
    help=_('Scale used to determine the length at which a line should '
            'be unwrapped. Valid values are a decimal between 0 and 1. The '
            'default is 0.4, just below the median line length.  If only a '
            'few lines in the document require unwrapping this value should '
            'be reduced')),

OptionRecommendation(name='unwrap_lines',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Unwrap lines using punctuation and other formatting clues.')),

OptionRecommendation(name='delete_blank_paragraphs',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Remove empty paragraphs from the document when they exist between '
           'every other paragraph')),

OptionRecommendation(name='format_scene_breaks',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Left aligned scene break markers are center aligned. '
           'Replace soft scene breaks that use multiple blank lines with '
           'horizontal rules.')),

OptionRecommendation(name='replace_scene_breaks',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Replace scene breaks with the specified text. By default, the '
        'text from the input document is used.')),

OptionRecommendation(name='dehyphenate',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Analyze hyphenated words throughout the document.  The '
           'document itself is used as a dictionary to determine whether hyphens '
           'should be retained or removed.')),

OptionRecommendation(name='renumber_headings',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
           'The tags are renumbered to prevent splitting in the middle '
           'of chapter headings.')),

OptionRecommendation(name='sr1_search',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Search pattern (regular expression) to be replaced with '
           'sr1-replace.')),

OptionRecommendation(name='sr1_replace',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Replacement to replace the text found with sr1-search.')),

OptionRecommendation(name='sr2_search',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Search pattern (regular expression) to be replaced with '
           'sr2-replace.')),

OptionRecommendation(name='sr2_replace',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Replacement to replace the text found with sr2-search.')),

OptionRecommendation(name='sr3_search',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Search pattern (regular expression) to be replaced with '
           'sr3-replace.')),

OptionRecommendation(name='sr3_replace',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Replacement to replace the text found with sr3-search.')),

OptionRecommendation(name='search_replace',
    recommended_value=None, level=OptionRecommendation.LOW, help=_(
        'Path to a file containing search and replace regular expressions. '
        'The file must contain alternating lines of regular expression '
        'followed by replacement pattern (which can be an empty line). '
        'The regular expression must be in the python regex syntax and '
        'the file must be UTF-8 encoded.')),
]
        # }}}

        input_fmt = os.path.splitext(self.input)[1]
        if not input_fmt:
            raise ValueError('Input file must have an extension')
        input_fmt = input_fmt[1:].lower().replace('original_', '')
        self.archive_input_tdir = None
        if input_fmt in ARCHIVE_FMTS:
            self.log('Processing archive...')
            tdir = PersistentTemporaryDirectory('_plumber_archive')
            self.input, input_fmt = self.unarchive(self.input, tdir)
            self.archive_input_tdir = tdir
        if os.access(self.input, os.R_OK):
            nfp = run_plugins_on_preprocess(self.input, input_fmt)
            if nfp != self.input:
                self.input = nfp
                input_fmt = os.path.splitext(self.input)[1]
                if not input_fmt:
                    raise ValueError('Input file must have an extension')
                input_fmt = input_fmt[1:].lower()

        if os.path.exists(self.output) and os.path.isdir(self.output):
            output_fmt = 'oeb'
        else:
            output_fmt = os.path.splitext(self.output)[1]
            if not output_fmt:
                output_fmt = '.oeb'
            output_fmt = output_fmt[1:].lower()

        self.input_plugin  = plugin_for_input_format(input_fmt)
        self.output_plugin = plugin_for_output_format(output_fmt)

        if self.input_plugin is None:
            raise ValueError('No plugin to handle input format: '+input_fmt)

        if self.output_plugin is None:
            raise ValueError('No plugin to handle output format: '+output_fmt)

        self.input_fmt = input_fmt
        self.output_fmt = output_fmt


        self.all_format_options = set()
        self.input_options = set()
        self.output_options = set()
        # Build set of all possible options. Two options are equal if their
        # names are the same.
        if not dummy:
            self.input_options  = self.input_plugin.options.union(
                                        self.input_plugin.common_options)
            self.output_options = self.output_plugin.options.union(
                                    self.output_plugin.common_options)
        else:
            for fmt in available_input_formats():
                input_plugin = plugin_for_input_format(fmt)
                if input_plugin:
                    self.all_format_options = self.all_format_options.union(
                        input_plugin.options.union(input_plugin.common_options))
            for fmt in available_output_formats():
                output_plugin = plugin_for_output_format(fmt)
                if output_plugin:
                    self.all_format_options = self.all_format_options.union(
                        output_plugin.options.union(output_plugin.common_options))

        # Remove the options that have been disabled by recommendations from the
        # plugins.
        for w in ('input_options', 'output_options',
                'all_format_options'):
            temp = set([])
            for x in getattr(self, w):
                temp.add(x.clone())
            setattr(self, w, temp)
        if merge_plugin_recs:
            self.merge_plugin_recommendations()
Example #37
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = ''
        log.debug('Reading text from file...')
        length = 0

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + '\n\n'
        else:
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s' %
                         options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt)
            det_encoding, confidence = det_encoding['encoding'], det_encoding[
                'confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_',
                    '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                     'euc-cn', 'euccn', 'eucgb2312-cn',
                                     'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of %s%%' %
                (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect using %s'
                % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug(
                    'Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' %
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' %
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options,
                                              log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[
                        x.strip()
                        for x in options.markdown_extensions.split(',')
                        if x.strip()
                    ])
            except RuntimeError:
                raise ValueError(
                    'This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                )
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html' % c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html',
                                 log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb
Example #38
0
    def __init__(self,
                 book_fmt,
                 opfpath,
                 input_fmt,
                 tdir,
                 log=None,
                 book_hash=None,
                 save_bookmark_data=False,
                 book_metadata=None,
                 allow_no_cover=True,
                 virtualize_resources=True):
        log = log or default_log
        self.allow_no_cover = allow_no_cover
        ContainerBase.__init__(self, tdir, opfpath, log)
        self.book_metadata = book_metadata
        input_plugin = plugin_for_input_format(input_fmt)
        self.is_comic = bool(
            getattr(input_plugin, 'is_image_collection', False))
        if save_bookmark_data:
            bm_file = 'META-INF/calibre_bookmarks.txt'
            self.bookmark_data = None
            if self.exists(bm_file):
                with self.open(bm_file, 'rb') as f:
                    self.bookmark_data = f.read()
        # We do not add zero byte sized files as the IndexedDB API in the
        # browser has no good way to distinguish between zero byte files and
        # load failures.
        excluded_names = {
            name
            for name, mt in iteritems(self.mime_map) if name == self.opf_name
            or mt == guess_type('a.ncx') or name.startswith('META-INF/')
            or name == 'mimetype' or not self.has_name_and_is_not_empty(name)
        }
        raster_cover_name, titlepage_name = self.create_cover_page(
            input_fmt.lower())

        toc = get_toc(self).to_dict(count())
        if not toc or not toc.get('children'):
            toc = from_xpaths(self,
                              ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
        spine = [name for name, is_linear in self.spine_names]
        spineq = frozenset(spine)
        landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc': toc,
            'book_format': book_fmt,
            'spine': spine,
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': self.is_comic,
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
            'toc_anchor_map': toc_anchor_map(toc),
            'landmarks': landmarks,
            'link_to_map': {},
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name), self.dirty(name)
        self.virtualized_names = set()
        self.transform_all(virtualize_resources)

        def manifest_data(name):
            mt = (self.mime_map.get(name)
                  or 'application/octet-stream').lower()
            ans = {
                'size': os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype': mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = l = get_length(root)
                self.book_render_data['total_length'] += l
                if name in data['spine']:
                    self.book_render_data['spine_length'] += l
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
                ans['anchor_map'] = anchor_map(root)
            return ans

        data['files'] = {
            name: manifest_data(name)
            for name in set(self.name_path_map) - excluded_names
        }
        self.commit()
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        data = json.dumps(self.book_render_data, ensure_ascii=False)
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'),
                   'wb') as f:
            f.write(data)
Example #39
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = ''
        log.debug('Reading text from file...')
        length = 0

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + '\n\n'
        else:
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
            except RuntimeError:
                raise ValueError('This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html'%c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb
Example #40
0
    def extract_content(self, output_dir):
        # Each text record is independent (unless the continuation
        # value is set in the previous record). Put each converted
        # text recored into a separate file. We will reference the
        # home.html file as the first file and let the HTML input
        # plugin assemble the order based on hyperlinks.
        with CurrentDir(output_dir):
            for uid, num in self.uid_text_secion_number.items():
                self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
                with open('%s.html' % uid, 'wb') as htmlf:
                    html = u'<html><body>'
                    section_header, section_data = self.sections[num]
                    if section_header.type == DATATYPE_PHTML:
                        html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                    elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                        d = self.decompress_phtml(section_data.data)
                        html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
                    html += '</body></html>'
                    htmlf.write(html.encode('utf-8'))

        # Images.
        # Cache the image sizes in case they are used by a composite image.
        images = set()
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        with CurrentDir(os.path.join(output_dir, 'images/')):
            # Single images.
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if section_data:
                    idata = None
                    if section_header.type == DATATYPE_TBMP:
                        idata = section_data
                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                        if self.header_record.compression == 1:
                            idata = decompress_doc(section_data)
                        elif self.header_record.compression == 2:
                            idata = zlib.decompress(section_data)
                    try:
                        save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                        images.add(uid)
                        self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                    except Exception as e:
                        self.log.error('Failed to write image with uid %s: %s' % (uid, e))
                else:
                    self.log.error('Failed to write image with uid %s: No data.' % uid)
            # Composite images.
            # We're going to use the already compressed .jpg images here.
            for uid, num in self.uid_composite_image_section_number.items():
                try:
                    section_header, section_data = self.sections[num]
                    # Get the final width and height.
                    width = 0
                    height = 0
                    for row in section_data.layout:
                        row_width = 0
                        col_height = 0
                        for col in row:
                            if col not in images:
                                raise Exception('Image with uid: %s missing.' % col)
                            w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:]
                            row_width += w
                            if col_height < h:
                                col_height = h
                        if width < row_width:
                            width = row_width
                        height += col_height
                    # Create a new image the total size of all image
                    # parts. Put the parts into the new image.
                    with Canvas(width, height) as canvas:
                        y_off = 0
                        for row in section_data.layout:
                            x_off = 0
                            largest_height = 0
                            for col in row:
                                im = image_from_data(lopen('%s.jpg' % col, 'rb').read())
                                canvas.compose(im, x_off, y_off)
                                w, h = im.width(), im.height()
                                x_off += w
                                if largest_height < h:
                                    largest_height = h
                            y_off += largest_height
                    with lopen('%s.jpg' % uid) as out:
                        out.write(canvas.export(compression_quality=70))
                    self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(self.options, opt.option.name, opt.recommended_value)
        self.options.input_encoding = 'utf-8'
        odi = self.options.debug_pipeline
        self.options.debug_pipeline = None
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata recored. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                home_html = self.uid_text_secion_number.items()[0][0]
        except:
            raise Exception('Could not determine home.html')
        # Generate oeb from html conversion.
        oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
        self.options.debug_pipeline = odi

        return oeb
Example #41
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.utils.zipfile import ZipFile

        self.log = log
        html = u''
        top_levels = []

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        for x in os.listdir(u'.'):
            if os.path.isfile(x):
                top_levels.append(x)
        # Try to find an index. file.
        for x in top_levels:
            if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                else:
                    multiple_html = True
        # Warn the user if there multiple HTML file in the archive. HTMLZ
        # supports a single HTML file. A conversion with a multiple HTML file
        # HTMLZ archive probably won't turn out as the user expects. With
        # Multiple HTML files ZIP input should be used in place of HTMLZ.
        if multiple_html:
            log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception(_('No top level HTML file found.'))

        if not html:
            raise Exception(_('Top level HTML file %s is empty') % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        fname = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = u'index%d.html'%c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=os.getcwdu())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            cdata = None
            with open(os.path.join(os.getcwdu(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
Example #42
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.utils.zipfile import ZipFile

        self.log = log
        html = u''
        top_levels = []

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        for x in os.listdir(u'.'):
            if os.path.isfile(x):
                top_levels.append(x)
        # Try to find an index. file.
        for x in top_levels:
            if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml',
                                                  u'.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                else:
                    multiple_html = True
        # Warn the user if there multiple HTML file in the archive. HTMLZ
        # supports a single HTML file. A conversion with a multiple HTML file
        # HTMLZ archive probably won't turn out as the user expects. With
        # Multiple HTML files ZIP input should be used in place of HTMLZ.
        if multiple_html:
            log.warn(
                _('Multiple HTML files found in the archive. Only %s will be used.'
                  ) % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception(_('No top level HTML file found.'))

        if not html:
            raise Exception(_('Top level HTML file %s is empty') % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        fname = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = u'index%d.html' % c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html',
                                 log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            cdata = None
            with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
Example #43
0
    def extract_content(self, output_dir):
        # Each text record is independent (unless the continuation
        # value is set in the previous record). Put each converted
        # text recored into a separate file. We will reference the
        # home.html file as the first file and let the HTML input
        # plugin assemble the order based on hyperlinks.
        with CurrentDir(output_dir):
            for uid, num in self.uid_text_secion_number.items():
                self.log.debug('Writing record with uid: %s as %s.html' %
                               (uid, uid))
                with open('%s.html' % uid, 'wb') as htmlf:
                    html = u'<html><body>'
                    section_header, section_data = self.sections[num]
                    if section_header.type == DATATYPE_PHTML:
                        html += self.process_phtml(
                            section_data.data,
                            section_data.header.paragraph_offsets)
                    elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                        d = self.decompress_phtml(section_data.data)
                        html += self.process_phtml(
                            d, section_data.header.paragraph_offsets).decode(
                                self.get_text_uid_encoding(section_header.uid),
                                'replace')
                    html += '</body></html>'
                    htmlf.write(html.encode('utf-8'))

        # Images.
        # Cache the image sizes in case they are used by a composite image.
        images = set()
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        with CurrentDir(os.path.join(output_dir, 'images/')):
            # Single images.
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if section_data:
                    idata = None
                    if section_header.type == DATATYPE_TBMP:
                        idata = section_data
                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                        if self.header_record.compression == 1:
                            idata = decompress_doc(section_data)
                        elif self.header_record.compression == 2:
                            idata = zlib.decompress(section_data)
                    try:
                        save_cover_data_to(idata,
                                           '%s.jpg' % uid,
                                           compression_quality=70)
                        images.add(uid)
                        self.log.debug(
                            'Wrote image with uid %s to images/%s.jpg' %
                            (uid, uid))
                    except Exception as e:
                        self.log.error(
                            'Failed to write image with uid %s: %s' % (uid, e))
                else:
                    self.log.error(
                        'Failed to write image with uid %s: No data.' % uid)
            # Composite images.
            # We're going to use the already compressed .jpg images here.
            for uid, num in self.uid_composite_image_section_number.items():
                try:
                    section_header, section_data = self.sections[num]
                    # Get the final width and height.
                    width = 0
                    height = 0
                    for row in section_data.layout:
                        row_width = 0
                        col_height = 0
                        for col in row:
                            if col not in images:
                                raise Exception('Image with uid: %s missing.' %
                                                col)
                            w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:]
                            row_width += w
                            if col_height < h:
                                col_height = h
                        if width < row_width:
                            width = row_width
                        height += col_height
                    # Create a new image the total size of all image
                    # parts. Put the parts into the new image.
                    with Canvas(width, height) as canvas:
                        y_off = 0
                        for row in section_data.layout:
                            x_off = 0
                            largest_height = 0
                            for col in row:
                                im = image_from_data(
                                    lopen('%s.jpg' % col, 'rb').read())
                                canvas.compose(im, x_off, y_off)
                                w, h = im.width(), im.height()
                                x_off += w
                                if largest_height < h:
                                    largest_height = h
                            y_off += largest_height
                    with lopen('%s.jpg' % uid) as out:
                        out.write(canvas.export(compression_quality=70))
                    self.log.debug(
                        'Wrote composite image with uid %s to images/%s.jpg' %
                        (uid, uid))
                except Exception as e:
                    self.log.error(
                        'Failed to write composite image with uid %s: %s' %
                        (uid, e))

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(self.options, opt.option.name, opt.recommended_value)
        self.options.input_encoding = 'utf-8'
        odi = self.options.debug_pipeline
        self.options.debug_pipeline = None
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata recored. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                home_html = self.uid_text_secion_number.items()[0][0]
        except:
            raise Exception('Could not determine home.html')
        # Generate oeb from html conversion.
        oeb = html_input.convert(open('%s.html' % home_html, 'rb'),
                                 self.options, 'html', self.log, {})
        self.options.debug_pipeline = odi

        return oeb
Example #44
0
def process_exploded_book(book_fmt,
                          opfpath,
                          input_fmt,
                          tdir,
                          render_manager,
                          log=None,
                          book_hash=None,
                          save_bookmark_data=False,
                          book_metadata=None,
                          virtualize_resources=True):
    log = log or default_log
    container = SimpleContainer(tdir, opfpath, log)
    input_plugin = plugin_for_input_format(input_fmt)
    is_comic = bool(getattr(input_plugin, 'is_image_collection', False))

    def needs_work(mt):
        return mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'

    def work_priority(name):
        # ensure workers with large files or stylesheets
        # have the less names
        size = os.path.getsize(container.name_path_map[name]),
        is_html = container.mime_map.get(name) in OEB_DOCS
        return (0 if is_html else 1), size

    if not is_comic:
        render_manager.launch_workers(
            tuple(n for n, mt in iteritems(container.mime_map)
                  if needs_work(mt)), container)

    bookmark_data = None
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        if container.exists(bm_file):
            with container.open(bm_file, 'rb') as f:
                bookmark_data = f.read()

    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name
        for name, mt in iteritems(container.mime_map)
        if name == container.opf_name or mt == guess_type('a.ncx')
        or name.startswith('META-INF/') or name == 'mimetype'
        or not container.has_name_and_is_not_empty(name)
    }
    raster_cover_name, titlepage_name = create_cover_page(
        container, input_fmt.lower(), is_comic, book_metadata)

    toc = get_toc(container, verify_destinations=False).to_dict(count())
    if not toc or not toc.get('children'):
        toc = from_xpaths(container,
                          ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in container.spine_names]
    spineq = frozenset(spine)
    landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]

    book_render_data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }

    names = sorted(
        (n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
        key=work_priority)

    results = render_manager(
        names,
        (tdir, opfpath, virtualize_resources, book_render_data['link_uid'],
         container.data_for_clone()), container)
    ltm = book_render_data['link_to_map']
    html_data = {}
    virtualized_names = set()

    def merge_ltm(dest, src):
        for k, v in iteritems(src):
            if k in dest:
                dest[k] |= v
            else:
                dest[k] = v

    for link_to_map, hdata, vnames in results:
        html_data.update(hdata)
        virtualized_names |= vnames
        for k, v in iteritems(link_to_map):
            if k in ltm:
                merge_ltm(ltm[k], v)
            else:
                ltm[k] = v

    def manifest_data(name):
        mt = (container.mime_map.get(name)
              or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(container.name_path_map[name]),
            'is_virtualized': name in virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            data = html_data[name]
            ans['length'] = l = data['length']
            book_render_data['total_length'] += l
            if name in book_render_data['spine']:
                book_render_data['spine_length'] += l
            ans['has_maths'] = hm = data['has_maths']
            if hm:
                book_render_data['has_maths'] = True
            ans['anchor_map'] = data['anchor_map']
        return ans

    book_render_data['files'] = {
        name: manifest_data(name)
        for name in set(container.name_path_map) - excluded_names
    }
    container.commit()

    for name in excluded_names:
        os.remove(container.name_path_map[name])

    ltm = book_render_data['link_to_map']
    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization

    data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
    with lopen(os.path.join(container.root, 'calibre-book-manifest.json'),
               'wb') as f:
        f.write(data)

    return container, bookmark_data
Example #45
0
    def create_cover_page(self, input_fmt):
        templ = '''
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
        <head><style>
        html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
        img {
            width: 100%%; height: 100%%;
            object-fit: contain;
            margin-left: auto; margin-right: auto;
            max-width: 100vw; max-height: 100vh;
            top: 50vh; transform: translateY(-50%%);
            position: relative;
        }
        body.cover-fill img { object-fit: fill; }
        </style></head><body><img src="%s"/></body></html>
        '''

        def generic_cover():
            if self.book_metadata is not None:
                from calibre.ebooks.covers import create_cover
                mi = self.book_metadata
                return create_cover(mi.title, mi.authors, mi.series,
                                    mi.series_index)
            return BLANK_JPEG

        if input_fmt == 'epub':

            def image_callback(cover_image, wrapped_image):
                if cover_image:
                    image_callback.cover_data = self.raw_data(cover_image,
                                                              decode=False)
                if wrapped_image and not getattr(image_callback, 'cover_data',
                                                 None):
                    image_callback.cover_data = self.raw_data(wrapped_image,
                                                              decode=False)

            def cover_path(action, data):
                if action == 'write_image':
                    cdata = getattr(image_callback, 'cover_data',
                                    None) or generic_cover()
                    data.write(cdata)

            if self.allow_no_cover and not has_epub_cover(self):
                return None, None
            raster_cover_name, titlepage_name = set_epub_cover(
                self,
                cover_path, (lambda *a: None),
                options={'template': templ},
                image_callback=image_callback)
        else:
            raster_cover_name = find_cover_image(self, strict=True)
            if raster_cover_name is None:
                if self.allow_no_cover:
                    return None, None
                item = self.generate_item(name='cover.jpeg', id_prefix='cover')
                raster_cover_name = self.href_to_name(item.get('href'),
                                                      self.opf_name)
                with self.open(raster_cover_name, 'wb') as dest:
                    dest.write(generic_cover())
            input_plugin = plugin_for_input_format(input_fmt)
            if getattr(input_plugin, 'is_image_collection', False):
                return raster_cover_name, None
            item = self.generate_item(name='titlepage.html',
                                      id_prefix='titlepage')
            titlepage_name = self.href_to_name(item.get('href'), self.opf_name)
            raw = templ % prepare_string_for_xml(
                self.name_to_href(raster_cover_name, titlepage_name), True)
            with self.open(titlepage_name, 'wb') as f:
                f.write(raw.encode('utf-8'))
            spine = self.opf_xpath('//opf:spine')[0]
            ref = spine.makeelement(OPF('itemref'), idref=item.get('id'))
            self.insert_into_xml(spine, ref, index=0)
            self.dirty(self.opf_name)
        return raster_cover_name, titlepage_name