Esempi in Python per DJVUFile.get_text, esempi in Python per calibre.ebooks.djvu.djvu.DJVUFile.get_text

Esempio n. 1

0

Mostra file

File: djvu_input.py Progetto: syn-gowthamsrungarapu/calibre

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = StringIO()
        ppdjvu = True
        # using djvutxt is MUCH faster, should make it an option
        if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
            from calibre.ptempfile import PersistentTemporaryFile
            try:
                fp = PersistentTemporaryFile(suffix='.djvu',
                                             prefix='djv_input')
                filename = fp._name
                fp.write(stream.read())
                fp.close()
                cmd = ['djvutxt', filename]
                stdout.write(
                    Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
                os.remove(filename)
                ppdjvu = False
            except:
                stream.seek(0)  # retry with the pure python converter
        if ppdjvu:
            from calibre.ebooks.djvu.djvu import DJVUFile
            x = DJVUFile(stream)
            x.get_text(stdout)

        html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html' % c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile.name, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb

Esempio n. 2

0

Mostra file

File: djvu_input.py Progetto: 089git/calibre

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = StringIO()
        ppdjvu = True
        # using djvutxt is MUCH faster, should make it an option
        if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
            from calibre.ptempfile import PersistentTemporaryFile
            try:
                fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
                filename = fp._name
                fp.write(stream.read())
                fp.close()
                cmd = ['djvutxt', filename]
                stdout.write(Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
                os.remove(filename)
                ppdjvu = False
            except:
                stream.seek(0) # retry with the pure python converter
        if ppdjvu:
            from calibre.ebooks.djvu.djvu import DJVUFile
            x = DJVUFile(stream)
            x.get_text(stdout)

        html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html'%c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile.name, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb

Esempio n. 3

0

Mostra file

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError(
                'The DJVU file contains no text, only images, probably page scans.'
                ' calibre only supports conversion of DJVU files with actual text in them.'
            )

        html = convert_basic(
            raw_text.replace(b"\n", b' ').replace(b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwd()
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html' % c)
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb

Esempio n. 4

0

Mostra file

File: djvu_input.py Progetto: j-howell/calibre

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)

        html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = os.path.join(base, 'index%d.html'%c)
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile.name, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb