Exemple #1
0
    def execute(self, input_filename, language=None):
        """
        Execute the command line binary of tesseract

        Runs tesseract on ``input_filename`` and returns the recognized text
        with leading and trailing whitespace stripped. When ``language`` is
        truthy and tesseract exits with a non zero code, the command is run
        again without a language argument. Raises ``OCRError`` carrying the
        stderr output when the run without a language fails.
        """
        fd, filepath = tempfile.mkstemp()
        os.close(fd)
        # The recognized text is read back from "<filepath>.txt"
        ocr_output = os.extsep.join([filepath, u'txt'])
        command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]

        if language is not None:
            command.extend([u'-l', language])

        try:
            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() alone can
            # block forever once the child fills one of the pipe buffers
            error_text = proc.communicate()[1]
            if proc.returncode == 0:
                with codecs.open(ocr_output, 'r', 'utf-8') as output_file:
                    return output_file.read().strip()
        finally:
            # Remove the placeholder and the text output on every exit path
            fs_cleanup(filepath)
            fs_cleanup(ocr_output)

        if language:
            # If tesseract gives an error with a language parameter
            # re-run it with no parameter again
            return self.execute(input_filename, language=None)

        raise OCRError(error_text)
Exemple #2
0
    def soffice(self):
        """
        Executes LibreOffice as a subprocess

        Generator. Copies ``self.file_object`` into a temporary file, runs
        LibreOffice with ``setting_temporary_directory`` as the output
        directory and yields the resulting PDF in pieces of ``CHUNK_SIZE``.
        The temporary input and the converted output are both removed.

        Raises ``OfficeConversionError`` when the configured LibreOffice path
        does not exist or when the command exits with an error.
        """

        if not os.path.exists(setting_libreoffice_path.value):
            raise OfficeConversionError(
                _(
                    'LibreOffice not installed or not found at path: %s'
                ) % setting_libreoffice_path.value
            )

        new_file_object, input_filepath = tempfile.mkstemp()
        try:
            # os.write() may write only part of the data; a buffered file
            # object writes all of it and closes the descriptor on exit
            with os.fdopen(new_file_object, 'wb') as temporary_file_object:
                self.file_object.seek(0)
                temporary_file_object.write(self.file_object.read())
            self.file_object.seek(0)

            args = (
                input_filepath, '--outdir', setting_temporary_directory.value
            )
            kwargs = {'_env': {'HOME': setting_temporary_directory.value}}

            if self.mime_type == 'text/plain':
                kwargs.update({'infilter': 'Text (encoded):UTF8,LF,,,'})

            try:
                LIBREOFFICE(*args, **kwargs)
            except sh.ErrorReturnCode as exception:
                raise OfficeConversionError(exception)
        finally:
            fs_cleanup(input_filepath)

        filename, extension = os.path.splitext(
            os.path.basename(input_filepath)
        )
        logger.debug('filename: %s', filename)
        logger.debug('extension: %s', extension)

        converted_output = os.path.join(
            setting_temporary_directory.value, os.path.extsep.join(
                (filename, 'pdf')
            )
        )
        logger.debug('converted_output: %s', converted_output)

        try:
            # PDF is binary data, never decode or translate newlines
            with open(converted_output, 'rb') as converted_file_object:
                while True:
                    data = converted_file_object.read(CHUNK_SIZE)
                    if not data:
                        break
                    yield data
        finally:
            # Also runs when the consumer abandons the generator early
            fs_cleanup(converted_output)
Exemple #3
0
    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT, **kwargs):
        """
        Open ``input_filepath`` as an image, select ``page``, apply the
        ``transformations`` and save the result to ``output_filepath`` using
        ``file_format``.

        PDF files are rendered through ``pdftoppm`` when it is available. The
        mimetype is taken from ``kwargs['mimetype']`` or detected with
        ``get_mimetype``.

        Raises ``UnknownFileFormat`` when opening the image fails with
        ``IOError`` and ``ConvertError`` for any other failure while opening.
        """
        mimetype = kwargs.get('mimetype', None)
        if not mimetype:
            # Close the handle as soon as the mimetype is known
            with open(input_filepath, 'rb') as input_file_object:
                mimetype, encoding = get_mimetype(input_file_object, input_filepath, mimetype_only=True)

        try:
            if mimetype == 'application/pdf' and pdftoppm:
                image_buffer = io.BytesIO()
                pdftoppm(input_filepath, f=page, l=page, _out=image_buffer)
                image_buffer.seek(0)
                im = Image.open(image_buffer)
            else:
                im = Image.open(input_filepath)
        except IOError:  # cannot identify image file
            # Must come before the generic handler or it is never reached
            raise UnknownFileFormat
        except Exception as exception:
            logger.error('Error converting image; %s', exception)
            # Python Imaging Library doesn't recognize it as an image
            raise ConvertError

        # Advance frame by frame until the 1-based page is reached; images
        # with fewer frames stop at the last frame available
        current_page = 0
        try:
            while current_page < page - 1:
                im.seek(im.tell() + 1)
                current_page += 1
        except EOFError:
            # end of sequence
            pass

        try:
            if transformations:
                aspect = 1.0 * im.size[0] / im.size[1]
                for transformation in transformations:
                    arguments = transformation.get('arguments')
                    if transformation['transformation'] == TRANSFORMATION_RESIZE:
                        width = int(arguments.get('width', 0))
                        # NOTE(review): aspect is width / height, so this
                        # default looks inverted; confirm against resize()
                        height = int(arguments.get('height', 1.0 * width * aspect))
                        im = self.resize(im, (width, height))
                    elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                        decimal_value = float(arguments.get('percent', 100)) / 100
                        im = im.transform((int(im.size[0] * decimal_value), int(im.size[1] * decimal_value)), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
                    elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                        # PIL counts degrees counter-clockwise, reverse them
                        im = im.rotate(360 - arguments.get('degrees', 0))
        except Exception as exception:
            # Transformations are best effort, but leave a trace and let
            # KeyboardInterrupt and SystemExit through
            logger.error('Error applying transformation; %s', exception)

        if im.mode not in ('L', 'RGB'):
            im = im.convert('RGB')

        im.save(output_filepath, format=file_format)
Exemple #4
0
    def soffice(self):
        """
        Executes LibreOffice as a subprocess

        Generator. Copies ``self.file_object`` into a temporary file, runs
        LibreOffice with ``setting_temporary_directory`` as the output
        directory and yields the resulting PDF in pieces of ``CHUNK_SIZE``.
        The temporary input and the converted output are both removed.

        Raises ``OfficeConversionError`` when the configured LibreOffice path
        does not exist or when the command exits with an error.
        """

        if not os.path.exists(setting_libreoffice_path.value):
            raise OfficeConversionError(
                _('LibreOffice not installed or not found at path: %s') %
                setting_libreoffice_path.value)

        new_file_object, input_filepath = tempfile.mkstemp()
        try:
            # os.write() may write only part of the data; a buffered file
            # object writes all of it and closes the descriptor on exit
            with os.fdopen(new_file_object, 'wb') as temporary_file_object:
                self.file_object.seek(0)
                temporary_file_object.write(self.file_object.read())
            self.file_object.seek(0)

            args = (input_filepath, '--outdir',
                    setting_temporary_directory.value)
            kwargs = {'_env': {'HOME': setting_temporary_directory.value}}

            if self.mime_type == 'text/plain':
                kwargs.update({'infilter': 'Text (encoded):UTF8,LF,,,'})

            try:
                LIBREOFFICE(*args, **kwargs)
            except sh.ErrorReturnCode as exception:
                raise OfficeConversionError(exception)
        finally:
            fs_cleanup(input_filepath)

        filename, extension = os.path.splitext(
            os.path.basename(input_filepath))
        logger.debug('filename: %s', filename)
        logger.debug('extension: %s', extension)

        converted_output = os.path.join(setting_temporary_directory.value,
                                        os.path.extsep.join((filename, 'pdf')))
        logger.debug('converted_output: %s', converted_output)

        try:
            # PDF is binary data, never decode or translate newlines
            with open(converted_output, 'rb') as converted_file_object:
                while True:
                    data = converted_file_object.read(CHUNK_SIZE)
                    if not data:
                        break
                    yield data
        finally:
            # Also runs when the consumer abandons the generator early
            fs_cleanup(converted_output)
Exemple #5
0
def do_document_ocr(document_version):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling the corresponding
    OCR backend

    For the visual OCR the page image is converted, passed through unpaper,
    converted again and handed to ``ocr_backend``; the cleaned up text is
    stored in ``document_page.content``. Every intermediate file produced
    for a page is removed, also when one of the steps raises.
    """
    for document_page in document_version.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR

            document_filepath = document_page.document.get_image_cache_name(
                page=document_page.page_number,
                version=document_page.document_version.pk)

            logger.debug('document_filepath: %s', document_filepath)

            # Paths to delete when this page is done, filled in as each
            # step produces its file so a failure leaves nothing behind
            cleanup_paths = []
            try:
                unpaper_input = convert(document_filepath,
                                        file_format=UNPAPER_FILE_FORMAT)
                cleanup_paths.append(unpaper_input)

                logger.debug('unpaper_input: %s', unpaper_input)

                unpaper_output = execute_unpaper(input_filepath=unpaper_input)
                cleanup_paths.append(unpaper_output)

                logger.debug('unpaper_output: %s', unpaper_output)

                # Convert to TIFF
                pre_ocr_filepath = convert(input_filepath=unpaper_output,
                                           file_format=DEFAULT_OCR_FILE_FORMAT)
                cleanup_paths.append(pre_ocr_filepath)

                logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])

                logger.debug(
                    'pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)

                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
                # The file now lives under its new name
                cleanup_paths[-1] = pre_ocr_filepath_w_ext
                # The page image is discarded once the OCR stage is reached
                cleanup_paths.append(document_filepath)

                ocr_text = ocr_backend.execute(
                    pre_ocr_filepath_w_ext, document_version.document.language)

                document_page.content = ocr_cleanup(
                    document_version.document.language, ocr_text)
                document_page.page_label = _('Text from OCR')
                document_page.save()
            finally:
                for cleanup_path in cleanup_paths:
                    fs_cleanup(cleanup_path)
Exemple #6
0
    def handle(self, *app_labels, **options):
        """
        Dump the ``options['from']`` database to a JSON file inside the
        media convertdb folder and load that file into ``options['to']``.

        Raises ``CommandError`` when the destination database already holds
        document types and ``options['force']`` is not set. The dump file is
        removed on every exit path.
        """
        # Create the media/convertdb folder
        convertdb_folder_path = force_text(
            Path(
                settings.MEDIA_ROOT, CONVERTDB_FOLDER
            )
        )

        try:
            os.makedirs(convertdb_folder_path)
        except OSError as exception:
            # Only an already existing folder is acceptable; anything else
            # (permissions, read only media) must not be hidden
            if exception.errno != errno.EEXIST:
                raise

        convertdb_file_path = force_text(
            Path(
                convertdb_folder_path, CONVERTDB_OUTPUT_FILENAME
            )
        )

        management.call_command('purgeperiodictasks')

        try:
            management.call_command(
                'dumpdata', *app_labels, all=True,
                database=options['from'], natural_primary=True,
                natural_foreign=True, output=convertdb_file_path,
                interactive=False, format='json'
            )

            if DocumentType.objects.using(options['to']).count() and not options['force']:
                raise CommandError(
                    'There is existing data in the database that will be '
                    'used for the import. If you proceed with the conversion '
                    'you might lose data. Please check your settings.'
                )

            management.call_command(
                'loaddata', convertdb_file_path, database=options['to'],
                interactive=False, verbosity=3
            )
        finally:
            # Never leave the dump behind, also when dumping or loading fails
            fs_cleanup(convertdb_file_path)
Exemple #7
0
    def convert(self, *args, **kwargs):
        """
        Run the parent conversion and, for PDF files when ``pdftoppm`` is
        available, render page ``self.page_number + 1`` and return it as an
        opened image. Returns ``None`` implicitly in every other case.
        """
        super(Python, self).convert(*args, **kwargs)

        if self.mime_type == 'application/pdf' and pdftoppm:

            new_file_object, input_filepath = tempfile.mkstemp()
            try:
                # os.write() may write only part of the data; a buffered
                # file object writes all of it and closes the descriptor
                with os.fdopen(new_file_object, 'wb') as temporary_file_object:
                    self.file_object.seek(0)
                    temporary_file_object.write(self.file_object.read())
                self.file_object.seek(0)

                image_buffer = io.BytesIO()
                pdftoppm(
                    input_filepath, f=self.page_number + 1,
                    l=self.page_number + 1, _out=image_buffer
                )
                image_buffer.seek(0)
                return Image.open(image_buffer)
            finally:
                # Covers a failed copy as well as a failed render
                fs_cleanup(input_filepath)
Exemple #8
0
    def convert(self, *args, **kwargs):
        """
        Run the parent conversion and, for PDF files when ``pdftoppm`` is
        available, render page ``self.page_number + 1`` and return it as an
        opened image. Returns ``None`` implicitly in every other case.
        """
        super(Python, self).convert(*args, **kwargs)

        if self.mime_type == 'application/pdf' and pdftoppm:

            new_file_object, input_filepath = mkstemp()
            try:
                # os.write() may write only part of the data; a buffered
                # file object writes all of it and closes the descriptor
                with os.fdopen(new_file_object, 'wb') as temporary_file_object:
                    self.file_object.seek(0)
                    temporary_file_object.write(self.file_object.read())
                self.file_object.seek(0)

                image_buffer = io.BytesIO()
                pdftoppm(input_filepath,
                         f=self.page_number + 1,
                         l=self.page_number + 1,
                         _out=image_buffer)
                image_buffer.seek(0)
                return Image.open(image_buffer)
            finally:
                # Covers a failed copy as well as a failed render
                fs_cleanup(input_filepath)
Exemple #9
0
def do_document_ocr(document_version):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling the corresponding
    OCR backend

    For the visual OCR the page image is converted, passed through unpaper,
    converted again and handed to ``ocr_backend``; the cleaned up text is
    stored in ``document_page.content``. Every intermediate file produced
    for a page is removed, also when one of the steps raises.
    """
    for document_page in document_version.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR

            document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk)

            logger.debug('document_filepath: %s', document_filepath)

            # Paths to delete when this page is done, filled in as each
            # step produces its file so a failure leaves nothing behind
            cleanup_paths = []
            try:
                unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
                cleanup_paths.append(unpaper_input)

                logger.debug('unpaper_input: %s', unpaper_input)

                unpaper_output = execute_unpaper(input_filepath=unpaper_input)
                cleanup_paths.append(unpaper_output)

                logger.debug('unpaper_output: %s', unpaper_output)

                # Convert to TIFF
                pre_ocr_filepath = convert(input_filepath=unpaper_output, file_format=DEFAULT_OCR_FILE_FORMAT)
                cleanup_paths.append(pre_ocr_filepath)

                logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])

                logger.debug('pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)

                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
                # The file now lives under its new name
                cleanup_paths[-1] = pre_ocr_filepath_w_ext
                # The page image is discarded once the OCR stage is reached
                cleanup_paths.append(document_filepath)

                ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language)

                document_page.content = ocr_cleanup(document_version.document.language, ocr_text)
                document_page.page_label = _('Text from OCR')
                document_page.save()
            finally:
                for cleanup_path in cleanup_paths:
                    fs_cleanup(cleanup_path)
Exemple #10
0
    def handle(self, *args, **options):
        """
        Dump the ``options['from']`` database to a JSON file inside the
        media convertdb folder and load that file into ``options['to']``.

        Raises ``CommandError`` when the destination database already holds
        document types and ``options['force']`` is not set. The dump file is
        removed on every exit path.
        """
        # Create the media/convertdb folder
        convertdb_folder_path = force_text(
            Path(settings.MEDIA_ROOT, CONVERTDB_FOLDER))

        try:
            os.makedirs(convertdb_folder_path)
        except OSError as exception:
            # Only an already existing folder is acceptable; anything else
            # (permissions, read only media) must not be hidden
            if exception.errno != errno.EEXIST:
                raise

        convertdb_file_path = force_text(
            Path(convertdb_folder_path, CONVERTDB_OUTPUT_FILENAME))

        management.call_command('purgeperiodictasks')

        try:
            management.call_command('dumpdata',
                                    all=True,
                                    database=options['from'],
                                    natural_primary=True,
                                    natural_foreign=True,
                                    output=convertdb_file_path,
                                    interactive=False,
                                    format='json')

            # Look for existing data in the database the dump is loaded
            # into, not in a hard-coded alias
            if DocumentType.objects.using(
                    options['to']).count() and not options['force']:
                raise CommandError(
                    'There is existing data in the database that will be '
                    'used for the import. If you proceed with the conversion '
                    'you might lose data. Please check you settings.')

            management.call_command('loaddata',
                                    convertdb_file_path,
                                    database=options['to'],
                                    interactive=False)
        finally:
            # Never leave the dump behind, also when dumping or loading fails
            fs_cleanup(convertdb_file_path)
Exemple #11
0
    def execute(self, file_object, page_number):
        """
        Extract the text of one PDF page by running the ``pdftotext``
        binary found at ``self.pdftotext_path`` on a temporary copy of
        ``file_object``.

        Returns the text written to standard output, without the trailing
        blank line and form feed, or an empty string when the output is
        only a form feed. Raises ``ParserError`` when the command exits
        with a non zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()
        try:
            copyfile(file_object, temp_filepath)

            command = [
                self.pdftotext_path, '-f', str(page_number), '-l',
                str(page_number), temp_filepath, '-'
            ]

            proc = subprocess.Popen(command,
                                    close_fds=True,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() alone
            # blocks forever once the page text fills the stdout pipe
            output, error_output = proc.communicate()
        finally:
            # Also covers a failed copy or a missing binary
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
Exemple #12
0
    def execute(self, file_object, page_number):
        """
        Extract the text of one PDF page by running the ``pdftotext``
        binary found at ``self.pdftotext_path`` on a temporary copy of
        ``file_object``.

        Returns the text written to standard output, without the trailing
        blank line and form feed, or an empty string when the output is
        only a form feed. Raises ``ParserError`` when the command exits
        with a non zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()
        try:
            copyfile(file_object, temp_filepath)

            command = [
                self.pdftotext_path, '-f', str(page_number), '-l',
                str(page_number), temp_filepath, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting; wait() alone
            # blocks forever once the page text fills the stdout pipe
            output, error_output = proc.communicate()
        finally:
            # Also covers a failed copy or a missing binary
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
 def tearDown(self):
     """
     Remove ``self.temporary_directory`` and run the parent tearDown.
     """
     try:
         fs_cleanup(self.temporary_directory)
     finally:
         # The parent cleanup must run even when removing the folder fails
         super(StagingFolderTestCase, self).tearDown()
Exemple #14
0
 def tearDown(self):
     """
     Remove ``self.temporary_directory`` and run the parent tearDown.
     """
     try:
         fs_cleanup(self.temporary_directory)
     finally:
         # The parent cleanup must run even when removing the folder fails
         super(StagingFolderViewTestCase, self).tearDown()
Exemple #15
0
    def soffice(self):
        """
        Executes LibreOffice as a subprocess

        Generator. Copies ``self.file_object`` into a temporary file, runs
        LibreOffice with a throwaway home directory and
        ``setting_temporary_directory`` as the output directory, then yields
        the resulting PDF in pieces of ``CHUNK_SIZE``. The temporary input,
        the home directory and the converted output are all removed.

        Raises ``OfficeConversionError`` when ``LIBREOFFICE`` is not set or
        when the command exits with an error; any other exception raised
        while launching it is logged and re-raised.
        """
        if not LIBREOFFICE:
            raise OfficeConversionError(
                _('LibreOffice not installed or not found.')
            )

        new_file_object, input_filepath = mkstemp()
        try:
            # os.write() may write only part of the data; a buffered file
            # object writes all of it and closes the descriptor on exit
            with os.fdopen(new_file_object, 'wb') as temporary_file_object:
                self.file_object.seek(0)
                temporary_file_object.write(self.file_object.read())
            self.file_object.seek(0)

            libreoffice_home_directory = mkdtemp()
            try:
                args = (
                    input_filepath, '--outdir',
                    setting_temporary_directory.value,
                    '-env:UserInstallation=file://{}'.format(
                        os.path.join(
                            libreoffice_home_directory,
                            'LibreOffice_Conversion'
                        )
                    ),
                )

                kwargs = {'_env': {'HOME': libreoffice_home_directory}}

                if self.mime_type == 'text/plain':
                    kwargs.update({'infilter': 'Text (encoded):UTF8,LF,,,'})

                try:
                    LIBREOFFICE(*args, **kwargs)
                except sh.ErrorReturnCode as exception:
                    raise OfficeConversionError(exception)
                except Exception as exception:
                    logger.error(
                        'Exception launching Libre Office; %s', exception
                    )
                    raise
            finally:
                fs_cleanup(libreoffice_home_directory)
        finally:
            fs_cleanup(input_filepath)

        filename, extension = os.path.splitext(
            os.path.basename(input_filepath)
        )
        logger.debug('filename: %s', filename)
        logger.debug('extension: %s', extension)

        converted_output = os.path.join(
            setting_temporary_directory.value, os.path.extsep.join(
                (filename, 'pdf')
            )
        )
        logger.debug('converted_output: %s', converted_output)

        try:
            with open(converted_output, mode='rb') as converted_file_object:
                while True:
                    data = converted_file_object.read(CHUNK_SIZE)
                    if not data:
                        break
                    yield data
        finally:
            # Also runs when the consumer abandons the generator early or
            # raises while handling a chunk
            fs_cleanup(converted_output)
Exemple #16
0
    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT, **kwargs):
        """
        Open ``input_filepath`` as an image, select ``page``, apply the
        ``transformations`` and save the result to ``output_filepath`` using
        ``file_format``.

        When ``USE_GHOSTSCRIPT`` is set, the requested page of a PDF file is
        first rendered by Ghostscript into a temporary JPEG. The mimetype is
        taken from ``kwargs['mimetype']`` or detected with ``get_mimetype``.

        Raises ``UnknownFileFormat`` when the image cannot be opened.
        """
        tmpfile = None
        mimetype = kwargs.get('mimetype', None)
        if not mimetype:
            # Close the handle as soon as the mimetype is known
            with open(input_filepath, 'rb') as input_file_object:
                mimetype, encoding = get_mimetype(input_file_object, input_filepath, mimetype_only=True)

        try:
            if mimetype == 'application/pdf' and USE_GHOSTSCRIPT:
                # If file is a PDF open it with ghostscript and render the
                # requested page to a JPEG
                first_page_tmpl = '-dFirstPage=%d' % page
                last_page_tmpl = '-dLastPage=%d' % page
                fd, tmpfile = tempfile.mkstemp()
                os.close(fd)
                output_file_tmpl = '-sOutputFile=%s' % tmpfile
                input_file_tmpl = '-f%s' % input_filepath
                args = [
                    'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
                    '-dNOPAUSE', '-dNOPROMPT',
                    first_page_tmpl, last_page_tmpl,
                    '-sDEVICE=jpeg', '-dJPEGQ=95',
                    '-r150', output_file_tmpl,
                    input_file_tmpl,
                    '-c "60000000 setvmthreshold"',  # use 30MB
                    '-dNOGC',  # No garbage collection
                    '-dMaxBitmap=500000000',
                    '-dAlignToPixels=0',
                    '-dGridFitTT=0',
                    '-dTextAlphaBits=4',
                    '-dGraphicsAlphaBits=4',
                ]

                ghostscript.Ghostscript(*args)
                page = 1  # Don't execute the following while loop
                input_filepath = tmpfile

            try:
                im = Image.open(input_filepath)
            except Exception:
                # Python Imaging Library doesn't recognize it as an image
                raise UnknownFileFormat
        finally:
            # Also removes the temporary file when Ghostscript itself fails
            if tmpfile:
                fs_cleanup(tmpfile)

        # Advance frame by frame until the 1-based page is reached; images
        # with fewer frames stop at the last frame available
        current_page = 0
        try:
            while current_page < page - 1:
                im.seek(im.tell() + 1)
                current_page += 1
        except EOFError:
            # end of sequence
            pass

        try:
            if transformations:
                aspect = 1.0 * im.size[0] / im.size[1]
                for transformation in transformations:
                    arguments = transformation.get('arguments')
                    if transformation['transformation'] == TRANSFORMATION_RESIZE:
                        width = int(arguments.get('width', 0))
                        # NOTE(review): aspect is width / height, so this
                        # default looks inverted; confirm against resize()
                        height = int(arguments.get('height', 1.0 * width * aspect))
                        im = self.resize(im, (width, height))
                    elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                        decimal_value = float(arguments.get('percent', 100)) / 100
                        im = im.transform((int(im.size[0] * decimal_value), int(im.size[1] * decimal_value)), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
                    elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                        # PIL counts degrees counter-clockwise, reverse them
                        im = im.rotate(360 - arguments.get('degrees', 0))
        except Exception:
            # Ignore all transformation errors, but let KeyboardInterrupt
            # and SystemExit through
            pass

        if im.mode not in ('L', 'RGB'):
            im = im.convert('RGB')

        im.save(output_filepath, format=file_format)
Exemple #17
0
    def convert_file(self,
                     input_filepath,
                     output_filepath,
                     transformations=None,
                     page=DEFAULT_PAGE_NUMBER,
                     file_format=DEFAULT_FILE_FORMAT,
                     **kwargs):
        """
        Open ``input_filepath`` as an image, select ``page``, apply the
        ``transformations`` and save the result to ``output_filepath`` using
        ``file_format``.

        When ``USE_GHOSTSCRIPT`` is set, the requested page of a PDF file is
        first rendered by Ghostscript into a temporary JPEG. The mimetype is
        taken from ``kwargs['mimetype']`` or detected with ``get_mimetype``.

        Raises ``UnknownFileFormat`` when the image cannot be opened.
        """
        tmpfile = None
        mimetype = kwargs.get('mimetype', None)
        if not mimetype:
            # Close the handle as soon as the mimetype is known
            with open(input_filepath, 'rb') as input_file_object:
                mimetype, encoding = get_mimetype(input_file_object,
                                                  input_filepath,
                                                  mimetype_only=True)

        try:
            if mimetype == 'application/pdf' and USE_GHOSTSCRIPT:
                # If file is a PDF open it with ghostscript and render the
                # requested page to a JPEG
                first_page_tmpl = '-dFirstPage=%d' % page
                last_page_tmpl = '-dLastPage=%d' % page
                fd, tmpfile = tempfile.mkstemp()
                os.close(fd)
                output_file_tmpl = '-sOutputFile=%s' % tmpfile
                input_file_tmpl = '-f%s' % input_filepath
                args = [
                    'gs',
                    '-q',
                    '-dQUIET',
                    '-dSAFER',
                    '-dBATCH',
                    '-dNOPAUSE',
                    '-dNOPROMPT',
                    first_page_tmpl,
                    last_page_tmpl,
                    '-sDEVICE=jpeg',
                    '-dJPEGQ=95',
                    '-r150',
                    output_file_tmpl,
                    input_file_tmpl,
                    '-c "60000000 setvmthreshold"',  # use 30MB
                    '-dNOGC',  # No garbage collection
                    '-dMaxBitmap=500000000',
                    '-dAlignToPixels=0',
                    '-dGridFitTT=0',
                    '-dTextAlphaBits=4',
                    '-dGraphicsAlphaBits=4',
                ]

                ghostscript.Ghostscript(*args)
                page = 1  # Don't execute the following while loop
                input_filepath = tmpfile

            try:
                im = Image.open(input_filepath)
            except Exception:
                # Python Imaging Library doesn't recognize it as an image
                raise UnknownFileFormat
        finally:
            # Also removes the temporary file when Ghostscript itself fails
            if tmpfile:
                fs_cleanup(tmpfile)

        # Advance frame by frame until the 1-based page is reached; images
        # with fewer frames stop at the last frame available
        current_page = 0
        try:
            while current_page < page - 1:
                im.seek(im.tell() + 1)
                current_page += 1
        except EOFError:
            # end of sequence
            pass

        try:
            if transformations:
                aspect = 1.0 * im.size[0] / im.size[1]
                for transformation in transformations:
                    arguments = transformation.get('arguments')
                    if transformation[
                            'transformation'] == TRANSFORMATION_RESIZE:
                        width = int(arguments.get('width', 0))
                        # NOTE(review): aspect is width / height, so this
                        # default looks inverted; confirm against resize()
                        height = int(
                            arguments.get('height', 1.0 * width * aspect))
                        im = self.resize(im, (width, height))
                    elif transformation[
                            'transformation'] == TRANSFORMATION_ZOOM:
                        decimal_value = float(arguments.get('percent',
                                                            100)) / 100
                        im = im.transform((int(im.size[0] * decimal_value),
                                           int(im.size[1] * decimal_value)),
                                          Image.EXTENT,
                                          (0, 0, im.size[0], im.size[1]))
                    elif transformation[
                            'transformation'] == TRANSFORMATION_ROTATE:
                        # PIL counts degrees counter-clockwise, reverse them
                        im = im.rotate(360 - arguments.get('degrees', 0))
        except Exception:
            # Ignore all transformation errors, but let KeyboardInterrupt
            # and SystemExit through
            pass

        if im.mode not in ('L', 'RGB'):
            im = im.convert('RGB')

        im.save(output_filepath, format=file_format)
Exemple #18
0
def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=None, *args, **kwargs):
    """
    Convert ``input_filepath`` into an image file and return its path.

    Recognized keyword arguments: ``size``, ``file_format``, ``zoom``,
    ``rotation``, ``page`` and ``transformations``. When ``output_filepath``
    is not given it is obtained from ``create_image_cache_filename``; if
    that file already exists it is returned without converting again.
    When an office converter is configured the input is passed through it
    first. With ``cleanup_files`` set, the file handed to the backend is
    removed after the conversion attempt.

    Raises ``UnknownFileFormat`` when the office converter fails.
    """
    size = kwargs.get('size')
    file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
    zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
    rotation = kwargs.get('rotation', DEFAULT_ROTATION)
    page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
    # Work on a copy: the size, zoom and rotation entries appended below
    # must not pile up in a list the caller keeps and reuses
    transformations = list(kwargs.get('transformations') or [])

    if output_filepath is None:
        output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)

    if os.path.exists(output_filepath):
        return output_filepath

    if office_converter:
        try:
            office_converter.convert(input_filepath, mimetype=mimetype)
            if office_converter.exists:
                input_filepath = office_converter.output_filepath
                mimetype = 'application/pdf'
            else:
                # Recycle the already detected mimetype
                mimetype = office_converter.mimetype

        except OfficeConversionError:
            raise UnknownFileFormat('office converter exception')

    if size:
        transformations.append(
            {
                'transformation': TRANSFORMATION_RESIZE,
                'arguments': dict(zip([u'width', u'height'], size.split(DIMENSION_SEPARATOR)))
            }
        )

    if zoom != 100:
        transformations.append(
            {
                'transformation': TRANSFORMATION_ZOOM,
                'arguments': {'percent': zoom}
            }
        )

    if rotation != 0 and rotation != 360:
        transformations.append(
            {
                'transformation': TRANSFORMATION_ROTATE,
                'arguments': {'degrees': rotation}
            }
        )

    try:
        backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype)
    finally:
        if cleanup_files:
            fs_cleanup(input_filepath)

    return output_filepath