def execute(self, input_filename, language=None):
    """
    Execute the command line binary of tesseract

    input_filename is the path of the image handed to tesseract and
    language, when given, is passed through the ``-l`` switch. Returns the
    text tesseract wrote, decoded as UTF-8 and stripped of surrounding
    whitespace.

    When tesseract exits with a non zero code and a language was requested
    the call is retried once without a language; otherwise OCRError is
    raised with the content of tesseract's stderr.
    """
    fd, filepath = tempfile.mkstemp()
    os.close(fd)
    # The text is read back from the temporary name plus a "txt" extension
    ocr_output = os.extsep.join([filepath, u'txt'])
    command = [
        unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)
    ]

    if language is not None:
        command.extend([u'-l', language])

    try:
        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # communicate() drains both pipes while waiting; wait() alone can
        # block forever once the child fills one of the pipe buffers
        error_text = proc.communicate()[1]

        if proc.returncode != 0:
            if language:
                # If tesseract gives an error with a language parameter
                # re-run it with no parameter again
                return self.execute(input_filename, language=None)
            raise OCRError(error_text)

        with codecs.open(ocr_output, 'r', 'utf-8') as ocr_file:
            return ocr_file.read().strip()
    finally:
        # Remove the placeholder and the text output on every exit path;
        # the success path used to leave the text output behind
        fs_cleanup(filepath)
        fs_cleanup(ocr_output)
def soffice(self):
    """
    Executes LibreOffice as a subprocess

    Generator. Copies self.file_object into a temporary file, runs
    LIBREOFFICE on it with setting_temporary_directory as the output
    directory and yields the resulting ``.pdf`` file in pieces of at most
    CHUNK_SIZE. The temporary input and the converted output are removed
    once they are no longer needed.

    Raises OfficeConversionError when the configured LibreOffice path does
    not exist or when the command exits with an error code.
    """
    if not os.path.exists(setting_libreoffice_path.value):
        raise OfficeConversionError(
            _(
                'LibreOffice not installed or not found at path: %s'
            ) % setting_libreoffice_path.value
        )

    new_file_object, input_filepath = tempfile.mkstemp()

    try:
        # fdopen() takes ownership of the descriptor: it is closed even if
        # the copy fails and write() never stops at a partial write
        with os.fdopen(new_file_object, 'wb') as input_file_object:
            self.file_object.seek(0)
            input_file_object.write(self.file_object.read())
            self.file_object.seek(0)

        libreoffice_filter = None
        if self.mime_type == 'text/plain':
            libreoffice_filter = 'Text (encoded):UTF8,LF,,,'

        args = (input_filepath, '--outdir', setting_temporary_directory.value)

        kwargs = {'_env': {'HOME': setting_temporary_directory.value}}

        if libreoffice_filter:
            kwargs.update({'infilter': libreoffice_filter})

        try:
            LIBREOFFICE(*args, **kwargs)
        except sh.ErrorReturnCode as exception:
            raise OfficeConversionError(exception)
    finally:
        fs_cleanup(input_filepath)

    filename, extension = os.path.splitext(
        os.path.basename(input_filepath)
    )
    logger.debug('filename: %s', filename)
    logger.debug('extension: %s', extension)

    # The output is looked up under the input's base name with "pdf" as
    # extension inside the output directory
    converted_output = os.path.join(
        setting_temporary_directory.value, os.path.extsep.join(
            (filename, 'pdf')
        )
    )
    logger.debug('converted_output: %s', converted_output)

    try:
        # Binary mode: the PDF must be streamed as raw bytes
        with open(converted_output, 'rb') as converted_file_object:
            while True:
                data = converted_file_object.read(CHUNK_SIZE)
                if not data:
                    break
                yield data
    finally:
        # Also runs when the consumer closes the generator early
        fs_cleanup(converted_output)
def convert_file(self, input_filepath, output_filepath, transformations=None,
                 page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT,
                 **kwargs):
    """
    Open a file as an image, apply transformations and save the result

    input_filepath -- path of the file to open
    output_filepath -- path the resulting image is saved to
    transformations -- optional list of dictionaries with a
        'transformation' key and an 'arguments' dictionary
    page -- 1-based page (frame) number to use
    file_format -- format passed to the image save call
    mimetype (keyword) -- skips the mimetype detection when provided

    PDF files are rendered through pdftoppm when it is available.

    Raises UnknownFileFormat when opening the image raises IOError and
    ConvertError for any other error while opening it. Errors raised by
    the transformations are logged and otherwise ignored.
    """
    mimetype = kwargs.get('mimetype', None)
    if not mimetype:
        # Close the handle used for the detection instead of leaking it
        with open(input_filepath, 'rb') as input_file_object:
            mimetype, encoding = get_mimetype(
                input_file_object, input_filepath, mimetype_only=True
            )

    try:
        if mimetype == 'application/pdf' and pdftoppm:
            image_buffer = io.BytesIO()
            pdftoppm(input_filepath, f=page, l=page, _out=image_buffer)
            image_buffer.seek(0)
            im = Image.open(image_buffer)
        else:
            im = Image.open(input_filepath)
    except IOError:
        # cannot identify image file
        # This clause has to precede the generic one, otherwise it can
        # never be reached
        raise UnknownFileFormat
    except Exception as exception:
        logger.error('Error converting image; %s', exception)
        raise ConvertError

    current_page = 0
    try:
        # Advance one frame at a time until the requested page is current;
        # images with fewer frames stop at the last one available
        while current_page < page - 1:
            im.seek(im.tell() + 1)
            current_page += 1
    except EOFError:
        # end of sequence
        pass

    try:
        if transformations:
            aspect = 1.0 * im.size[0] / im.size[1]
            for transformation in transformations:
                arguments = transformation.get('arguments')
                transformation_name = transformation['transformation']
                if transformation_name == TRANSFORMATION_RESIZE:
                    width = int(arguments.get('width', 0))
                    # NOTE(review): aspect is width / height, so the
                    # default height looks like it should be
                    # width / aspect -- confirm against self.resize
                    height = int(
                        arguments.get('height', 1.0 * width * aspect)
                    )
                    im = self.resize(im, (width, height))
                elif transformation_name == TRANSFORMATION_ZOOM:
                    decimal_value = float(arguments.get('percent', 100)) / 100
                    im = im.transform(
                        (
                            int(im.size[0] * decimal_value),
                            int(im.size[1] * decimal_value)
                        ), Image.EXTENT, (0, 0, im.size[0], im.size[1])
                    )
                elif transformation_name == TRANSFORMATION_ROTATE:
                    # PIL counts degrees counter-clockwise, reverse them
                    im = im.rotate(360 - arguments.get('degrees', 0))
    except Exception as exception:
        # Transformations stay best effort, but the failure is recorded
        # and KeyboardInterrupt / SystemExit are no longer swallowed
        logger.warning('Error applying transformation; %s', exception)

    if im.mode not in ('L', 'RGB'):
        im = im.convert('RGB')

    im.save(output_filepath, format=file_format)
def soffice(self):
    """
    Executes LibreOffice as a subprocess

    Generator. Writes the content of self.file_object to a temporary file,
    calls LIBREOFFICE with setting_temporary_directory as ``--outdir`` and
    yields the produced ``.pdf`` file in pieces of at most CHUNK_SIZE.
    Both the temporary input and the converted output are removed
    afterwards.

    Raises OfficeConversionError when the configured LibreOffice path does
    not exist or when the command exits with an error code.
    """
    if not os.path.exists(setting_libreoffice_path.value):
        raise OfficeConversionError(
            _('LibreOffice not installed or not found at path: %s') %
            setting_libreoffice_path.value)

    new_file_object, input_filepath = tempfile.mkstemp()

    try:
        # The descriptor is wrapped so that it gets closed on any error
        # and the whole content is written, not just a partial chunk
        with os.fdopen(new_file_object, 'wb') as input_file_object:
            self.file_object.seek(0)
            input_file_object.write(self.file_object.read())
            self.file_object.seek(0)

        libreoffice_filter = None
        if self.mime_type == 'text/plain':
            libreoffice_filter = 'Text (encoded):UTF8,LF,,,'

        args = (input_filepath, '--outdir', setting_temporary_directory.value)

        kwargs = {'_env': {'HOME': setting_temporary_directory.value}}

        if libreoffice_filter:
            kwargs.update({'infilter': libreoffice_filter})

        try:
            LIBREOFFICE(*args, **kwargs)
        except sh.ErrorReturnCode as exception:
            raise OfficeConversionError(exception)
    finally:
        fs_cleanup(input_filepath)

    filename, extension = os.path.splitext(
        os.path.basename(input_filepath))
    logger.debug('filename: %s', filename)
    logger.debug('extension: %s', extension)

    # Same base name as the input, "pdf" extension, inside the outdir
    converted_output = os.path.join(setting_temporary_directory.value,
                                    os.path.extsep.join((filename, 'pdf')))
    logger.debug('converted_output: %s', converted_output)

    try:
        # Binary mode: the PDF must be streamed as raw bytes
        with open(converted_output, 'rb') as converted_file_object:
            while True:
                data = converted_file_object.read(CHUNK_SIZE)
                if not data:
                    break
                yield data
    finally:
        # Executed as well when the generator is closed before exhaustion
        fs_cleanup(converted_output)
def do_document_ocr(document_version):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling the corresponding
    OCR backend

    Every intermediate file produced for a page is removed when that page
    is done, whether the OCR succeeded or any step raised.
    """
    for document_page in document_version.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR

            # Paths are registered as soon as they are known so that a
            # failure half way through does not leave files behind
            temporary_files = []
            try:
                document_filepath = document_page.document.get_image_cache_name(
                    page=document_page.page_number,
                    version=document_page.document_version.pk
                )
                temporary_files.append(document_filepath)
                logger.debug('document_filepath: %s', document_filepath)

                unpaper_input = convert(
                    document_filepath, file_format=UNPAPER_FILE_FORMAT
                )
                temporary_files.append(unpaper_input)
                logger.debug('unpaper_input: %s', unpaper_input)

                unpaper_output = execute_unpaper(input_filepath=unpaper_input)
                temporary_files.append(unpaper_output)
                logger.debug('unpaper_output: %s', unpaper_output)

                # Convert to TIFF
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output,
                    file_format=DEFAULT_OCR_FILE_FORMAT
                )
                temporary_files.append(pre_ocr_filepath)
                logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION]
                )
                logger.debug(
                    'pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext
                )
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
                # The file now lives under its new name only
                temporary_files.remove(pre_ocr_filepath)
                temporary_files.append(pre_ocr_filepath_w_ext)

                ocr_text = ocr_backend.execute(
                    pre_ocr_filepath_w_ext, document_version.document.language
                )

                document_page.content = ocr_cleanup(
                    document_version.document.language, ocr_text
                )
                document_page.page_label = _('Text from OCR')
                document_page.save()
            finally:
                for temporary_file in temporary_files:
                    fs_cleanup(temporary_file)
def handle(self, *app_labels, **options):
    """
    Copy data from the options['from'] database to the options['to'] one

    The data is dumped as JSON into a file inside the media convertdb
    folder and loaded from it afterwards; the dump file is removed when
    the command ends, also when the load fails.

    Raises CommandError when the target database already holds document
    types and options['force'] is not set.
    """
    # Create the media/convertdb folder
    convertdb_folder_path = force_text(
        Path(
            settings.MEDIA_ROOT, CONVERTDB_FOLDER
        )
    )

    try:
        os.makedirs(convertdb_folder_path)
    except OSError as exception:
        # An already existing folder is fine, any other error (for
        # instance missing permissions) must not be silently ignored
        if exception.errno != errno.EEXIST:
            raise

    convertdb_file_path = force_text(
        Path(
            convertdb_folder_path, CONVERTDB_OUTPUT_FILENAME
        )
    )

    management.call_command('purgeperiodictasks')

    management.call_command(
        'dumpdata', *app_labels, all=True,
        database=options['from'], natural_primary=True,
        natural_foreign=True, output=convertdb_file_path,
        interactive=False, format='json'
    )

    try:
        if DocumentType.objects.using(options['to']).count() and not options['force']:
            raise CommandError(
                'There is existing data in the database that will be '
                'used for the import. If you proceed with the conversion '
                'you might lose data. Please check your settings.'
            )

        management.call_command(
            'loaddata', convertdb_file_path, database=options['to'],
            interactive=False, verbosity=3
        )
    finally:
        # Do not leave the dump behind, whatever the outcome
        fs_cleanup(convertdb_file_path)
def convert(self, *args, **kwargs):
    """
    Render the current page of a PDF file as an image

    Calls the parent class convert first. When self.mime_type is
    'application/pdf' and pdftoppm is available, the content of
    self.file_object is copied to a temporary file, page
    self.page_number + 1 is rendered with pdftoppm into memory and the
    result is returned opened with Image.open. Nothing is returned
    otherwise.
    """
    super(Python, self).convert(*args, **kwargs)

    if self.mime_type == 'application/pdf' and pdftoppm:
        new_file_object, input_filepath = tempfile.mkstemp()

        try:
            # fdopen() owns the descriptor from here on: it is closed on
            # any error and the complete content gets written
            with os.fdopen(new_file_object, 'wb') as temporary_file_object:
                self.file_object.seek(0)
                temporary_file_object.write(self.file_object.read())
                self.file_object.seek(0)

            image_buffer = io.BytesIO()
            # self.page_number is incremented by one for pdftoppm's
            # first (f) and last (l) page arguments
            pdftoppm(
                input_filepath, f=self.page_number + 1,
                l=self.page_number + 1, _out=image_buffer
            )
            image_buffer.seek(0)
            return Image.open(image_buffer)
        finally:
            fs_cleanup(input_filepath)
def convert(self, *args, **kwargs):
    """
    Render the current page of a PDF file as an image

    The parent class convert is called first. For 'application/pdf'
    content, and only when pdftoppm is available, self.file_object is
    copied to a temporary file, page self.page_number + 1 is rendered by
    pdftoppm into an in-memory buffer and that buffer is returned opened
    with Image.open. In any other case nothing is returned.
    """
    super(Python, self).convert(*args, **kwargs)

    if self.mime_type == 'application/pdf' and pdftoppm:
        new_file_object, input_filepath = mkstemp()

        try:
            # Wrapping the descriptor guarantees it is closed if the copy
            # fails and avoids the partial writes os.write() allows
            with os.fdopen(new_file_object, 'wb') as temporary_file_object:
                self.file_object.seek(0)
                temporary_file_object.write(self.file_object.read())
                self.file_object.seek(0)

            image_buffer = io.BytesIO()
            # Only one page is rendered: first (f) and last (l) page are
            # both self.page_number + 1
            pdftoppm(input_filepath, f=self.page_number + 1,
                     l=self.page_number + 1, _out=image_buffer)
            image_buffer.seek(0)
            return Image.open(image_buffer)
        finally:
            fs_cleanup(input_filepath)
def do_document_ocr(document_version):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling the corresponding
    OCR backend

    The intermediate files created for a page are always removed before
    moving on, including when one of the conversion steps raises.
    """
    for document_page in document_version.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR

            # Each path is recorded right after it becomes known, this way
            # an error in a later step still cleans up the earlier files
            temporary_files = []
            try:
                document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk)
                temporary_files.append(document_filepath)
                logger.debug('document_filepath: %s', document_filepath)

                unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
                temporary_files.append(unpaper_input)
                logger.debug('unpaper_input: %s', unpaper_input)

                unpaper_output = execute_unpaper(input_filepath=unpaper_input)
                temporary_files.append(unpaper_output)
                logger.debug('unpaper_output: %s', unpaper_output)

                # Convert to TIFF
                pre_ocr_filepath = convert(input_filepath=unpaper_output, file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_files.append(pre_ocr_filepath)
                logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                logger.debug('pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
                # After the rename only the new name exists
                temporary_files.remove(pre_ocr_filepath)
                temporary_files.append(pre_ocr_filepath_w_ext)

                ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language)

                document_page.content = ocr_cleanup(document_version.document.language, ocr_text)
                document_page.page_label = _('Text from OCR')
                document_page.save()
            finally:
                for temporary_file in temporary_files:
                    fs_cleanup(temporary_file)
def handle(self, *args, **options):
    """
    Copy data from the options['from'] database to the options['to'] one

    A JSON dump is written into the media convertdb folder and then
    loaded into the target database. The dump file is removed at the end,
    also when loading it fails.

    Raises CommandError when the target database already contains
    document types and options['force'] is not set.
    """
    # Create the media/convertdb folder
    convertdb_folder_path = force_text(
        Path(settings.MEDIA_ROOT, CONVERTDB_FOLDER))

    try:
        os.makedirs(convertdb_folder_path)
    except OSError as exception:
        # Only an already existing folder may be ignored; other errors
        # used to be swallowed here as well
        if exception.errno != errno.EEXIST:
            raise

    convertdb_file_path = force_text(
        Path(convertdb_folder_path, CONVERTDB_OUTPUT_FILENAME))

    management.call_command('purgeperiodictasks')

    management.call_command('dumpdata', all=True,
                            database=options['from'], natural_primary=True,
                            natural_foreign=True, output=convertdb_file_path,
                            interactive=False, format='json')

    try:
        # Look for existing data in the database that is about to be
        # loaded, not unconditionally in 'default'
        if DocumentType.objects.using(
                options['to']).count() and not options['force']:
            raise CommandError(
                'There is existing data in the database that will be '
                'used for the import. If you proceed with the conversion '
                'you might lose data. Please check your settings.')

        management.call_command('loaddata', convertdb_file_path,
                                database=options['to'], interactive=False)
    finally:
        # The dump must not outlive the command, whatever the outcome
        fs_cleanup(convertdb_file_path)
def execute(self, file_object, page_number):
    """
    Extract the text of a single PDF page by calling pdftotext

    file_object is copied to a temporary file which is handed to the
    binary at self.pdftotext_path with page_number as both first and last
    page; the text is read from the command's standard output.

    Returns an empty string when the output is only a form feed, the
    output without its trailing two line feeds and form feed when it ends
    that way, and the unmodified output otherwise. Raises ParserError
    when the command exits with a non zero code.
    """
    logger.debug('Parsing PDF page: %d', page_number)

    destination_descriptor, temp_filepath = mkstemp()

    try:
        copyfile(file_object, temp_filepath)

        command = [
            self.pdftotext_path, '-f', str(page_number), '-l',
            str(page_number), temp_filepath, '-'
        ]

        proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        # The text arrives through the stdout pipe: wait() would block
        # forever on pages larger than the pipe buffer, communicate()
        # reads both pipes while waiting
        output, error_output = proc.communicate()
    finally:
        fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

    if proc.returncode != 0:
        # Log the first line of stderr only
        logger.error(error_output.splitlines()[0] if error_output else error_output)
        raise ParserError

    if output == b'\x0c':
        logger.debug('Parser didn\'t return any output')
        return ''

    if output[-3:] == b'\x0a\x0a\x0c':
        return output[:-3]

    return output
def execute(self, file_object, page_number):
    """
    Extract the text of a single PDF page by calling pdftotext

    The content of file_object is copied to a temporary file and the
    binary at self.pdftotext_path is run on it with page_number as first
    and last page, writing the text to its standard output.

    Returns an empty string when the output consists of a single form
    feed, the output minus the trailing two line feeds and form feed when
    present, or the output as is. Raises ParserError when the command
    exits with a non zero code.
    """
    logger.debug('Parsing PDF page: %d', page_number)

    destination_descriptor, temp_filepath = mkstemp()

    try:
        copyfile(file_object, temp_filepath)

        command = [
            self.pdftotext_path,
            '-f', str(page_number),
            '-l', str(page_number),
            temp_filepath,
            '-',
        ]

        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # communicate() keeps draining stdout and stderr; a plain wait()
        # deadlocks as soon as the page text exceeds the pipe buffer
        output, error_output = proc.communicate()
    finally:
        # Single cleanup point for the success and the error paths
        fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

    if proc.returncode != 0:
        # Log the first line of stderr only
        logger.error(
            error_output.splitlines()[0] if error_output else error_output
        )
        raise ParserError

    if output == b'\x0c':
        logger.debug('Parser didn\'t return any output')
        return ''

    if output[-3:] == b'\x0a\x0a\x0c':
        return output[:-3]

    return output
def tearDown(self):
    """
    Remove self.temporary_directory and run the parent class tearDown
    """
    try:
        fs_cleanup(self.temporary_directory)
    finally:
        # The parent tearDown has to run even if the removal fails
        super(StagingFolderTestCase, self).tearDown()
def tearDown(self):
    """
    Remove self.temporary_directory and run the parent class tearDown
    """
    try:
        fs_cleanup(self.temporary_directory)
    finally:
        # Never skip the parent tearDown because the removal failed
        super(StagingFolderViewTestCase, self).tearDown()
def soffice(self):
    """
    Executes LibreOffice as a subprocess

    Generator. Copies self.file_object to a temporary file, runs
    LIBREOFFICE on it using a throwaway directory as HOME and as user
    installation, with setting_temporary_directory as output directory,
    and yields the resulting ``.pdf`` file in pieces of at most
    CHUNK_SIZE. All the temporary files are removed, also when the
    consumer stops iterating early.

    Raises OfficeConversionError when LIBREOFFICE is not available or
    exits with an error code; any other exception raised while launching
    it is logged and re-raised.
    """
    if not LIBREOFFICE:
        raise OfficeConversionError(
            _('LibreOffice not installed or not found.')
        )

    new_file_object, input_filepath = mkstemp()
    libreoffice_home_directory = None

    try:
        # fdopen() takes over the descriptor: closed on every path and
        # the complete content is written
        with os.fdopen(new_file_object, 'wb') as input_file_object:
            self.file_object.seek(0)
            input_file_object.write(self.file_object.read())
            self.file_object.seek(0)

        libreoffice_filter = None
        if self.mime_type == 'text/plain':
            libreoffice_filter = 'Text (encoded):UTF8,LF,,,'

        libreoffice_home_directory = mkdtemp()
        args = (
            input_filepath, '--outdir', setting_temporary_directory.value,
            '-env:UserInstallation=file://{}'.format(
                os.path.join(
                    libreoffice_home_directory, 'LibreOffice_Conversion'
                )
            ),
        )

        kwargs = {'_env': {'HOME': libreoffice_home_directory}}

        if libreoffice_filter:
            kwargs.update({'infilter': libreoffice_filter})

        try:
            LIBREOFFICE(*args, **kwargs)
        except sh.ErrorReturnCode as exception:
            raise OfficeConversionError(exception)
        except Exception as exception:
            logger.error('Exception launching Libre Office; %s', exception)
            raise
    finally:
        fs_cleanup(input_filepath)
        if libreoffice_home_directory:
            fs_cleanup(libreoffice_home_directory)

    filename, extension = os.path.splitext(
        os.path.basename(input_filepath)
    )
    logger.debug('filename: %s', filename)
    logger.debug('extension: %s', extension)

    # Output lookup: input base name plus "pdf" inside the outdir
    converted_output = os.path.join(
        setting_temporary_directory.value, os.path.extsep.join(
            (filename, 'pdf')
        )
    )
    logger.debug('converted_output: %s', converted_output)

    try:
        with open(converted_output, mode='rb') as converted_file_object:
            while True:
                data = converted_file_object.read(CHUNK_SIZE)
                if not data:
                    break
                yield data
    finally:
        # Reached on exhaustion, on read errors and when the generator
        # is closed early
        fs_cleanup(converted_output)
def convert_file(self, input_filepath, output_filepath, transformations=None,
                 page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT,
                 **kwargs):
    """
    Open a file as an image, apply transformations and save the result

    input_filepath -- path of the file to open
    output_filepath -- path the resulting image is saved to
    transformations -- optional list of dictionaries with a
        'transformation' key and an 'arguments' dictionary
    page -- 1-based page (frame) number to use
    file_format -- format passed to the image save call
    mimetype (keyword) -- skips the mimetype detection when provided

    PDF files are rendered to a temporary JPEG with ghostscript first
    when USE_GHOSTSCRIPT is set.

    Raises UnknownFileFormat when the file cannot be opened as an image.
    Errors raised by the transformations are ignored.
    """
    tmpfile = None
    mimetype = kwargs.get('mimetype', None)
    if not mimetype:
        # Close the handle used for the detection instead of leaking it
        with open(input_filepath, 'rb') as input_file_object:
            mimetype, encoding = get_mimetype(
                input_file_object, input_filepath, mimetype_only=True
            )

    try:
        if mimetype == 'application/pdf' and USE_GHOSTSCRIPT:
            # If file is a PDF render the requested page with ghostscript
            # into a temporary JPEG file
            first_page_tmpl = '-dFirstPage=%d' % page
            last_page_tmpl = '-dLastPage=%d' % page
            fd, tmpfile = tempfile.mkstemp()
            os.close(fd)
            output_file_tmpl = '-sOutputFile=%s' % tmpfile
            input_file_tmpl = '-f%s' % input_filepath
            args = [
                'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
                '-dNOPAUSE', '-dNOPROMPT',
                first_page_tmpl, last_page_tmpl,
                '-sDEVICE=jpeg', '-dJPEGQ=95',
                '-r150', output_file_tmpl,
                input_file_tmpl,
                '-c "60000000 setvmthreshold"',  # use 30MB
                '-dNOGC',  # No garbage collection
                '-dMaxBitmap=500000000',
                '-dAlignToPixels=0',
                '-dGridFitTT=0',
                '-dTextAlphaBits=4',
                '-dGraphicsAlphaBits=4',
            ]

            ghostscript.Ghostscript(*args)
            page = 1  # Don't execute the following while loop
            input_filepath = tmpfile

        try:
            im = Image.open(input_filepath)
        except Exception:
            # Python Imaging Library doesn't recognize it as an image
            raise UnknownFileFormat
    finally:
        # Covers a ghostscript failure too, which used to leave the
        # temporary file behind
        if tmpfile:
            fs_cleanup(tmpfile)

    current_page = 0
    try:
        # Advance one frame at a time until the requested page is current;
        # with page == 1 the loop is skipped, as the ghostscript branch
        # above expects
        while current_page < page - 1:
            im.seek(im.tell() + 1)
            current_page += 1
    except EOFError:
        # end of sequence
        pass

    try:
        if transformations:
            aspect = 1.0 * im.size[0] / im.size[1]
            for transformation in transformations:
                arguments = transformation.get('arguments')
                transformation_name = transformation['transformation']
                if transformation_name == TRANSFORMATION_RESIZE:
                    width = int(arguments.get('width', 0))
                    # NOTE(review): aspect is width / height, so the
                    # default height looks like it should be
                    # width / aspect -- confirm against self.resize
                    height = int(
                        arguments.get('height', 1.0 * width * aspect)
                    )
                    im = self.resize(im, (width, height))
                elif transformation_name == TRANSFORMATION_ZOOM:
                    decimal_value = float(arguments.get('percent', 100)) / 100
                    im = im.transform(
                        (
                            int(im.size[0] * decimal_value),
                            int(im.size[1] * decimal_value)
                        ), Image.EXTENT, (0, 0, im.size[0], im.size[1])
                    )
                elif transformation_name == TRANSFORMATION_ROTATE:
                    # PIL counts degrees counter-clockwise, reverse them
                    im = im.rotate(360 - arguments.get('degrees', 0))
    except Exception:
        # Ignore all transformation errors, but let KeyboardInterrupt and
        # SystemExit through
        pass

    if im.mode not in ('L', 'RGB'):
        im = im.convert('RGB')

    im.save(output_filepath, format=file_format)
def convert_file(self,
                 input_filepath,
                 output_filepath,
                 transformations=None,
                 page=DEFAULT_PAGE_NUMBER,
                 file_format=DEFAULT_FILE_FORMAT,
                 **kwargs):
    """
    Open a file as an image, apply transformations and save the result

    input_filepath -- path of the file to open
    output_filepath -- path the resulting image is saved to
    transformations -- optional list of dictionaries with a
        'transformation' key and an 'arguments' dictionary
    page -- 1-based page (frame) number to use
    file_format -- format passed to the image save call
    mimetype (keyword) -- skips the mimetype detection when provided

    When USE_GHOSTSCRIPT is set, PDF files are first rendered to a
    temporary JPEG with ghostscript.

    Raises UnknownFileFormat when the file cannot be opened as an image.
    Errors raised by the transformations are ignored.
    """
    tmpfile = None
    mimetype = kwargs.get('mimetype', None)
    if not mimetype:
        # The detection handle is closed once the mimetype is known
        with open(input_filepath, 'rb') as input_file_object:
            mimetype, encoding = get_mimetype(input_file_object,
                                              input_filepath,
                                              mimetype_only=True)

    try:
        if mimetype == 'application/pdf' and USE_GHOSTSCRIPT:
            # If file is a PDF render the requested page with ghostscript
            # into a temporary JPEG file
            first_page_tmpl = '-dFirstPage=%d' % page
            last_page_tmpl = '-dLastPage=%d' % page
            fd, tmpfile = tempfile.mkstemp()
            os.close(fd)
            output_file_tmpl = '-sOutputFile=%s' % tmpfile
            input_file_tmpl = '-f%s' % input_filepath
            args = [
                'gs',
                '-q',
                '-dQUIET',
                '-dSAFER',
                '-dBATCH',
                '-dNOPAUSE',
                '-dNOPROMPT',
                first_page_tmpl,
                last_page_tmpl,
                '-sDEVICE=jpeg',
                '-dJPEGQ=95',
                '-r150',
                output_file_tmpl,
                input_file_tmpl,
                '-c "60000000 setvmthreshold"',  # use 30MB
                '-dNOGC',  # No garbage collection
                '-dMaxBitmap=500000000',
                '-dAlignToPixels=0',
                '-dGridFitTT=0',
                '-dTextAlphaBits=4',
                '-dGraphicsAlphaBits=4',
            ]

            ghostscript.Ghostscript(*args)
            page = 1  # Don't execute the following while loop
            input_filepath = tmpfile

        try:
            im = Image.open(input_filepath)
        except Exception:
            # Python Imaging Library doesn't recognize it as an image
            raise UnknownFileFormat
    finally:
        # Also reached when ghostscript itself raises, so the temporary
        # file is no longer left behind in that case
        if tmpfile:
            fs_cleanup(tmpfile)

    current_page = 0
    try:
        # Step through the frames until the requested page is current;
        # page == 1 skips the loop, which is what the ghostscript branch
        # relies on
        while current_page < page - 1:
            im.seek(im.tell() + 1)
            current_page += 1
    except EOFError:
        # end of sequence
        pass

    try:
        if transformations:
            aspect = 1.0 * im.size[0] / im.size[1]
            for transformation in transformations:
                arguments = transformation.get('arguments')
                transformation_name = transformation['transformation']
                if transformation_name == TRANSFORMATION_RESIZE:
                    width = int(arguments.get('width', 0))
                    # NOTE(review): aspect is width / height, so the
                    # default height looks like it should be
                    # width / aspect -- confirm against self.resize
                    height = int(
                        arguments.get('height', 1.0 * width * aspect))
                    im = self.resize(im, (width, height))
                elif transformation_name == TRANSFORMATION_ZOOM:
                    decimal_value = float(arguments.get('percent',
                                                        100)) / 100
                    im = im.transform((int(im.size[0] * decimal_value),
                                       int(im.size[1] * decimal_value)),
                                      Image.EXTENT,
                                      (0, 0, im.size[0], im.size[1]))
                elif transformation_name == TRANSFORMATION_ROTATE:
                    # PIL counts degrees counter-clockwise, reverse them
                    im = im.rotate(360 - arguments.get('degrees', 0))
    except Exception:
        # Ignore all transformation errors; KeyboardInterrupt and
        # SystemExit are not caught anymore
        pass

    if im.mode not in ('L', 'RGB'):
        im = im.convert('RGB')

    im.save(output_filepath, format=file_format)
def convert(input_filepath, output_filepath=None, cleanup_files=False,
            mimetype=None, *args, **kwargs):
    """
    Convert a file to an image through the converter backend

    input_filepath -- path of the file to convert
    output_filepath -- destination path; derived with
        create_image_cache_filename when omitted
    cleanup_files -- remove the input file after calling the backend
    mimetype -- mimetype of the input, forwarded to the converters

    Recognized keyword arguments: size, file_format, zoom, rotation, page
    and transformations. size, zoom and rotation are turned into extra
    transformations appended after the ones provided.

    Returns the output path. When that path already exists it is returned
    right away without converting anything. Raises UnknownFileFormat when
    the office converter raises OfficeConversionError.
    """
    size = kwargs.get('size')
    file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
    zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
    rotation = kwargs.get('rotation', DEFAULT_ROTATION)
    page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
    # Work on a copy: the entries appended below must not end up in the
    # list owned by the caller, where they would pile up on every call
    transformations = list(kwargs.get('transformations') or [])

    if output_filepath is None:
        output_filepath = create_image_cache_filename(
            input_filepath, *args, **kwargs
        )

    if os.path.exists(output_filepath):
        return output_filepath

    if office_converter:
        try:
            office_converter.convert(input_filepath, mimetype=mimetype)
            if office_converter.exists:
                input_filepath = office_converter.output_filepath
                mimetype = 'application/pdf'
            else:
                # Recycle the already detected mimetype
                mimetype = office_converter.mimetype
        except OfficeConversionError:
            raise UnknownFileFormat('office converter exception')

    if size:
        # size is a string split on DIMENSION_SEPARATOR into width and,
        # when present, height
        transformations.append(
            {
                'transformation': TRANSFORMATION_RESIZE,
                'arguments': dict(
                    zip(
                        [u'width', u'height'],
                        size.split(DIMENSION_SEPARATOR)
                    )
                )
            }
        )

    if zoom != 100:
        transformations.append(
            {
                'transformation': TRANSFORMATION_ZOOM,
                'arguments': {'percent': zoom}
            }
        )

    if rotation != 0 and rotation != 360:
        transformations.append(
            {
                'transformation': TRANSFORMATION_ROTATE,
                'arguments': {'degrees': rotation}
            }
        )

    try:
        backend.convert_file(
            input_filepath=input_filepath, output_filepath=output_filepath,
            transformations=transformations, page=page,
            file_format=file_format, mimetype=mimetype
        )
    finally:
        if cleanup_files:
            fs_cleanup(input_filepath)

    return output_filepath