def do_document_ocr(queue_document):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling tesseract

    Every intermediate file of the OCR fallback is tracked as soon as it
    is known and removed in a ``finally`` clause, so a failure in any
    step (conversion, unpaper, rename, tesseract, save) does not leave
    files behind. The exception itself still propagates to the caller.
    """
    for document_page in queue_document.document.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR
            # The second element of the returned pair is not used here.
            ocr_transformations, _warnings = queue_document.get_transformation_list()

            # Paths to remove once this page is done, in creation order.
            temporary_files = []
            try:
                document_filepath = document_page.document.get_image_cache_name(
                    page=document_page.page_number,
                    version=document_page.document_version.pk)
                temporary_files.append(document_filepath)

                unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (
                    document_page.document.uuid,
                    document_page.page_number,
                    os.extsep,
                    UNPAPER_FILE_FORMAT)
                unpaper_output_filepath = os.path.join(
                    TEMPORARY_DIRECTORY, unpaper_output_filename)

                unpaper_input = convert(
                    document_filepath,
                    file_format=UNPAPER_FILE_FORMAT,
                    transformations=ocr_transformations)
                temporary_files.append(unpaper_input)

                # Tracked before unpaper runs so a partially written
                # output is removed as well.
                temporary_files.append(unpaper_output_filepath)
                execute_unpaper(
                    input_filepath=unpaper_input,
                    output_filepath=unpaper_output_filepath)

                # Convert to TIFF
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output_filepath,
                    file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_files.append(pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                temporary_files.append(pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = run_tesseract(
                    pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
                document_page.content = ocr_cleanup(ocr_text)
                document_page.page_label = _(u'Text from OCR')
                document_page.save()
            finally:
                for temporary_file in temporary_files:
                    # Some tracked paths may not exist (the pre-rename
                    # name, or an output that was never written); how
                    # cleanup() treats missing files is not visible
                    # here, so only existing paths are handed to it.
                    if os.path.exists(temporary_file):
                        cleanup(temporary_file)
def do_document_ocr(document_version):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling the corresponding
    OCR backend

    Every intermediate file of the OCR fallback is tracked as soon as it
    is known and removed in a ``finally`` clause, so a failure in any
    step (conversion, unpaper, rename, OCR backend, save) does not leave
    files behind. The exception itself still propagates to the caller.
    """
    for document_page in document_version.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR

            # Paths to remove once this page is done, in creation order.
            temporary_files = []
            try:
                document_filepath = document_page.document.get_image_cache_name(
                    page=document_page.page_number,
                    version=document_page.document_version.pk)
                temporary_files.append(document_filepath)
                logger.debug('document_filepath: %s', document_filepath)

                unpaper_input = convert(
                    document_filepath, file_format=UNPAPER_FILE_FORMAT)
                temporary_files.append(unpaper_input)
                logger.debug('unpaper_input: %s', unpaper_input)

                unpaper_output = execute_unpaper(input_filepath=unpaper_input)
                temporary_files.append(unpaper_output)
                logger.debug('unpaper_output: %s', unpaper_output)

                # Convert to TIFF
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output,
                    file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_files.append(pre_ocr_filepath)
                logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                temporary_files.append(pre_ocr_filepath_w_ext)
                logger.debug(
                    'pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = ocr_backend.execute(
                    pre_ocr_filepath_w_ext,
                    document_version.document.language)
                document_page.content = ocr_cleanup(
                    document_version.document.language, ocr_text)
                document_page.page_label = _('Text from OCR')
                document_page.save()
            finally:
                for temporary_file in temporary_files:
                    # The pre-rename name no longer exists after a
                    # successful rename; how fs_cleanup() treats missing
                    # files is not visible here, so only existing paths
                    # are handed to it.
                    if os.path.exists(temporary_file):
                        fs_cleanup(temporary_file)
def do_document_ocr(queue_document):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling tesseract

    Every intermediate file of the OCR fallback is tracked as soon as it
    is known and removed in a ``finally`` clause, so a failure in any
    step (conversion, unpaper, rename, tesseract, save) does not leave
    files behind. The exception itself still propagates to the caller.
    """
    for document_page in queue_document.document.documentpage_set.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR
            # The second element of the returned pair is not used here.
            ocr_transformations, _warnings = queue_document.get_transformation_list()

            # Paths to remove once this page is done, in creation order.
            temporary_files = []
            try:
                document_filepath = document_page.document.get_image_cache_name(
                    page=document_page.page_number)
                temporary_files.append(document_filepath)

                unpaper_output_filename = u"%s_unpaper_out_page_%s%s%s" % (
                    document_page.document.uuid,
                    document_page.page_number,
                    os.extsep,
                    UNPAPER_FILE_FORMAT,
                )
                unpaper_output_filepath = os.path.join(
                    TEMPORARY_DIRECTORY, unpaper_output_filename)

                unpaper_input = convert(
                    document_filepath,
                    file_format=UNPAPER_FILE_FORMAT,
                    transformations=ocr_transformations,
                )
                temporary_files.append(unpaper_input)

                # Tracked before unpaper runs so a partially written
                # output is removed as well.
                temporary_files.append(unpaper_output_filepath)
                execute_unpaper(
                    input_filepath=unpaper_input,
                    output_filepath=unpaper_output_filepath)

                # Convert to TIFF
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output_filepath,
                    file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_files.append(pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                temporary_files.append(pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = run_tesseract(
                    pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
                document_page.content = ocr_cleanup(ocr_text)
                document_page.page_label = _(u"Text from OCR")
                document_page.save()
            finally:
                for temporary_file in temporary_files:
                    # Some tracked paths may not exist (the pre-rename
                    # name, or an output that was never written); how
                    # cleanup() treats missing files is not visible
                    # here, so only existing paths are handed to it.
                    if os.path.exists(temporary_file):
                        cleanup(temporary_file)
def do_document_ocr(document_version):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling the corresponding
    OCR backend

    Every intermediate file of the OCR fallback is tracked as soon as it
    is known and removed in a ``finally`` clause, so a failure in any
    step (conversion, unpaper, rename, OCR backend, save) does not leave
    files behind. The exception itself still propagates to the caller.
    """
    for document_page in document_version.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR

            # Paths to remove once this page is done, in creation order.
            temporary_files = []
            try:
                document_filepath = document_page.document.get_image_cache_name(
                    page=document_page.page_number,
                    version=document_page.document_version.pk)
                temporary_files.append(document_filepath)
                logger.debug('document_filepath: %s', document_filepath)

                unpaper_input = convert(
                    document_filepath, file_format=UNPAPER_FILE_FORMAT)
                temporary_files.append(unpaper_input)
                logger.debug('unpaper_input: %s', unpaper_input)

                unpaper_output = execute_unpaper(input_filepath=unpaper_input)
                temporary_files.append(unpaper_output)
                logger.debug('unpaper_output: %s', unpaper_output)

                # Convert to TIFF
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output,
                    file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_files.append(pre_ocr_filepath)
                logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                temporary_files.append(pre_ocr_filepath_w_ext)
                logger.debug(
                    'pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = ocr_backend.execute(
                    pre_ocr_filepath_w_ext,
                    document_version.document.language)
                document_page.content = ocr_cleanup(
                    document_version.document.language, ocr_text)
                document_page.page_label = _('Text from OCR')
                document_page.save()
            finally:
                for temporary_file in temporary_files:
                    # The pre-rename name no longer exists after a
                    # successful rename; how fs_cleanup() treats missing
                    # files is not visible here, so only existing paths
                    # are handed to it.
                    if os.path.exists(temporary_file):
                        fs_cleanup(temporary_file)
def get_image_cache_name(self, page):
    """
    Return the cached image file for ``page``, creating it when missing.

    The cache path and the transformations come from
    ``self.get_cached_image_name(page)``. When that path already exists
    it is returned as is; otherwise the document is saved to the
    temporary directory and the result of ``convert()`` writing to the
    cache path is returned.
    """
    cached_path, page_transformations = self.get_cached_image_name(page)

    if not os.path.exists(cached_path):
        # Cache miss: render the page from a temporary copy of the document
        temporary_copy = document_save_to_temp_dir(self, self.checksum)
        return convert(
            temporary_copy,
            output_filepath=cached_path,
            page=page,
            transformations=page_transformations)

    return cached_path
def get_image(self, size, transformations):
    """
    Return the result of converting ``self.filepath`` at ``size`` with
    the given ``transformations``, falling back to an icon on failure.

    An ``UnknownFileFormat`` error yields the icon for the file's
    mimetype; an ``UnkownConvertError`` yields the generic error icon.
    Any other exception propagates.
    """
    try:
        converted = convert(
            self.filepath,
            size=size,
            cleanup_files=False,
            transformations=transformations)
    except UnknownFileFormat:
        return get_icon_file_path(get_mimetype(self.filepath))
    except UnkownConvertError:
        return get_error_icon_file_path()
    else:
        return converted
def get_valid_image(self, size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, version=None):
    """
    Return the result of converting the cached image of ``page`` using
    the requested ``size``, ``zoom`` and ``rotation``.

    ``version`` is the primary key of the document version to render;
    when it is falsy the primary key of ``self.latest_version`` is used.
    Conversion errors are not handled here and propagate to the caller.
    """
    if not version:
        version = self.latest_version.pk

    image_cache_name = self.get_image_cache_name(page=page, version=version)
    # Pass the value as a logging argument so the message is only
    # formatted when debug logging is enabled.
    logger.debug('image_cache_name: %s', image_cache_name)
    return convert(image_cache_name, cleanup_files=False, size=size, zoom=zoom, rotation=rotation)
def get_image_cache_name(self, page, version):
    """
    Return the cached image file for ``page`` of the document version
    whose primary key is ``version``, creating it when missing.

    The cache path and the transformations come from
    ``self.get_cached_image_name(page, version)``. When that path
    already exists it is returned as is; otherwise the version's file is
    saved to the temporary directory and the result of ``convert()``
    writing to the cache path is returned.
    """
    cached_path, page_transformations = self.get_cached_image_name(page, version)

    if not os.path.exists(cached_path):
        # Cache miss: render the page from a temporary copy of the version
        stored_version = DocumentVersion.objects.get(pk=version)
        temporary_copy = document_save_to_temp_dir(
            stored_version, stored_version.checksum)
        return convert(
            temporary_copy,
            output_filepath=cached_path,
            page=page,
            transformations=page_transformations,
            mimetype=self.file_mimetype)

    return cached_path
def get_image(self, size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION):
    """
    Return the result of converting the cached image of ``page`` using
    the requested ``size``, ``zoom`` and ``rotation``, falling back to
    an icon when that fails.

    An ``UnknownFileFormat`` error yields the icon for
    ``self.file_mimetype``; any other ``Exception`` yields the generic
    error icon. ``KeyboardInterrupt`` and ``SystemExit`` are not
    intercepted.
    """
    try:
        image_cache_name = self.get_image_cache_name(page=page)
        return convert(image_cache_name, cleanup_files=False, size=size, zoom=zoom, rotation=rotation)
    except UnknownFileFormat:
        return get_icon_file_path(self.file_mimetype)
    except UnkownConvertError:
        return get_error_icon_file_path()
    except Exception:
        # Best-effort fallback for any other failure. Narrowed from a
        # bare ``except:`` so interpreter-exit and interrupt signals
        # still propagate.
        return get_error_icon_file_path()
def get_image(self, size, page, zoom, rotation, as_base64=True):
    """
    Convert the file at ``self.get_full_path()`` to ``size`` and return
    it either as a ``data:`` URI or as the converted file path.

    ``page``, ``zoom`` and ``rotation`` are accepted but not used by
    this implementation.

    When ``as_base64`` is true the converted file is read in binary
    mode, base64 encoded and returned as
    ``data:<mimetype>;base64,<data>``; otherwise the value returned by
    ``convert()`` is returned unchanged.
    """
    # TODO: add support for transformations
    converted_file_path = convert(self.get_full_path(), size=size)

    if not as_base64:
        return converted_file_path

    # Image data is binary: open with 'rb' and let ``with`` close each
    # handle even when an error is raised.
    with open(converted_file_path, 'rb') as descriptor:
        mimetype = get_mimetype(descriptor, converted_file_path, mimetype_only=True)[0]

    with open(converted_file_path, 'rb') as image:
        base64_data = base64.b64encode(image.read())

    # b64encode output is pure ASCII; decoding keeps the URI a text
    # string on interpreters where b64encode returns bytes.
    return u'data:%s;base64,%s' % (mimetype, base64_data.decode('ascii'))
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
    """
    Serve a JPEG rendering of one page of a document.

    The page number is read from the ``page`` query parameter (default
    1); a non-integer value makes ``int()`` raise ``ValueError``. The
    transformations stored for that page are joined into an options
    string. A cached image is served when ``in_image_cache()`` returns a
    path; otherwise the document is saved to a temporary location and
    converted. When the conversion raises ``UnkownConvertError`` an
    error picture chosen by ``size`` is served instead.
    """
    check_permissions(request.user, "documents", [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)

    page = int(request.GET.get("page", 1))
    transformation_list = []
    try:
        # Catch invalid or non existing pages
        document_page = DocumentPage.objects.get(document=document, page_number=page)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # SECURITY NOTE(review): eval() runs the stored
                    # arguments string as Python code. Confirm that only
                    # trusted users can write this field, or replace it
                    # with a literal-only parser.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(
                        page_transformation.arguments
                    )
                    transformation_list.append(output)
            except Exception as e:
                # A broken transformation is skipped; only staff users
                # are told about it.
                if request.user.is_staff:
                    messages.warning(
                        request,
                        _(u"Error for transformation %(transformation)s:, %(error)s")
                        % {"transformation": page_transformation.get_transformation_display(), "error": e},
                    )
    except ObjectDoesNotExist:
        pass

    tranformation_string = " ".join(transformation_list)
    try:
        filepath = in_image_cache(
            document.checksum, size=size, quality=quality, extra_options=tranformation_string, page=page - 1
        )
        if filepath:
            # Images are binary data, hence 'rb'
            return serve_file(request, File(file=open(filepath, "rb")), content_type="image/jpeg")
        # Save to a temporary location
        filepath = document_save_to_temp_dir(document, filename=document.checksum)
        output_file = convert(
            filepath, size=size, format="jpg", quality=quality, extra_options=tranformation_string, page=page - 1
        )
        return serve_file(request, File(file=open(output_file, "rb")), content_type="image/jpeg")
    except UnkownConvertError as e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        if size == THUMBNAIL_SIZE:
            return serve_file(request, File(file=open("%simages/%s" % (settings.MEDIA_ROOT, PICTURE_ERROR_SMALL), "rb")))
        else:
            return serve_file(
                request, File(file=open("%simages/%s" % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), "rb"))
            )
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
    """
    Serve a JPEG rendering of one page of a document.

    The page number is read from the ``page`` query parameter (default
    1); a non-integer value makes ``int()`` raise ``ValueError``. The
    transformations stored for that page are joined into an options
    string. A cached image is served when ``in_image_cache()`` returns a
    path; otherwise the document is saved to a temporary location and
    converted. When the conversion raises ``UnkownConvertError`` an
    error picture chosen by ``size`` is served instead.
    """
    check_permissions(request.user, 'documents', [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)

    page = int(request.GET.get('page', 1))
    transformation_list = []
    try:
        # Catch invalid or non existing pages
        document_page = DocumentPage.objects.get(document=document, page_number=page)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # SECURITY NOTE(review): eval() runs the stored
                    # arguments string as Python code. Confirm that only
                    # trusted users can write this field, or replace it
                    # with a literal-only parser.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # A broken transformation is skipped; only staff users
                # are told about it.
                if request.user.is_staff:
                    messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') %
                        {'transformation': page_transformation.get_transformation_display(),
                        'error': e})
    except ObjectDoesNotExist:
        pass

    tranformation_string = ' '.join(transformation_list)
    try:
        filepath = in_image_cache(document.checksum, size=size, quality=quality, extra_options=tranformation_string, page=page - 1)
        if filepath:
            # Images are binary data, hence 'rb'
            return serve_file(request, File(file=open(filepath, 'rb')), content_type='image/jpeg')
        # Save to a temporary location
        filepath = document_save_to_temp_dir(document, filename=document.checksum)
        output_file = convert(filepath, size=size, format='jpg', quality=quality, extra_options=tranformation_string, page=page - 1)
        return serve_file(request, File(file=open(output_file, 'rb')), content_type='image/jpeg')
    except UnkownConvertError as e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        if size == THUMBNAIL_SIZE:
            return serve_file(request, File(file=open('%simages/%s' % (settings.MEDIA_ROOT, PICTURE_ERROR_SMALL), 'rb')))
        else:
            return serve_file(request, File(file=open('%simages/%s' % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), 'rb')))
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
    """
    Serve a JPEG rendering of one page of a document.

    The page number is read from the ``page`` query parameter (default
    1); a non-integer value makes ``int()`` raise ``ValueError``. The
    transformations stored for that page are joined into an options
    string. A cached image is served when ``in_image_cache()`` returns a
    path; otherwise the document is saved to a temporary location and
    converted. When the conversion raises ``UnkownConvertError`` a fixed
    error picture chosen by ``size`` is served instead.
    """
    check_permissions(request.user, 'documents', [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)

    page = int(request.GET.get('page', 1))
    transformation_list = []
    try:
        # Catch invalid or non existing pages
        document_page = DocumentPage.objects.get(document=document, page_number=page)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # SECURITY NOTE(review): eval() runs the stored
                    # arguments string as Python code. Confirm that only
                    # trusted users can write this field, or replace it
                    # with a literal-only parser.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # A broken transformation is skipped; only staff users
                # are told about it.
                if request.user.is_staff:
                    messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') %
                        {'transformation': page_transformation.get_transformation_display(),
                        'error': e})
    except ObjectDoesNotExist:
        pass

    tranformation_string = ' '.join(transformation_list)
    try:
        filepath = in_image_cache(document.checksum, size=size, quality=quality, extra_options=tranformation_string, page=page - 1)
        if filepath:
            # Images are binary data, hence 'rb'
            return serve_file(request, File(file=open(filepath, 'rb')), content_type='image/jpeg')
        # Save to a temporary location
        filepath = document_save_to_temp_dir(document, filename=document.checksum)
        output_file = convert(filepath, size=size, format='jpg', quality=quality, extra_options=tranformation_string, page=page - 1)
        return serve_file(request, File(file=open(output_file, 'rb')), content_type='image/jpeg')
    except UnkownConvertError as e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        if size == THUMBNAIL_SIZE:
            return serve_file(request, File(file=open('%simages/picture_error.png' % settings.MEDIA_ROOT, 'rb')))
        else:
            return serve_file(request, File(file=open('%simages/1297211435_error.png' % settings.MEDIA_ROOT, 'rb')))
transformation_list.append(output) except Exception, e: if request.user.is_staff: messages.warning( request, _(u"Error for transformation %(transformation)s:, %(error)s") % {"transformation": page_transformation.get_transformation_display(), "error": e}, ) else: pass tranformation_string = " ".join(transformation_list) try: filepath = StagingFile.get(staging_file_id).filepath output_file = convert( filepath, size=STAGING_FILES_PREVIEW_SIZE, extra_options=tranformation_string, cleanup_files=False ) return serve_file(request, File(file=open(output_file, "r")), content_type="image/jpeg") except UnkownConvertError, e: if request.user.is_staff or request.user.is_superuser: messages.error(request, e) return serve_file(request, File(file=open(u"%simages/%s" % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), "r"))) except UnknownFormat: return serve_file(request, File(file=open(u"%simages/%s" % (settings.MEDIA_ROOT, PICTURE_UNKNOWN_MEDIUM), "r"))) except Exception, e: if request.user.is_staff or request.user.is_superuser: messages.error(request, e) return serve_file(request, File(file=open(u"%simages/%s" % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), "r"))) # TODO: Need permission
try: if page_transformation['name'] in TRANFORMATION_CHOICES: output = TRANFORMATION_CHOICES[page_transformation['name']] % eval(page_transformation['arguments']) transformation_list.append(output) except Exception, e: if request.user.is_staff: messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') % {'transformation':page_transformation.get_transformation_display(), 'error':e}) else: pass tranformation_string = ' '.join(transformation_list) try: filepath = StagingFile.get(staging_file_id).filepath output_file = convert(filepath, size=STAGING_FILES_PREVIEW_SIZE, extra_options=tranformation_string, cleanup_files=False) return serve_file(request, File(file=open(output_file, 'r')), content_type='image/jpeg') except UnkownConvertError, e: if request.user.is_staff or request.user.is_superuser: messages.error(request, e) return serve_file(request, File(file=open(u'%simages/%s' % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), 'r'))) except UnknownFormat: return serve_file(request, File(file=open(u'%simages/%s' % (settings.MEDIA_ROOT, PICTURE_UNKNOWN_MEDIUM), 'r'))) except Exception, e: if request.user.is_staff or request.user.is_superuser: messages.error(request, e) return serve_file(request, File(file=open(u'%simages/%s' % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), 'r'))) #TODO: Need permission def staging_file_delete(request, staging_file_id):
def get_valid_image(self, size=THUMBNAIL_SIZE, transformations=None):
    """
    Return the result of converting ``self.filepath`` at ``size`` with
    the given ``transformations``.

    No exception is handled here; conversion errors propagate to the
    caller.
    """
    conversion_options = {
        'size': size,
        'cleanup_files': False,
        'transformations': transformations,
    }
    return convert(self.filepath, **conversion_options)
def get_valid_image(self, size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION):
    """
    Return the result of converting the cached image of ``page`` using
    the requested ``size``, ``zoom`` and ``rotation``.

    No exception is handled here; conversion errors propagate to the
    caller.
    """
    cached_page_image = self.get_image_cache_name(page=page)
    render_options = {
        'cleanup_files': False,
        'size': size,
        'zoom': zoom,
        'rotation': rotation,
    }
    return convert(cached_page_image, **render_options)
def preview(self):
    """
    Build a preview of ``self.filepath`` with the default
    transformations applied.

    Returns a 2-tuple: the value returned by ``convert()`` and the
    errors reported by ``get_transformation_string()``.
    """
    options_string, errors = get_transformation_string(DEFAULT_TRANSFORMATIONS)
    return convert(
        self.filepath,
        size=STAGING_FILES_PREVIEW_SIZE,
        extra_options=options_string,
        cleanup_files=False), errors