def create_page(processor, parent_asset, document, pdf_orig_path, position):
    """
    Turn a single-page PDF into a full-size JPEG and a thumbnail JPEG.

    A page is created for ``document`` at ``position``.  The PDF is converted
    with ``pdf.convert`` and a thumbnail (sized by ``settings.THUMBNAIL_SIZE``)
    is saved beside the PDF as ``<base>-thumbnail.jpeg``.

    Returns a list of three assets, in this order: the original PDF, the
    full-size JPEG and the thumbnail JPEG.  Each one is attached to the new
    page, has ``parent_asset`` as parent and uses the page position as its
    child number.
    """
    page = operations.create_page(document, position)

    # Derive the output paths from the PDF path.
    stem = os.path.splitext(pdf_orig_path)[0]
    jpeg_path = pdf.convert(pdf_orig_path, 'jpeg')
    thumb_path = '%s-thumbnail.jpeg' % stem

    # The thumbnail is produced from the converted JPEG, not from the PDF.
    full_image = image.load(jpeg_path)
    small_image = image.thumbnail(full_image, settings.THUMBNAIL_SIZE)
    image.save(small_image, thumb_path)

    # (asset class, file, mime type) for every asset handed back to the caller.
    variants = (
        # The original full-res page as a PDF
        (models.AssetClass.PAGE_ORIGINAL, pdf_orig_path, models.MimeType.PDF),
        # The full-res page as a JPEG
        (models.AssetClass.PAGE_IMAGE, jpeg_path, models.MimeType.JPEG),
        # The thumbnail as a JPEG
        (models.AssetClass.PAGE_THUMBNAIL, thumb_path, models.MimeType.JPEG),
    )

    return [
        operations.create_asset_from_file(
            owner=document.owner,
            producer=processor,
            asset_class=asset_class,
            file_name=file_name,
            related_page=page,
            parent=parent_asset,
            child_number=page.position,
            mime_type=mime_type)
        for asset_class, file_name, mime_type in variants
    ]
def on_savemessage(accountname, username, message_file, flags, label):
    """
    Hook called when a mail message has been saved to ``message_file``.

    Creates an UPLOAD asset (mime type MAIL) owned by the user named
    ``username``, appends the account name, label and flags to the asset's
    ``orig_file_name``, saves it and publishes it as a work item.

    Returns None.
    """
    # NOTE(review): the "account=" field below is filled with ``username``,
    # not ``accountname`` -- confirm that this is intended.
    print('ON_SAVEMESSAGE: account=%s message file=%s flags=%s' % (
        username, message_file, flags))

    # Processor identity; the module name is hard-coded rather than derived
    # from this file's name.
    module_name = 'mail_parser'
    default_inputs = [AssetClass.UPLOAD]
    default_outputs = [AssetClass.MESSAGE_PART]
    default_accepted_mime_types = [MimeType.MAIL]

    owner = User.objects.get(username=username)
    # Only the first element of the initialize_processor() result is used.
    processor = operations.initialize_processor(
        module_name,
        default_inputs,
        default_outputs,
        default_accepted_mime_types)[0]

    new_item = operations.create_asset_from_file(
        file_name=message_file,
        owner=owner,
        producer=processor,
        asset_class=AssetClass.UPLOAD,
        child_number=0,
        mime_type=MimeType.MAIL)

    # Every flag is prefixed with a comma, so the first flag follows
    # "FLAGS=" after a comma as well.
    suffix_parts = [
        ',ACCOUNT=%s' % accountname,
        ',LABEL=%s' % label,
        ',FLAGS=',
    ]
    suffix_parts.extend(',' + flag for flag in flags)
    new_item.orig_file_name += ''.join(suffix_parts)

    new_item.save()
    operations.publish_work_item(new_item)
def on_savemessage(accountname, username, message_file, flags, label):
    """
    Queue a freshly saved mail message for processing.

    The message at ``message_file`` becomes an UPLOAD asset with mime type
    MAIL, owned by the user looked up from ``username``.  The account name,
    label and flags are appended to the asset's ``orig_file_name`` before the
    asset is saved and published as a work item.

    Returns None.
    """
    # NOTE(review): ``username`` is what gets printed after "account=";
    # ``accountname`` is not logged -- confirm that this is intended.
    log_line = 'ON_SAVEMESSAGE: account=%s message file=%s flags=%s' % (
        username, message_file, flags)
    print(log_line)

    # Fixed processor description (the name is not derived from __file__).
    module_name = 'mail_parser'
    inputs = [AssetClass.UPLOAD]
    outputs = [AssetClass.MESSAGE_PART]
    accepted_mime_types = [MimeType.MAIL]

    owner = User.objects.get(username=username)
    # initialize_processor() returns a sequence; its first element is used
    # as the producer of the new asset.
    processor = operations.initialize_processor(
        module_name, inputs, outputs, accepted_mime_types)[0]

    new_item = operations.create_asset_from_file(
        file_name=message_file,
        owner=owner,
        producer=processor,
        asset_class=AssetClass.UPLOAD,
        child_number=0,
        mime_type=MimeType.MAIL)

    # Resulting suffix: ",ACCOUNT=<a>,LABEL=<l>,FLAGS=" followed by ",<flag>"
    # for each flag.
    flag_suffix = ''.join(',' + flag for flag in flags)
    new_item.orig_file_name += ''.join([
        ',ACCOUNT=%s' % accountname,
        ',LABEL=%s' % label,
        ',FLAGS=',
        flag_suffix,
    ])

    new_item.save()
    operations.publish_work_item(new_item)
def upload_file(processor, user, local_path):
    """
    Create an UPLOAD asset from ``local_path`` and publish it as a work item.

    The asset is owned by ``user``, produced by ``processor`` and given child
    number 0.  Returns None.

    NOTE(review): the upload was originally described as transactional; any
    transaction handling would have to live in the ``operations`` helpers --
    confirm.
    """
    upload_asset = operations.create_asset_from_file(
        file_name=local_path,
        owner=user,
        producer=processor,
        child_number=0,
        asset_class=models.AssetClass.UPLOAD)
    operations.publish_work_item(upload_asset)
def upload_file(processor, user, local_path):
    """
    Register the file at ``local_path`` as a new upload and queue it.

    An UPLOAD asset (child number 0) owned by ``user`` and produced by
    ``processor`` is created from the file and then handed to
    ``operations.publish_work_item``.  Returns None.

    NOTE(review): the original docstring called this transactional; nothing
    in this function sets up a transaction itself -- confirm where that
    happens.
    """
    asset_fields = {
        'file_name': local_path,
        'owner': user,
        'producer': processor,
        'child_number': 0,
        'asset_class': models.AssetClass.UPLOAD,
    }
    new_upload = operations.create_asset_from_file(**asset_fields)
    operations.publish_work_item(new_upload)
def handle_work_item(processor, item):
    """
    Split a (possibly multi-page) PDF upload into per-page PDF assets.

    ``item`` supplies the upload asset (``'Asset-Instance'``) and the path of
    its local copy (``'Local-Path'``).  The PDF is split into ``page-*.pdf``
    files next to the local copy.

    * If the upload has no PAGE_ORIGINAL children yet, a new document is
      created, and for every page file a page plus a PAGE_ORIGINAL asset is
      created.
    * Otherwise the existing child asset at each position gets the page file
      uploaded again.

    Returns a list with one asset per page followed by the document's
    DOCUMENT/BINARY asset.  When the upload is being re-processed and no page
    files were produced, the document cannot be determined and only the
    (empty) page list is returned.
    """
    upload = item['Asset-Instance']
    local_path = item['Local-Path']
    page_prefix = os.path.join(os.path.dirname(local_path), 'page-')

    pdf.split_pages(local_path, page_prefix)

    first_pass = (
        upload.get_children(models.AssetClass.PAGE_ORIGINAL).count() == 0)
    document = None
    if first_pass:
        document = operations.create_document(
            upload.owner,
            title='Uploaded on %s (%s)' % (
                upload.date_created, upload.producer.process))

    # Lexicographic order; assumes split_pages names the files so that this
    # matches page order -- TODO confirm.
    page_files = sorted(glob.glob('%s*.pdf' % page_prefix))

    asset_list = []
    for position, page_pdf_path in enumerate(page_files, 1):
        if first_pass:
            # Bind the asset itself.  A trailing comma after this call would
            # turn it into a 1-tuple and make the returned list a mix of
            # tuples and assets.
            page_asset = operations.create_asset_from_file(
                owner=document.owner,
                producer=processor,
                asset_class=models.AssetClass.PAGE_ORIGINAL,
                file_name=page_pdf_path,
                related_page=operations.create_page(document, position),
                parent=upload,
                child_number=position,
                mime_type=models.MimeType.PDF)
        else:
            # The page asset already exists: refresh its file.
            page_asset = upload.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)
            if document is None:
                # No document was created on this pass, so recover it from
                # an existing page instead of dereferencing None below.
                document = page_asset.related_page.document
        asset_list.append(page_asset)

    if document is not None:
        asset_list.append(
            document.assets.get(
                asset_class__name=models.AssetClass.DOCUMENT,
                mime_type__name=models.MimeType.BINARY))

    return asset_list
def handle_work_item(processor, item):
    """
    Pick up a (possibly) multipage PDF upload and turn it into a document
    having (possibly) multiple individual pages.

    The upload asset is read from ``item['Asset-Instance']`` and its local
    file from ``item['Local-Path']``; the split pages are written as
    ``page-*.pdf`` into the same directory.

    On the first pass (the upload has no PAGE_ORIGINAL children) a document
    is created together with one page and one PAGE_ORIGINAL asset per page
    file.  On later passes the existing child asset for each position has its
    file uploaded again.

    Returns the page assets, in page-file order, followed by the document's
    DOCUMENT/BINARY asset.  If the document cannot be determined (a later
    pass that produced no page files) only the page assets are returned.
    """
    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    work_dir = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')

    pdf.split_pages(local_path, page_prefix)

    existing_pages = asset.get_children(models.AssetClass.PAGE_ORIGINAL)
    create_pages = existing_pages.count() == 0

    if create_pages:
        title = 'Uploaded on %s (%s)' % (
            asset.date_created, asset.producer.process)
        document = operations.create_document(asset.owner, title=title)
    else:
        # Filled in from the first existing page asset inside the loop.
        document = None

    # Plain string sort; presumably split_pages produces names that sort in
    # page order -- verify against pdf.split_pages.
    all_page_files = sorted(glob.glob('%s*.pdf' % page_prefix))

    asset_list = []
    for position, page_pdf_path in enumerate(all_page_files, 1):
        if create_pages:
            page = operations.create_page(document, position)
            # Assigned without a trailing comma, so this is the asset and
            # not a 1-tuple wrapping it.
            page_asset = operations.create_asset_from_file(
                owner=document.owner,
                producer=processor,
                asset_class=models.AssetClass.PAGE_ORIGINAL,
                file_name=page_pdf_path,
                related_page=page,
                parent=asset,
                child_number=position,
                mime_type=models.MimeType.PDF)
        else:
            page_asset = asset.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)
            if document is None:
                # Without this, the lookup after the loop would be made on
                # None and raise AttributeError.
                document = page_asset.related_page.document
        asset_list.append(page_asset)

    if document is None:
        return asset_list

    asset_list.append(
        document.assets.get(
            asset_class__name=models.AssetClass.DOCUMENT,
            mime_type__name=models.MimeType.BINARY))
    return asset_list
def _enqueue_message(self, message_file, accountname, label, flags):
    """
    Create an UPLOAD asset for a saved mail message and publish it.

    The asset is owned by ``self.user`` and produced by ``self.processor``.
    Account name, label and flags are appended to its ``orig_file_name``
    before it is saved and published as a work item.  Returns None.
    """
    new_item = operations.create_asset_from_file(
        file_name=message_file,
        owner=self.user,
        producer=self.processor,
        asset_class=AssetClass.UPLOAD,
        child_number=0,
        mime_type=MimeType.MAIL)

    # Every flag is prefixed with a comma, so the first flag follows
    # "FLAGS=" after a comma as well.
    suffix_parts = [
        ',ACCOUNT=%s' % accountname,
        ',LABEL=%s' % label,
        ',FLAGS=',
    ]
    suffix_parts.extend(',' + flag for flag in flags)
    new_item.orig_file_name += ''.join(suffix_parts)

    new_item.save()
    operations.publish_work_item(new_item)
def _enqueue_message(self, message_file, accountname, label, flags):
    """
    Queue the mail message stored at ``message_file`` as a work item.

    An UPLOAD asset with mime type MAIL is created for ``self.user`` with
    ``self.processor`` as producer.  Its ``orig_file_name`` is extended with
    ",ACCOUNT=<accountname>,LABEL=<label>,FLAGS=" followed by ",<flag>" for
    each entry of ``flags``; the asset is then saved and published.
    Returns None.
    """
    asset_fields = {
        'file_name': message_file,
        'owner': self.user,
        'producer': self.processor,
        'asset_class': AssetClass.UPLOAD,
        'child_number': 0,
        'mime_type': MimeType.MAIL,
    }
    new_item = operations.create_asset_from_file(**asset_fields)

    flag_suffix = ''.join(',' + flag for flag in flags)
    new_item.orig_file_name += ''.join([
        ',ACCOUNT=%s' % accountname,
        ',LABEL=%s' % label,
        ',FLAGS=',
        flag_suffix,
    ])

    new_item.save()
    operations.publish_work_item(new_item)
def process_mail(owner, processor, local_path):
    """
    Wrap an email message stored at ``local_path`` in an UPLOAD asset.

    The message is assumed to be on the local file system already.  Returns
    a one-element list holding the new asset (mime type MAIL, child number 0,
    no parent asset passed).
    """
    mail_asset = operations.create_asset_from_file(
        file_name=local_path,
        owner=owner,
        producer=processor,
        asset_class=AssetClass.UPLOAD,
        child_number=0,
        mime_type=MimeType.MAIL)
    return [mail_asset]
def handle_work_item(processor, item):
    """
    Run OCR on a page work item and return the follow-up work it produces.

    The work item is provided, and its local temp directory cleaned up, by
    the process driver framework.  If this function does not raise, the work
    item is also removed from the work queue.

    Returns a list containing the new PAGE_TEXT (HTML) asset and, when the
    item is not new, the owning document's DOCUMENT/BINARY asset.  Returns
    None when OCR fails (``OCRFailed`` is logged and swallowed).

    Raises ``Exception("Insufficient account balance")`` when the owner is
    active and the billable event is not accepted.
    """
    try:
        parent_asset = item['Asset-Instance']
        owner = item['Owner']

        page_text = operations.create_asset_from_file(
            owner=owner,
            producer=processor,
            asset_class=models.AssetClass.PAGE_TEXT,
            file_name=image_to_html(item['Local-Path']),
            related_page=parent_asset.related_page,
            parent=parent_asset,
            child_number=1,
            mime_type=models.MimeType.HTML)
        new_work = [page_text]

        if not item['Is-New']:
            document = parent_asset.related_page.document
            new_work.append(
                document.assets.get(
                    asset_class=models.AssetClass.DOCUMENT,
                    mime_type=models.MimeType.BINARY))

        # Available credit is not checked for inactive accounts.
        # Assumption: an account can only be inactive while its trial is in
        # progress.
        if owner.is_active:
            charged = donomo.billing.models.process_billable_event(
                owner, 'ocr.ocropus.page')
            if not charged:
                raise Exception("Insufficient account balance")

        return new_work
    except OCRFailed:
        logging.warning('OCR failed, dropping from processing chain')
        return None
def test_create_asset_from_file(self):
    """
    Create an asset from a temporary file, publish it, and validate that the
    consumer sees the same data.

    The temporary file is removed once the asset has been created, before
    the asset is published.
    """
    handle, temp_path = tempfile.mkstemp()
    try:
        # Write the payload and always release the descriptor.
        try:
            os.write(handle, TEST_DATA)
        finally:
            os.close(handle)

        asset = operations.create_asset_from_file(
            owner=self.user,
            producer=self.producer,
            asset_class='test_data',
            file_name=temp_path,
            child_number=0,
            mime_type='text/plain')
        self.assertTrue(asset is not None)
    finally:
        os.remove(temp_path)

    operations.publish_work_item(asset)
    self._validate_consumer(asset, TEST_DATA)
def handle_page(processor, parent_asset, document, tiff_original_path,
                position):
    """
    Convert a single-page TIFF to a JPEG (via RGBA) and create a thumbnail.

    A page is created for ``document`` at ``position``.  The TIFF is
    converted to ``<base>.rgba`` with the external ``tiff2rgba`` tool; from
    that file ``<base>.jpeg`` and ``<base>-thumbnail.jpeg`` (sized by
    ``settings.THUMBNAIL_SIZE``) are written.

    Returns a list of three assets, in this order: the original TIFF, the
    full-size JPEG and the thumbnail JPEG.  Each is attached to the new page
    with ``parent_asset`` as parent.

    Raises ``subprocess.CalledProcessError`` if ``tiff2rgba`` exits with a
    non-zero status, and ``OSError`` if it cannot be started.
    """
    # Imported here so the function carries its own dependency.
    import subprocess

    # Stuff we'll need later
    page = operations.create_page(document, position)
    base_name = os.path.splitext(tiff_original_path)[0]
    rgba_path = '%s.rgba' % base_name
    jpeg_path = '%s.jpeg' % base_name
    thumb_path = '%s-thumbnail.jpeg' % base_name

    # Convert original TIFF to RGBA.  The paths go in as separate arguments
    # (no shell), so unusual characters in them cannot break the command,
    # and a failed conversion is reported instead of being ignored.
    # TODO use convert instead of tiff2rgba
    subprocess.check_call(['tiff2rgba', tiff_original_path, rgba_path])

    # Save the original as JPEG
    image.save(image.load(rgba_path), jpeg_path)

    # Save the thumbnail as JPEG (from a fresh load of the RGBA file)
    image.save(
        image.thumbnail(image.load(rgba_path), settings.THUMBNAIL_SIZE),
        thumb_path)

    return [
        # The original full-res page as a TIFF
        operations.create_asset_from_file(
            owner=document.owner,
            producer=processor,
            asset_class=models.AssetClass.PAGE_ORIGINAL,
            file_name=tiff_original_path,
            related_page=page,
            parent=parent_asset,
            child_number=page.position,
            mime_type=models.MimeType.TIFF),

        # The full-res page as a JPEG
        operations.create_asset_from_file(
            owner=document.owner,
            producer=processor,
            asset_class=models.AssetClass.PAGE_IMAGE,
            file_name=jpeg_path,
            related_page=page,
            parent=parent_asset,
            child_number=page.position,
            mime_type=models.MimeType.JPEG),

        # The thumbnail as a JPEG
        operations.create_asset_from_file(
            owner=document.owner,
            producer=processor,
            asset_class=models.AssetClass.PAGE_THUMBNAIL,
            file_name=thumb_path,
            related_page=page,
            parent=parent_asset,
            child_number=page.position,
            mime_type=models.MimeType.JPEG),
    ]
def handle_page(processor, parent_asset, document, tiff_original_path,
                position):
    """
    Convert the given TIFF file (representing a single page) to a JPEG, via
    an intermediate RGBA file, and create one thumbnail.

    A page is created for ``document`` at ``position``.  The external
    ``tiff2rgba`` tool writes ``<base>.rgba``; ``<base>.jpeg`` and
    ``<base>-thumbnail.jpeg`` (sized by ``settings.THUMBNAIL_SIZE``) are then
    produced from it.

    Returns a list with the TIFF, JPEG and thumbnail assets, in that order,
    all attached to the new page with ``parent_asset`` as parent.

    Raises ``subprocess.CalledProcessError`` when ``tiff2rgba`` reports
    failure and ``OSError`` when it cannot be executed.
    """
    # Local import keeps this function self-contained.
    import subprocess

    page = operations.create_page(document, position)

    stem = os.path.splitext(tiff_original_path)[0]
    rgba_path = '%s.rgba' % stem
    jpeg_path = '%s.jpeg' % stem
    thumb_path = '%s-thumbnail.jpeg' % stem

    # Run the converter with an argument list, not a shell string: the
    # paths are passed through verbatim and a non-zero exit status raises.
    # TODO use convert instead of tiff2rgba
    subprocess.check_call(['tiff2rgba', tiff_original_path, rgba_path])

    # Full-size JPEG.
    image.save(image.load(rgba_path), jpeg_path)

    # Thumbnail JPEG, built from a fresh load of the RGBA file.
    thumbnail = image.thumbnail(
        image.load(rgba_path), settings.THUMBNAIL_SIZE)
    image.save(thumbnail, thumb_path)

    # (asset class, file, mime type) for each asset handed back.
    variants = (
        # The original full-res page as a TIFF
        (models.AssetClass.PAGE_ORIGINAL, tiff_original_path,
         models.MimeType.TIFF),
        # The full-res page as a JPEG
        (models.AssetClass.PAGE_IMAGE, jpeg_path, models.MimeType.JPEG),
        # The thumbnail as a JPEG
        (models.AssetClass.PAGE_THUMBNAIL, thumb_path, models.MimeType.JPEG),
    )

    return [
        operations.create_asset_from_file(
            owner=document.owner,
            producer=processor,
            asset_class=asset_class,
            file_name=file_name,
            related_page=page,
            parent=parent_asset,
            child_number=page.position,
            mime_type=mime_type)
        for asset_class, file_name, mime_type in variants
    ]