Example #1
0
def create_page(processor, parent_asset, document, pdf_orig_path, position):
    """ Register one page of a document and produce its derived images.

        The single-page PDF at pdf_orig_path is converted to a JPEG and a
        thumbnail of that JPEG is written next to the PDF.  Three assets are
        then created for the new page: the original PDF, the full-size JPEG
        and the thumbnail JPEG.

        Returns the three create_asset_from_file results as a list, in that
        order.
    """

    page = operations.create_page(document, position)

    # Output locations are derived from the source PDF's path.
    stem = os.path.splitext(pdf_orig_path)[0]
    full_image_path = pdf.convert(pdf_orig_path, 'jpeg')
    thumbnail_path = '%s-thumbnail.jpeg' % stem

    # Scale the converted JPEG down and store the result as the thumbnail.
    full_image = image.load(full_image_path)
    reduced_image = image.thumbnail(full_image, settings.THUMBNAIL_SIZE)
    image.save(reduced_image, thumbnail_path)

    def page_asset(asset_class, file_name, mime_type):
        """ Create one asset for this page; only these three fields vary. """
        return operations.create_asset_from_file(
            owner=document.owner,
            producer=processor,
            asset_class=asset_class,
            file_name=file_name,
            related_page=page,
            parent=parent_asset,
            child_number=page.position,
            mime_type=mime_type)

    return [
        # The original full-res page as a PDF
        page_asset(
            models.AssetClass.PAGE_ORIGINAL,
            pdf_orig_path,
            models.MimeType.PDF),

        # The full-res page as a JPEG
        page_asset(
            models.AssetClass.PAGE_IMAGE,
            full_image_path,
            models.MimeType.JPEG),

        # The thumbnail as a JPEG
        page_asset(
            models.AssetClass.PAGE_THUMBNAIL,
            thumbnail_path,
            models.MimeType.JPEG),
    ]
Example #2
0
def on_savemessage(accountname, username, message_file, flags, label):
    """ Register a saved mail message as an upload and publish it.

        The message file becomes an UPLOAD asset of MIME type MAIL owned by
        the user whose username is given.  The account name, the label and
        each flag are appended to the asset's orig_file_name, after which
        the asset is saved and published as a work item.
    """
    # A single parenthesized expression prints identically whether print is
    # a statement or a function.
    print('ON_SAVEMESSAGE: account=%s message file=%s flags=%s' % (
        username, message_file, flags))

    # The module name is hard-coded rather than derived from __file__.
    module_name = 'mail_parser'
    inputs = [AssetClass.UPLOAD]
    outputs = [AssetClass.MESSAGE_PART]
    accepted_mime_types = [MimeType.MAIL]

    owner = User.objects.get(username=username)

    # Only the first element of initialize_processor's result is used.
    processor = operations.initialize_processor(
        module_name, inputs, outputs, accepted_mime_types)[0]

    upload = operations.create_asset_from_file(
        file_name=message_file,
        owner=owner,
        producer=processor,
        asset_class=AssetClass.UPLOAD,
        child_number=0,
        mime_type=MimeType.MAIL)

    # Every flag is written with its own leading comma, directly after
    # the 'FLAGS=' marker.
    pieces = [',ACCOUNT=%s' % accountname, ',LABEL=%s' % label, ',FLAGS=']
    pieces.extend(',' + flag for flag in flags)
    upload.orig_file_name += ''.join(pieces)

    upload.save()
    operations.publish_work_item(upload)
Example #3
0
def on_savemessage(accountname, username, message_file, flags, label):
    """ Turn a saved mail message into a published UPLOAD work item.

        An asset of class UPLOAD and MIME type MAIL is created from
        message_file for the user named username.  Its orig_file_name is
        extended with the account name, the label and the flags before the
        asset is saved and passed to publish_work_item.
    """
    # The parenthesized form produces the same output under the print
    # statement and the print function.
    print('ON_SAVEMESSAGE: account=%s message file=%s flags=%s' % (
        username, message_file, flags))

    # Positional arguments for initialize_processor: module name (fixed,
    # not derived from __file__), inputs, outputs, accepted MIME types.
    processor_config = (
        'mail_parser',
        [AssetClass.UPLOAD],
        [AssetClass.MESSAGE_PART],
        [MimeType.MAIL],
    )

    owner = User.objects.get(username=username)
    processor = operations.initialize_processor(*processor_config)[0]

    message_asset = operations.create_asset_from_file(
        file_name=message_file,
        owner=owner,
        producer=processor,
        asset_class=AssetClass.UPLOAD,
        child_number=0,
        mime_type=MimeType.MAIL)

    # Build the whole suffix first, then attach it in one step.  Each flag
    # carries its own leading comma.
    name_suffix = ',ACCOUNT=%s' % accountname
    name_suffix += ',LABEL=%s' % label
    name_suffix += ',FLAGS='
    for flag in flags:
        name_suffix += ',' + flag
    message_asset.orig_file_name += name_suffix

    message_asset.save()
    operations.publish_work_item(message_asset)
Example #4
0
def upload_file(processor, user, local_path):
    """ Create an UPLOAD asset from a local file and publish it.

        The asset is owned by user, attributed to processor and created
        with child number 0; the result is handed to publish_work_item.
    """
    upload = operations.create_asset_from_file(
        file_name=local_path,
        owner=user,
        producer=processor,
        child_number=0,
        asset_class=models.AssetClass.UPLOAD)
    operations.publish_work_item(upload)
Example #5
0
def upload_file(processor, user, local_path):
    """ Publish the file at local_path as a new work item.

        An asset of class UPLOAD (child number 0) is created from the file
        on behalf of user, with processor recorded as its producer, and is
        then passed to publish_work_item.
    """
    asset_fields = dict(
        file_name=local_path,
        owner=user,
        producer=processor,
        child_number=0,
        asset_class=models.AssetClass.UPLOAD)
    new_asset = operations.create_asset_from_file(**asset_fields)
    operations.publish_work_item(new_asset)
Example #6
0
def handle_work_item(processor, item):

    """ Pick up a (possibly) multipage PDF upload and turn it into a
        document having (possibly) multiple individual pages.

        item is indexed by 'Asset-Instance' (the uploaded asset) and
        'Local-Path' (where the PDF is stored locally).  The PDF is split
        into per-page PDF files in the same directory as the upload.

        When the upload has no PAGE_ORIGINAL children yet, a new document is
        created and every page file becomes a new PAGE_ORIGINAL asset.
        Otherwise each page file is uploaded to the existing child asset at
        the same position.

        Returns a list of the page assets, followed by the document's
        DOCUMENT/BINARY asset when a document could be determined.
    """

    asset       = item['Asset-Instance']
    local_path  = item['Local-Path']
    work_dir    = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')
    asset_list  = []

    pdf.split_pages( local_path, page_prefix )

    # A dedicated flag records which path we are on, because `document` may
    # be filled in later on the re-processing path.
    is_new = asset.get_children(models.AssetClass.PAGE_ORIGINAL).count() == 0

    if is_new:
        document = operations.create_document(
            asset.owner,
            title = 'Uploaded on %s (%s)' % (
                asset.date_created,
                asset.producer.process ))
    else:
        document = None

    # NOTE(review): pages are ordered by a plain string sort of the file
    # names; this assumes split_pages zero-pads page numbers -- confirm.
    all_page_files = sorted(glob.glob('%s*.pdf' % page_prefix))

    for position, page_pdf_path in enumerate(all_page_files, 1):
        if is_new:
            # No trailing comma after the call: one would turn page_asset
            # into a 1-tuple and put tuples into the returned list.
            page_asset = operations.create_asset_from_file(
                owner        = document.owner,
                producer     = processor,
                asset_class  = models.AssetClass.PAGE_ORIGINAL,
                file_name    = page_pdf_path,
                related_page = operations.create_page(document, position),
                parent       = asset,
                child_number = position,
                mime_type    = models.MimeType.PDF )
        else:
            page_asset = asset.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)

            if document is None:
                # NOTE(review): assumes an existing page asset reaches its
                # document through related_page.document -- confirm against
                # the models.
                document = page_asset.related_page.document

        asset_list.append(page_asset)

    # On the re-processing path with no page files there is no document to
    # look up; dereferencing None here used to raise AttributeError.
    if document is not None:
        asset_list.append(
            document.assets.get(
                asset_class__name = models.AssetClass.DOCUMENT,
                mime_type__name   = models.MimeType.BINARY ))

    return asset_list
Example #7
0
def handle_work_item(processor, item):
    """ Pick up a (possibly) multipage PDF upload and turn it into a
        document having (possibly) multiple individual pages.

        item is indexed by 'Asset-Instance' (the uploaded asset) and
        'Local-Path' (where the PDF is stored locally).  The PDF is split
        into per-page PDF files in the same directory as the upload.

        When the upload has no PAGE_ORIGINAL children yet, a new document is
        created and every page file becomes a new PAGE_ORIGINAL asset.
        Otherwise each page file is uploaded to the existing child asset at
        the same position.

        Returns a list of the page assets, followed by the document's
        DOCUMENT/BINARY asset when a document could be determined.
    """

    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    work_dir = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')
    asset_list = []

    pdf.split_pages(local_path, page_prefix)

    # A dedicated flag records which path we are on, because `document` may
    # be filled in later on the re-processing path.
    is_new = asset.get_children(models.AssetClass.PAGE_ORIGINAL).count() == 0

    if is_new:
        document = operations.create_document(
            asset.owner,
            title='Uploaded on %s (%s)' %
            (asset.date_created, asset.producer.process))
    else:
        document = None

    # NOTE(review): pages are ordered by a plain string sort of the file
    # names; this assumes split_pages zero-pads page numbers -- confirm.
    all_page_files = sorted(glob.glob('%s*.pdf' % page_prefix))

    for position, page_pdf_path in enumerate(all_page_files, 1):
        if is_new:
            # No trailing comma after the call: one would turn page_asset
            # into a 1-tuple and put tuples into the returned list.
            page_asset = operations.create_asset_from_file(
                owner=document.owner,
                producer=processor,
                asset_class=models.AssetClass.PAGE_ORIGINAL,
                file_name=page_pdf_path,
                related_page=operations.create_page(document, position),
                parent=asset,
                child_number=position,
                mime_type=models.MimeType.PDF)
        else:
            page_asset = asset.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)

            if document is None:
                # NOTE(review): assumes an existing page asset reaches its
                # document through related_page.document -- confirm against
                # the models.
                document = page_asset.related_page.document

        asset_list.append(page_asset)

    # On the re-processing path with no page files there is no document to
    # look up; dereferencing None here used to raise AttributeError.
    if document is not None:
        asset_list.append(
            document.assets.get(asset_class__name=models.AssetClass.DOCUMENT,
                                mime_type__name=models.MimeType.BINARY))

    return asset_list
 def _enqueue_message(self, message_file, accountname, label, flags):
     """ Create an UPLOAD asset for a mail message and publish it.

         The asset is created from message_file for self.user with
         self.processor as producer.  The account name, label and flags
         are appended to its orig_file_name before it is saved and handed
         to publish_work_item.
     """
     upload = operations.create_asset_from_file(
         file_name=message_file,
         owner=self.user,
         producer=self.processor,
         asset_class=AssetClass.UPLOAD,
         child_number=0,
         mime_type=MimeType.MAIL)

     # Every flag is written with its own leading comma, directly after
     # the 'FLAGS=' marker.
     pieces = [',ACCOUNT=%s' % accountname, ',LABEL=%s' % label, ',FLAGS=']
     pieces.extend(',' + flag for flag in flags)
     upload.orig_file_name += ''.join(pieces)

     upload.save()
     operations.publish_work_item(upload)
Example #9
0
 def _enqueue_message(self, message_file, accountname, label, flags):
     """ Publish a mail message file as a new UPLOAD work item.

         An asset of class UPLOAD and MIME type MAIL is created for
         self.user (producer: self.processor).  Its orig_file_name is
         extended with the account name, the label and the flags, then the
         asset is saved and passed to publish_work_item.
     """
     message_asset = operations.create_asset_from_file(
         file_name=message_file,
         owner=self.user,
         producer=self.processor,
         asset_class=AssetClass.UPLOAD,
         child_number=0,
         mime_type=MimeType.MAIL)

     # Build the whole suffix first, then attach it in one step.  Each
     # flag carries its own leading comma.
     name_suffix = ',ACCOUNT=%s' % accountname
     name_suffix += ',LABEL=%s' % label
     name_suffix += ',FLAGS='
     for flag in flags:
         name_suffix += ',' + flag
     message_asset.orig_file_name += name_suffix

     message_asset.save()
     operations.publish_work_item(message_asset)
Example #10
0
def process_mail(owner, processor, local_path):
    """ Wrap a stored email message in an UPLOAD asset.

        local_path is the location of the message on the local file
        system.  Returns a one-element list holding the result of
        create_asset_from_file for that message.
    """

    # No parent asset is passed for the message.
    mail_asset = operations.create_asset_from_file(
        file_name=local_path,
        owner=owner,
        producer=processor,
        asset_class=AssetClass.UPLOAD,
        child_number=0,
        mime_type=MimeType.MAIL)

    return [mail_asset]
Example #11
0
def handle_work_item(processor, item):
    """ Convert a page image to HTML text and register it as an asset.

        item is indexed by 'Asset-Instance', 'Owner', 'Local-Path' and
        'Is-New'.  The file at item['Local-Path'] is passed to
        image_to_html and the result becomes a PAGE_TEXT asset under the
        parent asset.  When the item is not new, the document's
        DOCUMENT/BINARY asset is added to the returned work as well.

        For an active owner a billable event is processed; a falsy result
        raises Exception("Insufficient account balance").

        Returns the list of new work, or None when OCRFailed is raised (a
        warning is logged in that case).

        Per the original author's note: the work item and its local temp
        directory are managed by the process driver framework, which also
        removes the item from the work queue if no exception is raised.
    """
    try:
        parent_asset = item['Asset-Instance']
        owner = item['Owner']

        text_asset = operations.create_asset_from_file(
            owner=owner,
            producer=processor,
            asset_class=models.AssetClass.PAGE_TEXT,
            file_name=image_to_html(item['Local-Path']),
            related_page=parent_asset.related_page,
            parent=parent_asset,
            child_number=1,
            mime_type=models.MimeType.HTML)
        new_work = [text_asset]

        if not item['Is-New']:
            document = parent_asset.related_page.document
            new_work.append(
                document.assets.get(
                    asset_class=models.AssetClass.DOCUMENT,
                    mime_type=models.MimeType.BINARY))

        # Do not check for available credit on inactive accounts.
        # Assumption (original author's): an account can only be inactive
        # while the trial is in progress.
        if owner.is_active:
            charged = donomo.billing.models.process_billable_event(
                owner, 'ocr.ocropus.page')
            if not charged:
                raise Exception("Insufficient account balance")

        return new_work

    except OCRFailed:
        logging.warning('OCR failed, dropping from processing chain')
Example #12
0
    def test_create_asset_from_file(self):
        """ Create an asset from a temporary file, publish it, and pass it
            to _validate_consumer together with the data that was written.
        """
        handle, scratch_path = tempfile.mkstemp()
        try:
            # Fill the scratch file and release the descriptor right away,
            # whether or not the write succeeds.
            try:
                os.write(handle, TEST_DATA)
            finally:
                os.close(handle)

            asset = operations.create_asset_from_file(
                owner=self.user,
                producer=self.producer,
                asset_class='test_data',
                file_name=scratch_path,
                child_number=0,
                mime_type='text/plain')

            self.assertTrue(asset is not None)

        finally:
            # The scratch file is removed before the asset is published.
            os.remove(scratch_path)

        operations.publish_work_item(asset)

        self._validate_consumer(asset, TEST_DATA)
Example #13
0
def handle_page(processor, parent_asset, document, tiff_original_path,
                position):
    """ Convert the given TIFF file (representing a single page) to a JPEG
        (via RGBA) and create one thumbnail of it.

        A page is created in document at the given position.  The TIFF is
        converted to an RGBA file with the external tiff2rgba tool; the
        RGBA file is then saved as a full-size JPEG and as a thumbnail
        JPEG, both next to the original TIFF.

        Returns a list with the create_asset_from_file results for the
        original TIFF, the full-size JPEG and the thumbnail, in that order.

        Raises subprocess.CalledProcessError if tiff2rgba exits with a
        non-zero status.
    """
    import subprocess

    # Stuff we'll need later
    page = operations.create_page(document, position)
    base_name = os.path.splitext(tiff_original_path)[0]
    rgba_path = '%s.rgba' % base_name
    jpeg_path = '%s.jpeg' % base_name
    thumb_path = '%s-thumbnail.jpeg' % base_name

    # Convert original TIFF to RGBA
    # TODO use convert instead of tiff2rgba
    #
    # The paths are passed as an argument list, not interpolated into a
    # shell string: repr() is not shell quoting (a unicode path renders as
    # u'...' on Python 2, and quotes or shell metacharacters in a path
    # would alter the command).  A failed conversion is reported here
    # instead of being ignored.
    subprocess.check_call(['tiff2rgba', tiff_original_path, rgba_path])

    # Save the original as JPEG
    image.save(image.load(rgba_path), jpeg_path)

    # Save the thumbnail as JPEG.  The RGBA file is loaded afresh rather
    # than reusing the image above, in case image.thumbnail modifies its
    # argument (not verified here).
    image.save(image.thumbnail(image.load(rgba_path), settings.THUMBNAIL_SIZE),
               thumb_path)

    def page_asset(asset_class, file_name, mime_type):
        """ Create one asset for this page; only these three fields vary. """
        return operations.create_asset_from_file(
            owner=document.owner,
            producer=processor,
            asset_class=asset_class,
            file_name=file_name,
            related_page=page,
            parent=parent_asset,
            child_number=page.position,
            mime_type=mime_type)

    return [
        # The original full-res page as a TIFF
        page_asset(models.AssetClass.PAGE_ORIGINAL, tiff_original_path,
                   models.MimeType.TIFF),

        # The full-res page as a JPEG
        page_asset(models.AssetClass.PAGE_IMAGE, jpeg_path,
                   models.MimeType.JPEG),

        # The thumbnail as a JPEG
        page_asset(models.AssetClass.PAGE_THUMBNAIL, thumb_path,
                   models.MimeType.JPEG),
    ]
Example #14
0
def handle_page(
    processor,
    parent_asset,
    document,
    tiff_original_path,
    position ):

    """ Convert the given TIFF file (representing a single page) to a JPEG
        (via RGBA) and create one thumbnail of it.

        A page is created in document at the given position.  The TIFF is
        converted to an RGBA file with the external tiff2rgba tool; the
        RGBA file is then saved as a full-size JPEG and as a thumbnail
        JPEG, both next to the original TIFF.

        Returns a list with the create_asset_from_file results for the
        original TIFF, the full-size JPEG and the thumbnail, in that order.

        Raises subprocess.CalledProcessError if tiff2rgba exits with a
        non-zero status.
    """
    import subprocess

    # Stuff we'll need later
    page         = operations.create_page(document, position)
    base_name    = os.path.splitext(tiff_original_path)[0]
    rgba_path    = '%s.rgba' % base_name
    jpeg_path    = '%s.jpeg' % base_name
    thumb_path   = '%s-thumbnail.jpeg' % base_name

    # Convert original TIFF to RGBA
    # TODO use convert instead of tiff2rgba
    #
    # The paths are passed as an argument list, not interpolated into a
    # shell string: repr() is not shell quoting (a unicode path renders as
    # u'...' on Python 2, and quotes or shell metacharacters in a path
    # would alter the command).  A failed conversion is reported here
    # instead of being ignored.
    subprocess.check_call(['tiff2rgba', tiff_original_path, rgba_path])

    # Save the original as JPEG
    image.save(
        image.load(rgba_path),
        jpeg_path)

    # Save the thumbnail as JPEG.  The RGBA file is loaded afresh rather
    # than reusing the image above, in case image.thumbnail modifies its
    # argument (not verified here).
    image.save(
        image.thumbnail(
            image.load(rgba_path),
            settings.THUMBNAIL_SIZE),
        thumb_path)

    def page_asset(asset_class, file_name, mime_type):
        """ Create one asset for this page; only these three fields vary. """
        return operations.create_asset_from_file(
            owner        = document.owner,
            producer     = processor,
            asset_class  = asset_class,
            file_name    = file_name,
            related_page = page,
            parent       = parent_asset,
            child_number = page.position,
            mime_type    = mime_type )

    return [

        # The original full-res page as a TIFF
        page_asset(
            models.AssetClass.PAGE_ORIGINAL,
            tiff_original_path,
            models.MimeType.TIFF ),

        # The full-res page as a JPEG
        page_asset(
            models.AssetClass.PAGE_IMAGE,
            jpeg_path,
            models.MimeType.JPEG ),

        # The thumbnail as a JPEG
        page_asset(
            models.AssetClass.PAGE_THUMBNAIL,
            thumb_path,
            models.MimeType.JPEG ),
        ]