Example #1
0
    def test_merge_documents_3(self):
        """ Merge doc2 into the middle of doc1 (offset 3) and verify the
            resulting page order.

            Two five-page documents are created; after the merge no
            Document with doc2's pk may be found, doc1 must report ten
            pages, and doc2's pages must sit between the third and fourth
            original pages of doc1.

        """
        doc1 = operations.create_document( owner = self.user )
        doc2 = operations.create_document( owner = self.user )

        # pages[0:5] were created on doc1, pages[5:10] on doc2.
        pages = ( [ operations.create_page(doc1) for _ in xrange(5) ]
                  + [ operations.create_page(doc2) for _ in xrange(5) ] )

        self.assertEqual( doc1.num_pages, 5 )
        self.assertEqual( doc2.num_pages, 5 )

        operations.merge_documents(doc1, doc2, 3)

        self.assertEqual(
            manager(Document).filter( pk = doc2.pk ).count(), 0 )
        self.assertEqual( doc1.num_pages, 10 )

        # Expected layout of the 10-pager:
        #   positions 1-3  : first three pages of doc1
        #   positions 4-8  : all five pages of doc2
        #   positions 9-10 : last two pages of doc1
        expected_order = pages[0:3] + pages[5:10] + pages[3:5]

        for index, page in enumerate(expected_order):
            # Page positions are 1-based.
            self.assertEqual(
                page.pk, doc1.pages.get(position=index + 1).pk )
Example #2
0
def handle_work_item(processor, item):
    """ Pick up a (possibly) multipage TIFF upload and turn it into a
        document having (possibly) multiple individual pages.

        The upload at item['Local-Path'] is split with the external
        ``tiffsplit`` tool into ``page-*`` files next to it.  A new
        document owned by the upload asset's owner is created and every
        split file is passed to handle_page() with its 1-based position,
        in sorted file name order.

        Returns the list of everything handle_page() returned, followed
        by the document's DOCUMENT/BINARY asset.

        Raises subprocess.CalledProcessError if tiffsplit exits with a
        non-zero status, and OSError if it cannot be started.

    """
    # Imported locally so this block does not depend on the module's
    # import header.
    import subprocess

    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    page_prefix = os.path.join(os.path.dirname(local_path), 'page-')

    # Run tiffsplit with an argument list instead of a shell string:
    # repr() is not shell quoting (a unicode path renders as u'...'),
    # and a failed split should stop us before a document is created.
    subprocess.check_call(['tiffsplit', local_path, page_prefix])

    document = operations.create_document(
        owner=asset.owner,
        title='Uploaded on %s (%s)' %
        (asset.date_created, asset.producer.process))

    new_work = []
    # Sorted file names give the page order.
    page_files = sorted(glob.glob('%s*.tif*' % page_prefix))
    for index, page_tiff_path in enumerate(page_files):
        # Page positions are 1-based.
        new_work.extend(
            handle_page(
                processor, asset, document, page_tiff_path, index + 1))

    if document is not None:
        new_work.append(
            document.assets.get(asset_class__name=models.AssetClass.DOCUMENT,
                                mime_type__name=models.MimeType.BINARY))

    return new_work
Example #3
0
    def test_merge_documents_2(self):
        """ Merge doc2 into doc1 at offset 0 and verify that doc2's pages
            end up in front of doc1's original pages.

        """
        doc1 = operations.create_document( owner = self.user )
        doc2 = operations.create_document( owner = self.user )

        # pages[0:5] were created on doc1, pages[5:10] on doc2.
        pages = ( [ operations.create_page(doc1) for _ in xrange(5) ]
                  + [ operations.create_page(doc2) for _ in xrange(5) ] )

        self.assertEqual( doc1.num_pages, 5 )
        self.assertEqual( doc2.num_pages, 5 )

        operations.merge_documents(doc1, doc2, 0)

        self.assertEqual(
            manager(Document).filter( pk = doc2.pk ).count(), 0 )
        self.assertEqual( doc1.num_pages, 10 )

        # doc2's pages take positions 1-5, doc1's original pages 6-10.
        expected_order = pages[5:10] + pages[0:5]

        for index, page in enumerate(expected_order):
            # Page positions are 1-based.
            self.assertEqual(
                page.pk, doc1.pages.get(position=index + 1).pk )
Example #4
0
def handle_work_item(processor, item):

    """ Pick up a (possibly) multipage PDF upload and turn it into a
        document having (possibly) multiple individual pages.

        The upload at item['Local-Path'] is split into ``page-*.pdf``
        files next to it.  If the upload asset has no PAGE_ORIGINAL
        children yet, a new document is created and one page plus one
        PAGE_ORIGINAL asset is created per split file.  Otherwise no
        document is created and each split file is uploaded into the
        existing child asset at the same position.

        Returns the list of page assets, in page order, followed by the
        document's DOCUMENT/BINARY asset when a document was created.

    """

    asset       = item['Asset-Instance']
    local_path  = item['Local-Path']
    work_dir    = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')
    asset_list  = []

    pdf.split_pages( local_path, page_prefix )

    if asset.get_children(models.AssetClass.PAGE_ORIGINAL).count() == 0:
        document = operations.create_document(
            asset.owner,
            title = 'Uploaded on %s (%s)' % (
                asset.date_created,
                asset.producer.process ))
    else:
        document = None

    # Sorted file names give the page order.
    all_page_files = sorted(glob.glob('%s*.pdf' % page_prefix))

    for index, page_pdf_path in enumerate(all_page_files):
        # Page positions are 1-based.
        position = index + 1

        if document is not None:
            # No trailing comma after this call: it used to turn
            # page_asset into a 1-tuple, so tuples were returned.
            page_asset = operations.create_asset_from_file(
                owner        = document.owner,
                producer     = processor,
                asset_class  = models.AssetClass.PAGE_ORIGINAL,
                file_name    = page_pdf_path,
                related_page = operations.create_page(document, position),
                parent       = asset,
                child_number = position,
                mime_type    = models.MimeType.PDF )
        else:
            page_asset = asset.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)

        asset_list.append(page_asset)

    # Without a new document there is nothing to look the document asset
    # up on; dereferencing None here used to raise AttributeError.
    # NOTE(review): confirm whether the re-processing path should also
    # return the existing document's asset.
    if document is not None:
        asset_list.append(
            document.assets.get(
                asset_class__name = models.AssetClass.DOCUMENT,
                mime_type__name   = models.MimeType.BINARY ))

    return asset_list
Example #5
0
def handle_work_item(processor, item):
    """ Pick up a (possibly) multipage PDF upload and turn it into a
        document having (possibly) multiple individual pages.

        The upload is split into ``page-*.pdf`` files in its own
        directory.  When the upload asset has no PAGE_ORIGINAL children,
        a new document is created together with a page and a
        PAGE_ORIGINAL asset for every split file.  When it already has
        such children, the split files are uploaded into the existing
        child assets instead and no document is created.

        Returns the page assets in page order; the DOCUMENT/BINARY asset
        of the new document is appended when one was created.

    """

    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    page_prefix = os.path.join(os.path.dirname(local_path), 'page-')

    pdf.split_pages(local_path, page_prefix)

    document = None
    if asset.get_children(models.AssetClass.PAGE_ORIGINAL).count() == 0:
        document = operations.create_document(
            asset.owner,
            title='Uploaded on %s (%s)' %
            (asset.date_created, asset.producer.process))

    asset_list = []
    position = 0
    # Sorted file names give the page order.
    for page_pdf_path in sorted(glob.glob('%s*.pdf' % page_prefix)):
        position += 1  # page positions are 1-based

        if document is None:
            # Pages already exist: refresh the file of the existing
            # child asset at this position.
            page_asset = asset.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)
        else:
            # The call must not be followed by a comma; that used to
            # wrap the new asset in a 1-tuple.
            page_asset = operations.create_asset_from_file(
                owner=document.owner,
                producer=processor,
                asset_class=models.AssetClass.PAGE_ORIGINAL,
                file_name=page_pdf_path,
                related_page=operations.create_page(document, position),
                parent=asset,
                child_number=position,
                mime_type=models.MimeType.PDF)

        asset_list.append(page_asset)

    # Only a freshly created document can be asked for its asset; the
    # unguarded lookup raised AttributeError when document was None.
    # NOTE(review): confirm whether the existing document's asset should
    # be returned on the re-processing path as well.
    if document is not None:
        asset_list.append(
            document.assets.get(asset_class__name=models.AssetClass.DOCUMENT,
                                mime_type__name=models.MimeType.BINARY))

    return asset_list
Example #6
0
    def test_split_document(self):
        """ Split a ten-page document at offset 5 and verify that both
            halves hold five pages each, in their original order.

        """
        doc1 = operations.create_document( owner = self.user )

        pages = [ operations.create_page(doc1) for _ in xrange(10) ]

        self.assertEqual( doc1.num_pages, 10 )

        doc2 = operations.split_document(doc1, 5)

        self.assertEqual( doc1.num_pages, 5 )
        self.assertEqual( doc2.num_pages, 5 )

        # doc1 keeps the first five pages, doc2 receives the rest; both
        # are numbered from position 1.
        for document, expected_pages in ( (doc1, pages[0:5]),
                                          (doc2, pages[5:10]) ):
            for index, page in enumerate(expected_pages):
                self.assertEqual(
                    page.pk, document.pages.get(position=index + 1).pk )
Example #7
0
def handle_work_item(processor, item):

    """ Pick up a (possibly) multipage TIFF upload and turn it into a
        document having (possibly) multiple individual pages.

        Splits the file at item['Local-Path'] with the external
        ``tiffsplit`` tool, creates a new document for the upload
        asset's owner and hands each split file, in sorted file name
        order, to handle_page() together with its 1-based position.

        Returns whatever the handle_page() calls returned, followed by
        the document's DOCUMENT/BINARY asset.

        Raises subprocess.CalledProcessError when tiffsplit reports a
        failure and OSError when it cannot be executed.

    """

    # Local import keeps this block independent of the module's import
    # header.
    import subprocess

    new_work    = []
    asset       = item['Asset-Instance']
    local_path  = item['Local-Path']
    work_dir    = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')

    # An argument list avoids the shell entirely.  The previous command
    # string relied on repr() for quoting, which is wrong for unicode
    # paths (u'...') and shell metacharacters, and its exit status was
    # never checked.
    subprocess.check_call( ['tiffsplit', local_path, page_prefix] )

    document = operations.create_document(
        owner = asset.owner,
        title = 'Uploaded on %s (%s)' % (
            asset.date_created,
            asset.producer.process ))

    all_page_files = glob.glob('%s*.tif*' % page_prefix)
    all_page_files.sort()

    for index, page_tiff_path in enumerate(all_page_files):
        new_work.extend(
            handle_page(
                processor,
                asset,
                document,
                page_tiff_path,
                index + 1 ))    # positions are 1-based

    if document is not None:
        new_work.append(
            document.assets.get(
                asset_class__name = models.AssetClass.DOCUMENT,
                mime_type__name   = models.MimeType.BINARY ))

    return new_work
Example #8
0
def handle_work_item(processor, item):

    """ Split an uploaded (possibly multipage) PDF into single-page
        files and build a new document from them.

        Every split file is handed to create_page() with its 1-based
        position, in sorted file name order.  The returned list holds
        everything create_page() produced, followed by the document's
        DOCUMENT/BINARY asset.

    """

    asset      = item['Asset-Instance']
    local_path = item['Local-Path']
    # Looked up but not used further down in this function.
    is_new     = item['Is-New']

    page_prefix = os.path.join(os.path.dirname(local_path), 'page-')
    pdf.split_pages(local_path, page_prefix)

    document = operations.create_document(
        asset.owner,
        title = 'Uploaded on %s (%s)' % (
            asset.date_created, asset.producer.process ))

    # Sorted file names give the page order.
    page_paths = sorted(glob.glob('%s*.pdf' % page_prefix))

    collected = []
    for offset, page_pdf_path in enumerate(page_paths):
        page_number = offset + 1
        collected.extend(
            create_page(
                processor, asset, document, page_pdf_path, page_number ))

    document_asset = document.assets.get(
        asset_class__name = models.AssetClass.DOCUMENT,
        mime_type__name   = models.MimeType.BINARY )
    collected.append(document_asset)

    return collected
Example #9
0
    def test_tag_documents_by_time(self):
        """ Verify how pdf_generator.tag_document() groups documents by
            upload time.

            Expectations checked here: a tagged document gets exactly
            one tag of class UPLOAD_AGGREGATE; a document tagged three
            seconds later with a one-second threshold gets a tag with a
            different label; a document tagged another three seconds
            later with a ten-second threshold gets the same tag as the
            previous document.

        """

        def attach_pdf_asset(document):
            # Attach a DOCUMENT-class PDF asset to `document`.
            operations.create_asset_from_stream(
                owner        = self.user,
                producer     = self.producer,
                asset_class  = models.AssetClass.DOCUMENT,
                data_stream  = StringIO('some pdf'),
                file_name    = 'create_asset_from_string.txt',
                child_number = 1,
                related_document = document,
                mime_type        = models.MimeType.PDF )

        # create an unclassified document
        doc0 = operations.create_document( owner = self.user )
        attach_pdf_asset(doc0)

        sleep(2)

        doc1 = operations.create_document( owner = self.user )
        pdf_generator.tag_document(doc1, datetime.timedelta(0, 1))
        attach_pdf_asset(doc1)

        # do we have a new tag?
        self.assertEqual( doc1.tags.all().count(), 1 )

        tag1 = doc1.tags.all()[0]

        self.assertEqual( tag1.tag_class, models.Tag.UPLOAD_AGGREGATE )

        # wait longer than the one-second threshold used next
        sleep(3)

        doc2 = operations.create_document( owner = self.user )
        pdf_generator.tag_document(doc2, datetime.timedelta(0, 1))

        # is the second document tagged with a different tag?
        self.assertEqual( doc2.tags.all().count(), 1 )

        tag2 = doc2.tags.all()[0]

        self.assertNotEqual( tag2.label, tag1.label )

        attach_pdf_asset(doc2)

        # sleep another 3 seconds and create the 3rd document,
        # but with a longer threshold
        sleep(3)

        doc3 = operations.create_document( owner = self.user )
        pdf_generator.tag_document(doc3, datetime.timedelta(0, 10))

        # did this one get tagged with the same tag?  The count is
        # checked on doc3; it was previously re-checked on doc2, which
        # looks like a copy-paste slip.
        self.assertEqual( doc3.tags.all().count(), 1 )
        self.assertEqual( tag2, doc3.tags.all()[0] )

        attach_pdf_asset(doc3)