def test_merge_documents_3(self):
    """Insert doc2 into the middle of doc1, after doc1's third page."""
    doc1 = operations.create_document( owner = self.user )
    doc2 = operations.create_document( owner = self.user )
    first_half = [ operations.create_page(doc1) for _ in xrange(5) ]
    second_half = [ operations.create_page(doc2) for _ in xrange(5) ]
    pages = first_half + second_half
    self.assert_( doc1.num_pages == 5 )
    self.assert_( doc2.num_pages == 5 )
    operations.merge_documents(doc1, doc2, 3)
    # doc2 is deleted by the merge; doc1 now holds all ten pages.
    self.assert_( manager(Document).filter( pk = doc2.pk ).count() == 0 )
    self.assert_( doc1.num_pages == 10 )
    # First 3 pages of doc1 keep positions 1-3 of the merged document.
    for idx in xrange(0, 3):
        self.assert_(pages[idx].pk == doc1.pages.get(position=idx+1).pk)
    # All five pages from doc2 now occupy positions 4-8.
    for idx in xrange(3, 8):
        self.assert_(pages[idx+2].pk == doc1.pages.get(position=idx+1).pk)
    # The last two pages of doc1 are pushed down to positions 9-10.
    for idx in xrange(8, 10):
        self.assert_(pages[idx-5].pk == doc1.pages.get(position=idx+1).pk)
def handle_work_item(processor, item):
    """
    Pick up a (possibly) multipage TIFF upload and turn it into a
    document having (possibly) multiple individual pages.
    """
    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    page_prefix = os.path.join(os.path.dirname(local_path), 'page-')
    # tiffsplit writes one single-page TIFF per frame under page_prefix.
    os.system('tiffsplit %r %r' % (local_path, page_prefix))
    document = operations.create_document(
        owner=asset.owner,
        title='Uploaded on %s (%s)' % (asset.date_created,
                                       asset.producer.process))
    new_work = []
    # Pages are processed in sorted filename order, numbered from 1.
    for position, page_tiff_path in enumerate(
            sorted(glob.glob('%s*.tif*' % page_prefix)), 1):
        new_work.extend(
            handle_page(processor, asset, document, page_tiff_path, position))
    if document is not None:
        new_work.append(
            document.assets.get(asset_class__name=models.AssetClass.DOCUMENT,
                                mime_type__name=models.MimeType.BINARY))
    return new_work
def test_merge_documents_2(self):
    """Prepend all of doc2's pages to the beginning of doc1."""
    doc1 = operations.create_document( owner = self.user )
    doc2 = operations.create_document( owner = self.user )
    doc1_pages = [ operations.create_page(doc1) for _ in xrange(5) ]
    doc2_pages = [ operations.create_page(doc2) for _ in xrange(5) ]
    pages = doc1_pages + doc2_pages
    self.assert_( doc1.num_pages == 5 )
    self.assert_( doc2.num_pages == 5 )
    # Insert position 0 puts doc2's pages ahead of every doc1 page.
    operations.merge_documents(doc1, doc2, 0)
    # doc2 is deleted by the merge.
    self.assert_( manager(Document).filter( pk = doc2.pk ).count() == 0 )
    self.assert_( doc1.num_pages == 10 )
    for offset in xrange(5):
        # doc1's original pages shifted to positions 6-10 ...
        self.assert_(pages[offset].pk ==
                     doc1.pages.get(position=offset+6).pk)
        # ... while doc2's pages now hold positions 1-5.
        self.assert_(pages[offset+5].pk ==
                     doc1.pages.get(position=offset+1).pk)
def handle_work_item(processor, item):
    """
    Pick up a (possibly) multipage PDF upload and turn it into a
    document having (possibly) multiple individual pages.

    Returns the list of page assets (plus, for a new document, its
    binary DOCUMENT asset) for follow-up processing.
    """
    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    work_dir = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')
    asset_list = []
    pdf.split_pages( local_path, page_prefix )
    if asset.get_children(models.AssetClass.PAGE_ORIGINAL).count() == 0:
        # First time through: create a brand-new document for the upload.
        document = operations.create_document(
            asset.owner,
            title = 'Uploaded on %s (%s)' % (
                asset.date_created, asset.producer.process ))
    else:
        # Page assets already exist (re-run); reuse them instead of
        # creating duplicates.
        document = None
    position = 1
    all_page_files = glob.glob('%s*.pdf' % page_prefix)
    all_page_files.sort()
    for page_pdf_path in all_page_files:
        if document:
            # BUG FIX: the original ended this call with a trailing
            # comma, which silently made page_asset a 1-tuple instead
            # of the asset object itself.
            page_asset = operations.create_asset_from_file(
                owner = document.owner,
                producer = processor,
                asset_class = models.AssetClass.PAGE_ORIGINAL,
                file_name = page_pdf_path,
                related_page = operations.create_page(document, position),
                parent = asset,
                child_number = position,
                mime_type = models.MimeType.PDF )
        else:
            page_asset = asset.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)
        asset_list.append(page_asset)
        position += 1
    # BUG FIX: only a freshly created document has a binary DOCUMENT
    # asset to enqueue; the original dereferenced `document`
    # unconditionally and crashed with AttributeError on the re-run
    # path (document is None). The sibling TIFF handler already guards
    # this the same way.
    if document is not None:
        asset_list.append( document.assets.get(
            asset_class__name = models.AssetClass.DOCUMENT,
            mime_type__name = models.MimeType.BINARY ))
    return asset_list
def handle_work_item(processor, item):
    """
    Pick up a (possibly) multipage PDF upload and turn it into a
    document having (possibly) multiple individual pages.

    Returns the list of page assets (plus, for a new document, its
    binary DOCUMENT asset) for follow-up processing.
    """
    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    work_dir = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')
    asset_list = []
    pdf.split_pages(local_path, page_prefix)
    if asset.get_children(models.AssetClass.PAGE_ORIGINAL).count() == 0:
        # First time through: create a brand-new document for the upload.
        document = operations.create_document(
            asset.owner,
            title='Uploaded on %s (%s)' % (asset.date_created,
                                           asset.producer.process))
    else:
        # Page assets already exist (re-run); reuse them instead of
        # creating duplicates.
        document = None
    position = 1
    all_page_files = glob.glob('%s*.pdf' % page_prefix)
    all_page_files.sort()
    for page_pdf_path in all_page_files:
        if document:
            # BUG FIX: the original ended this call with a trailing
            # comma, which silently made page_asset a 1-tuple instead
            # of the asset object itself.
            page_asset = operations.create_asset_from_file(
                owner=document.owner,
                producer=processor,
                asset_class=models.AssetClass.PAGE_ORIGINAL,
                file_name=page_pdf_path,
                related_page=operations.create_page(document, position),
                parent=asset,
                child_number=position,
                mime_type=models.MimeType.PDF)
        else:
            page_asset = asset.children.get(position=position)
            operations.upload_asset_file(page_asset, page_pdf_path)
        asset_list.append(page_asset)
        position += 1
    # BUG FIX: only a freshly created document has a binary DOCUMENT
    # asset to enqueue; the original dereferenced `document`
    # unconditionally and crashed with AttributeError on the re-run
    # path (document is None). The sibling TIFF handler already guards
    # this the same way.
    if document is not None:
        asset_list.append(
            document.assets.get(asset_class__name=models.AssetClass.DOCUMENT,
                                mime_type__name=models.MimeType.BINARY))
    return asset_list
def test_split_document(self):
    """Splitting a 10-page document at page 5 yields two 5-page documents."""
    original = operations.create_document( owner = self.user )
    pages = [ operations.create_page(original) for _ in xrange(10) ]
    self.assert_( original.num_pages == 10 )
    tail = operations.split_document(original, 5)
    self.assert_( original.num_pages == 5 )
    self.assert_( tail.num_pages == 5 )
    for idx in xrange(5):
        # The front half stays in the original document ...
        self.assert_(pages[idx].pk ==
                     original.pages.get(position=idx+1).pk)
        # ... and the back half lands in the new one, renumbered from 1.
        self.assert_(pages[idx+5].pk ==
                     tail.pages.get(position=idx+1).pk)
def handle_work_item(processor, item):
    """
    Pick up a (possibly) multipage TIFF upload and turn it into a
    document having (possibly) multiple individual pages.
    """
    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    work_dir = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')
    # tiffsplit writes one single-page TIFF per frame under page_prefix.
    os.system( 'tiffsplit %r %r' % (local_path, page_prefix) )
    document = operations.create_document(
        owner = asset.owner,
        title = 'Uploaded on %s (%s)' % (
            asset.date_created, asset.producer.process ))
    page_files = glob.glob('%s*.tif*' % page_prefix)
    page_files.sort()
    new_work = []
    position = 0
    for page_tiff_path in page_files:
        # Page positions are 1-based and follow sorted filename order.
        position += 1
        new_work.extend( handle_page(
            processor, asset, document, page_tiff_path, position ))
    if document is not None:
        new_work.append( document.assets.get(
            asset_class__name = models.AssetClass.DOCUMENT,
            mime_type__name = models.MimeType.BINARY ))
    return new_work
def handle_work_item(processor, item):
    """
    Pick up a (possibly) multipage PDF upload and turn it into a
    document having (possibly) multiple individual pages.

    Returns the list of page assets plus the document's binary
    DOCUMENT asset for follow-up processing.
    """
    asset = item['Asset-Instance']
    local_path = item['Local-Path']
    # NOTE(review): the original also read item['Is-New'] into an
    # unused local; dropped here since nothing consumed it.
    work_dir = os.path.dirname(local_path)
    page_prefix = os.path.join(work_dir, 'page-')
    asset_list = []
    pdf.split_pages( local_path, page_prefix )
    document = operations.create_document(
        asset.owner,
        title = 'Uploaded on %s (%s)' % (
            asset.date_created, asset.producer.process ))
    position = 1
    all_page_files = glob.glob('%s*.pdf' % page_prefix)
    all_page_files.sort()
    for page_pdf_path in all_page_files:
        # create_page returns the follow-up assets for this page.
        asset_list.extend( create_page(
            processor, asset, document, page_pdf_path, position ))
        position += 1
    asset_list.append( document.assets.get(
        asset_class__name = models.AssetClass.DOCUMENT,
        mime_type__name = models.MimeType.BINARY ))
    return asset_list
def test_tag_documents_by_time(self):
    """Documents uploaded within a time threshold share one upload-aggregate tag."""
    # Create an unclassified document (tag_document is never called on it).
    doc0 = operations.create_document( owner = self.user )
    asset0 = operations.create_asset_from_stream(
        owner = self.user,
        producer = self.producer,
        asset_class = models.AssetClass.DOCUMENT,
        data_stream = StringIO('some pdf'),
        file_name = 'create_asset_from_string.txt',
        child_number = 1,
        related_document = doc0,
        mime_type = models.MimeType.PDF )
    sleep(2)
    doc1 = operations.create_document( owner = self.user )
    # 1-second threshold: doc1 starts a fresh aggregate tag.
    pdf_generator.tag_document(doc1, datetime.timedelta(0, 1))
    asset1 = operations.create_asset_from_stream(
        owner = self.user,
        producer = self.producer,
        asset_class = models.AssetClass.DOCUMENT,
        data_stream = StringIO('some pdf'),
        file_name = 'create_asset_from_string.txt',
        child_number = 1,
        related_document = doc1,
        mime_type = models.MimeType.PDF )
    # Do we have a new tag?
    self.assert_( doc1.tags.all().count() == 1 )
    tag1 = doc1.tags.all()[0]
    self.assert_(tag1.tag_class == models.Tag.UPLOAD_AGGREGATE)
    # Sleep 3 seconds -- longer than doc2's 1-second threshold.
    sleep(3)
    doc2 = operations.create_document( owner = self.user )
    pdf_generator.tag_document(doc2, datetime.timedelta(0, 1))
    # The second document must land in a different tag.
    self.assert_( doc2.tags.all().count() == 1 )
    tag2 = doc2.tags.all()[0]
    self.assert_(tag2.label != tag1.label)
    asset2 = operations.create_asset_from_stream(
        owner = self.user,
        producer = self.producer,
        asset_class = models.AssetClass.DOCUMENT,
        data_stream = StringIO('some pdf'),
        file_name = 'create_asset_from_string.txt',
        child_number = 1,
        related_document = doc2,
        mime_type = models.MimeType.PDF )
    # Sleep another 3 seconds and create the 3rd document, but with a
    # 10-second threshold, so it should join doc2's tag.
    sleep(3)
    doc3 = operations.create_document( owner = self.user )
    pdf_generator.tag_document(doc3, datetime.timedelta(0, 10))
    # Did this one get tagged with the same tag?
    # BUG FIX: the original re-checked doc2's tag count here (already
    # asserted above) instead of verifying doc3 was tagged; an untagged
    # doc3 would then raise IndexError on the [0] below rather than
    # fail the assertion cleanly.
    self.assert_( doc3.tags.all().count() == 1 )
    self.assert_( tag2 == doc3.tags.all()[0] )
    asset3 = operations.create_asset_from_stream(
        owner = self.user,
        producer = self.producer,
        asset_class = models.AssetClass.DOCUMENT,
        data_stream = StringIO('some pdf'),
        file_name = 'create_asset_from_string.txt',
        child_number = 1,
        related_document = doc3,
        mime_type = models.MimeType.PDF )