def test_preservation_pending_museum_package_frozen(self, session):
    """
    A frozen MuseumObject must not report 'preservation_pending' even
    when it would otherwise be eligible for preservation
    """
    now = datetime.datetime.now(datetime.timezone.utc)

    package = MuseumPackage(
        sip_filename="fake_package.tar",
        object_modified_date=now - datetime.timedelta(days=50),
        downloaded=True,
        packaged=True,
        uploaded=True
    )
    frozen_object = MuseumObject(
        id=1,
        preserved=True,
        frozen=True,
        modified_date=now - datetime.timedelta(days=15),
        created_date=now - datetime.timedelta(days=90)
    )
    frozen_object.packages.append(package)
    frozen_object.latest_package = package
    assert frozen_object.latest_package == frozen_object.packages[0]

    session.add(frozen_object)
    session.commit()

    # Frozen objects are never pending, neither via the property nor
    # via the query-level filter
    assert not frozen_object.preservation_pending
    assert_preservation_pending_count(session.query(MuseumObject), 0)
def test_museum_object_attachments(self, session):
    # Three attachments; the middle one is shared by both objects
    attachment_a = MuseumAttachment(id=10, filename="testAttachment.tar")
    attachment_b = MuseumAttachment(id=20, filename="testAttachment2.tar")
    attachment_c = MuseumAttachment(id=30, filename="testAttachment3.tar")

    session.add(MuseumObject(id=10, attachments=[attachment_a, attachment_b]))
    session.add(MuseumObject(id=20, attachments=[attachment_b, attachment_c]))
    session.commit()

    object_a = session.query(MuseumObject).get(10)
    object_b = session.query(MuseumObject).get(20)

    # Each object sees its own attachments in insertion order, and the
    # shared attachment is visible from both sides
    filenames_a = [att.filename for att in object_a.attachments]
    filenames_b = [att.filename for att in object_b.attachments]
    assert filenames_a == ["testAttachment.tar", "testAttachment2.tar"]
    assert filenames_b == ["testAttachment2.tar", "testAttachment3.tar"]
def test_preservation_pending_museum_package_no_date(self, session):
    # Timestamp 50 days in the past, applied as the modification date
    # later in the test
    fifty_days_ago = (
        datetime.datetime.now(datetime.timezone.utc)
        - datetime.timedelta(days=50)
    )
    mus_object = MuseumObject(
        id=1,
        preserved=True,
        metadata_hash="new_hash",
        attachment_metadata_hash=""
    )
    package = MuseumPackage(
        sip_filename="fake_package.tar",
        downloaded=True,
        packaged=True,
        uploaded=True,
        preserved=True,
        metadata_hash="old_hash",
        attachment_metadata_hash=""
    )
    mus_object.packages.append(package)
    mus_object.latest_package = package
    session.add(mus_object)
    session.commit()

    # Without a modification date the object is not pending
    assert not mus_object.preservation_pending
    assert_preservation_pending_count(session.query(MuseumObject), 0)

    # Once a modification date is set, the object becomes pending
    mus_object.modified_date = fifty_days_ago
    session.commit()
    assert mus_object.preservation_pending
    assert_preservation_pending_count(session.query(MuseumObject), 1)
def test_preservation_pending_museum_package(self, session):
    now = datetime.datetime.now(datetime.timezone.utc)

    # The object already has one package, but it was modified again
    # 35 days later and therefore needs preservation once more
    pending_object = MuseumObject(
        id=1,
        preserved=True,
        modified_date=now - datetime.timedelta(days=15),
        created_date=now - datetime.timedelta(days=90),
        metadata_hash="new_hash",
        attachment_metadata_hash=""
    )
    package = MuseumPackage(
        sip_filename="fake_package.tar",
        object_modified_date=now - datetime.timedelta(days=50),
        downloaded=True,
        packaged=True,
        uploaded=True,
        metadata_hash="old_hash",
        attachment_metadata_hash=""
    )
    pending_object.packages.append(package)
    pending_object.latest_package = package
    assert pending_object.latest_package == pending_object.packages[0]
    session.add(pending_object)

    # A second, freshly-created object that is up to date
    session.add(
        MuseumObject(
            id=2,
            created_date=datetime.datetime.now(datetime.timezone.utc),
            modified_date=datetime.datetime.now(datetime.timezone.utc),
            preserved=True
        )
    )
    session.commit()

    # Only the up-to-date object survives the exclusion filter
    excluded = (
        session.query(MuseumObject)
        .with_transformation(MuseumObject.exclude_preservation_pending)
        .one()
    )
    assert excluded.id == 2

    # ...and only the modified object is found by the pending filter
    assert pending_object.preservation_pending
    pending = (
        session.query(MuseumObject)
        .with_transformation(MuseumObject.filter_preservation_pending)
        .one()
    )
    assert pending.id == 1

    # If the modification date matches the packaged date again, no
    # preservation is needed
    pending_object.modified_date = now - datetime.timedelta(days=50)
    assert not pending_object.preservation_pending
    session.commit()
    assert_preservation_pending_count(session.query(MuseumObject), 0)
def test_preservation_pending_preserved(self, session):
    # An object with no packages that is over 30 days old should be
    # eligible for preservation
    old_object = MuseumObject(
        id=1,
        preserved=False,
        modified_date=datetime.datetime.now(datetime.timezone.utc),
        created_date=(
            datetime.datetime.now(datetime.timezone.utc)
            - datetime.timedelta(days=31)
        ),
        metadata_hash="",
        attachment_metadata_hash=""
    )

    # One object pending preservation and one recent object that is not
    session.add(old_object)
    session.add(
        MuseumObject(
            id=2,
            modified_date=datetime.datetime.now(datetime.timezone.utc),
            created_date=datetime.datetime.now(datetime.timezone.utc),
            metadata_hash="",
            attachment_metadata_hash=""
        )
    )
    session.commit()

    # The recent object is the only one excluded from pending
    excluded = (
        session.query(MuseumObject)
        .with_transformation(MuseumObject.exclude_preservation_pending)
        .one()
    )
    assert excluded.id == 2

    expected_id = old_object.id
    old_object = (
        session.query(MuseumObject)
        .with_transformation(MuseumObject.filter_preservation_pending)
    ).one()
    assert old_object.id == expected_id
    assert old_object.preservation_pending

    # An object newer than 30 days is not preserved even without any
    # packages
    old_object.created_date = datetime.datetime.now(datetime.timezone.utc)
    assert not old_object.preservation_pending
    session.commit()
    assert_preservation_pending_count(session.query(MuseumObject), 0)
def test_packages(self, session):
    mus_object = MuseumObject(id=1337, preserved=True)
    first_package = MuseumPackage(
        sip_filename="test_one.tar",
        museum_object=mus_object,
    )
    # The second package is created later, so it sorts after the first
    second_package = MuseumPackage(
        sip_filename="test_two.tar",
        museum_object=mus_object,
        created_date=(
            datetime.datetime.now(datetime.timezone.utc)
            + datetime.timedelta(minutes=10)
        )
    )
    session.add_all([first_package, second_package])
    session.commit()

    mus_object = session.query(MuseumObject).filter_by(id=1337).first()
    assert len(mus_object.packages) == 2
    assert [pkg.sip_filename for pkg in mus_object.packages] == [
        "test_one.tar", "test_two.tar"
    ]

    # Deleting a package removes it from the relationship, and the
    # back-reference on the remaining package still works
    session.delete(second_package)
    session.commit()
    assert len(mus_object.packages) == 1
    assert mus_object.packages[0].museum_object.id == 1337
def func(**kwargs):
    # Replace missing *or falsy* dates with the shared test date.
    # NOTE: deliberately uses a truthiness check, so an explicit
    # None/empty value is also replaced.
    for date_field in ("created_date", "modified_date"):
        if not kwargs.get(date_field):
            kwargs[date_field] = TEST_DATE
    museum_object = MuseumObject(**kwargs)
    session.add(museum_object)
    session.commit()
    return museum_object
def func(**kwargs):
    # Fill in placeholder dates only for keys the caller did not
    # provide at all; explicitly passed values (even None) are kept
    kwargs.setdefault("created_date", PLACEHOLDER_DATE)
    kwargs.setdefault("modified_date", PLACEHOLDER_DATE)
    museum_object = MuseumObject(**kwargs)
    session.add(museum_object)
    session.commit()
    return museum_object
def test_museum_object_freeze_source(self, session):
    session.add(
        MuseumObject(id=1337, frozen=True, freeze_source=FreezeSource.USER)
    )

    # Filtering by the enum value only matches objects frozen with
    # that source
    user_count = session.query(MuseumObject).filter(
        MuseumObject.freeze_source == FreezeSource.USER
    ).count()
    automatic_count = session.query(MuseumObject).filter(
        MuseumObject.freeze_source == FreezeSource.AUTOMATIC
    ).count()
    assert user_count == 1
    assert automatic_count == 0
def test_museum_object(self, session):
    session.add(MuseumObject(id=1337, preserved=True))

    # Query the object back and check the flag survived the round trip
    retrieved = session.query(MuseumObject).filter_by(id=1337).first()
    assert retrieved.preserved
def test_preservation_pending_preserved_no_date(self, session):
    # An object without any modification date is automatically
    # eligible for preservation
    mus_object = MuseumObject(
        id=1,
        preserved=False,
        metadata_hash="",
        attachment_metadata_hash=""
    )
    session.add(mus_object)
    session.commit()

    assert mus_object.preservation_pending
    assert_preservation_pending_count(session.query(MuseumObject), 1)
def test_sync_processed_sips_accepted(session, museum_packages_dir, sftp_dir, redis, sftp_package_factory, sync_processed_sips):
    """
    'sync_processed_sips' picks the newest of two accepted ingest
    reports for the same object, downloads it, marks the package as
    preserved, writes a status file and enqueues a confirmation task.
    """
    # Create local package directory
    museum_packages_dir.joinpath("123456", "logs").mkdir(parents=True)
    museum_packages_dir.joinpath("123456", "sip", "reports").mkdir(parents=True)

    # Create two accepted SIPs on the mocked SFTP server.
    # The newer one will be selected according to its newer modification date
    new_package_dir = sftp_package_factory(
        status="accepted", date=datetime.datetime(2019, 5, 28),
        object_id=123456, sip_id="AABBCC2", transfer_id="aabbcc",
        content="New report")
    # mtime 10 minutes ago -> newer than the other report below
    os.utime(
        new_package_dir
        / "20190102_Object_123456-AABBCC2.tar-aabbcc-ingest-report.xml",
        (time.time() - 600, time.time() - 600))
    old_package_dir = sftp_package_factory(
        status="accepted", date=datetime.datetime(2019, 5, 28),
        object_id=123456, sip_id="CCBBAA2", transfer_id="ccbbaa",
        content="Old report")
    # mtime 20 minutes ago -> older; this report must be ignored
    os.utime(
        old_package_dir
        / "20190102_Object_123456-CCBBAA2.tar-ccbbaa-ingest-report.xml",
        (time.time() - 1200, time.time() - 1200))

    # Object.xml is required to load MuseumObjectPackage locally
    report_path = Path(__file__).parent.resolve() / "data" / "Object.xml"
    shutil.copyfile(
        report_path,
        museum_packages_dir / "123456" / "sip" / "reports" / "Object.xml")

    # Database state: one object whose latest package corresponds to
    # the newer SIP on the SFTP server
    db_museum_object = MuseumObject(
        id=123456, created_date=TEST_DATE, modified_date=TEST_DATE)
    db_museum_package = MuseumPackage(
        sip_filename="20190102_Object_123456-AABBCC2.tar",
        sip_id="AABBCC2",
        object_modified_date=TEST_DATE,
        downloaded=True,
        packaged=True,
        uploaded=True,
        museum_object=db_museum_object)
    db_museum_object.latest_package = db_museum_package
    session.add(db_museum_object)
    session.commit()

    # Freeze time so the "--days 7" window covers 2019-05-28
    with freezegun.freeze_time("2019-06-01"):
        result = sync_processed_sips(["--days", "7"])

    assert "Found 2 on 2019-05-28" in result.stdout
    assert "Found 2 accepted SIPs" in result.stdout
    assert "Found 0 rejected SIPs" in result.stdout

    # Ingest reports are downloaded
    assert museum_packages_dir.joinpath(
        "123456",
        "logs", "ingest-report.xml").read_text(
        ) == "<xml><content>New report</content></xml>"
    assert museum_packages_dir.joinpath(
        "123456", "logs", "ingest-report.html").read_text(
        ) == "<html><body>New report</body></html>"

    # Museum package is updated
    db_museum_package = session.query(MuseumPackage).filter_by(
        sip_filename="20190102_Object_123456-AABBCC2.tar").one()
    assert db_museum_package.preserved

    # Status file is created
    assert (
        museum_packages_dir / "123456"
        / "20190102_Object_123456-AABBCC2.tar.status").read_text() == "accepted"

    # RQ task is enqueued
    queue = get_queue(QueueType.CONFIRM_SIP)
    job = queue.jobs[0]
    assert job.id == "confirm_sip_123456"
    assert job.kwargs == {"object_id": 123456, "sip_id": "AABBCC2"}
async def sync_objects(offset=0, limit=None, save_progress=False):
    """
    Synchronize object metadata from MuseumPlus to determine which
    objects have changed and need to be updated in the DPRES service.

    This is followed by 'sync_hashes'.

    :param int offset: Offset to start synchronizing from
    :param int limit: How many objects to sync before stopping.
                      Default is None, meaning all available objects
                      are synchronized.
    :param bool save_progress: Whether to save synchronization progress
                               and continue from the last run.
                               Offset and limit are ignored if enabled.
    """
    modify_date_gte = None
    if save_progress:
        limit = None
        sync_status = get_sync_status("sync_objects")
        offset = sync_status.offset
        # Start synchronization from objects that changed since the last
        # sync
        modify_date_gte = sync_status.prev_start_sync_date
        print(f"Continuing synchronization from {offset}")

    museum_session = await get_museum_session()
    object_iter = iterate_objects(
        session=museum_session, offset=offset,
        modify_date_gte=modify_date_gte)

    all_iterated = False
    # 'index' tracks the absolute offset into the remote result set;
    # 'processed' counts objects handled during this run (for 'limit')
    index = offset
    processed = 0

    while True:
        # Pull the next chunk of at most CHUNK_SIZE results.
        # 'all_iterated' stays True only if the iterator was exhausted
        # before yielding anything.
        results = []
        all_iterated = True
        async for result in object_iter:
            all_iterated = False
            results.append(result)
            index += 1
            if len(results) >= CHUNK_SIZE:
                break

        objects = {result["id"]: result for result in results}
        object_ids = list(objects.keys())

        inserts, updates = 0, 0
        with scoped_session() as db:
            # IDs in this chunk that already exist in the local database
            existing_object_ids = set([
                result.id for result in db.query(MuseumObject).options(
                    load_only("id")).filter(MuseumObject.id.in_(object_ids))
            ])

            object_id2attachment_id = defaultdict(set)
            attachment_ids = set()
            update_params = []

            # Update existing objects, create the rest
            for result in objects.values():
                object_id = int(result["id"])
                title = result["title"]
                modified_date = result["modified_date"]
                created_date = result["created_date"]
                multimedia_ids = result["multimedia_ids"]
                xml_hash = result["xml_hash"]

                object_id2attachment_id[object_id].update(multimedia_ids)
                attachment_ids.update(multimedia_ids)

                if object_id in existing_object_ids:
                    # Don't run the update query instantly; instead,
                    # set the parameters and run them all together later
                    # in bulk
                    update_params.append({
                        "_id": object_id,
                        "_title": title,
                        "_modified_date": modified_date,
                        "_metadata_hash": xml_hash
                    })
                    updates += 1
                else:
                    # Create
                    mus_object = MuseumObject(
                        id=object_id, title=title,
                        modified_date=modified_date,
                        created_date=created_date,
                        metadata_hash=xml_hash)
                    db.add(mus_object)
                    inserts += 1

                processed += 1
                if limit is not None and processed == limit:
                    all_iterated = True
                    break

            if update_params:
                # Perform updates in bulk.
                # stmt_a always refreshes title and metadata hash;
                # stmt_b only moves 'modified_date' forward (or sets it
                # when it is NULL), never backwards.
                stmt_a = (MuseumObject.__table__.update().where(
                    MuseumObject.id == bindparam("_id")).values({
                        "title": bindparam("_title"),
                        "metadata_hash": bindparam("_metadata_hash")
                    }))
                stmt_b = (MuseumObject.__table__.update().where(
                    and_(
                        MuseumObject.id == bindparam("_id"),
                        or_(
                            MuseumObject.modified_date == None,
                            MuseumObject.modified_date <
                            bindparam("_modified_date")))).values(
                                {"modified_date": bindparam("_modified_date")}))
                db.execute(stmt_a, update_params)
                db.execute(stmt_b, update_params)

            # Create/update MuseumAttachments with references
            # to the newly updated MuseumObjects.
            # For performance reasons update references for a batch
            # of objects at once
            objects = (db.query(MuseumObject).filter(
                MuseumObject.id.in_(object_ids)))
            attachments = bulk_create_or_get(
                db, MuseumAttachment, attachment_ids)
            attachments_by_id = {
                attachment.id: attachment for attachment in attachments
            }
            for museum_object in objects:
                museum_object.attachments = [
                    attachments_by_id[attachment_id]
                    for attachment_id
                    in object_id2attachment_id[museum_object.id]
                ]

        results = []

        print(
            f"Updated, {inserts} inserts, {updates} "
            f"updates. Updating from offset: {index}")

        # Submit heartbeat after each successful iteration instead of once
        # at the end. This is because this script is designed to be stopped
        # before it has finished iterating everything.
        submit_heartbeat(HeartbeatSource.SYNC_OBJECTS)

        if save_progress:
            update_offset("sync_objects", offset=index)

        if all_iterated:
            if save_progress:
                finish_sync_progress("sync_objects")
            break

    await museum_session.close()