def test_preservation_pending_museum_package_frozen(self, session):
        """
        Verify that a frozen MuseumObject is not reported as pending
        preservation, even though it was modified after the date recorded
        in its latest package.
        """
        current_time = datetime.datetime.now(datetime.timezone.utc)

        def days_ago(count):
            # Timestamp lying `count` days before `current_time`
            return current_time - datetime.timedelta(days=count)

        latest_package = MuseumPackage(
            sip_filename="fake_package.tar",
            object_modified_date=days_ago(50),
            downloaded=True,
            packaged=True,
            uploaded=True,
        )
        frozen_object = MuseumObject(
            id=1,
            preserved=True,
            frozen=True,
            modified_date=days_ago(15),
            created_date=days_ago(90),
        )
        frozen_object.packages.append(latest_package)
        frozen_object.latest_package = latest_package
        assert frozen_object.packages[0] == frozen_object.latest_package

        session.add(frozen_object)
        session.commit()

        # Neither the instance attribute nor the query-level count may
        # report the frozen object as pending
        assert not frozen_object.preservation_pending
        assert_preservation_pending_count(session.query(MuseumObject), 0)
    def test_museum_object_attachments(self, session):
        """
        Verify that two MuseumObjects can share a MuseumAttachment and that
        each object's attachments can be read back after a commit.
        """
        attachment_a, attachment_b, attachment_c = (
            MuseumAttachment(id=attachment_id, filename=filename)
            for attachment_id, filename in (
                (10, "testAttachment.tar"),
                (20, "testAttachment2.tar"),
                (30, "testAttachment3.tar"),
            )
        )

        # The second attachment is linked to both objects
        session.add_all([
            MuseumObject(id=10, attachments=[attachment_a, attachment_b]),
            MuseumObject(id=20, attachments=[attachment_b, attachment_c]),
        ])
        session.commit()

        expected_filenames = {
            10: ("testAttachment.tar", "testAttachment2.tar"),
            20: ("testAttachment2.tar", "testAttachment3.tar"),
        }
        stored_objects = {
            object_id: session.query(MuseumObject).get(object_id)
            for object_id in expected_filenames
        }

        for object_id, filenames in expected_filenames.items():
            stored_attachments = stored_objects[object_id].attachments
            for position, filename in enumerate(filenames):
                assert stored_attachments[position].filename == filename
    def test_preservation_pending_museum_package_no_date(self, session):
        """
        Verify that an object whose own and whose latest package's
        modification dates are both unset is not pending preservation, and
        that it becomes pending once the object gets a modification date.
        """
        fifty_days_ago = (
            datetime.datetime.now(datetime.timezone.utc)
            - datetime.timedelta(days=50)
        )

        # Neither the package nor the object carries a modification date
        undated_package = MuseumPackage(
            sip_filename="fake_package.tar",
            downloaded=True,
            packaged=True,
            uploaded=True,
            preserved=True,
            metadata_hash="old_hash",
            attachment_metadata_hash="",
        )
        undated_object = MuseumObject(
            id=1,
            preserved=True,
            metadata_hash="new_hash",
            attachment_metadata_hash="",
        )
        undated_object.packages.append(undated_package)
        undated_object.latest_package = undated_package

        session.add(undated_object)
        session.commit()

        assert not undated_object.preservation_pending
        assert_preservation_pending_count(session.query(MuseumObject), 0)

        # Giving the object a modification date makes it pending
        undated_object.modified_date = fifty_days_ago
        session.commit()

        assert undated_object.preservation_pending
        assert_preservation_pending_count(session.query(MuseumObject), 1)
    def test_preservation_pending_museum_package(self, session):
        """
        Verify that an object modified after the date recorded in its
        latest package is pending preservation, and that it stops being
        pending once its modification date equals the recorded one.
        """
        def utc_now():
            return datetime.datetime.now(datetime.timezone.utc)

        reference_time = utc_now()

        def days_ago(count):
            # Timestamp lying `count` days before `reference_time`
            return reference_time - datetime.timedelta(days=count)

        # The existing package recorded a modification 50 days ago, but the
        # object was modified again 35 days after that (15 days ago), so it
        # needs to be preserved again.
        repackaged_object = MuseumObject(
            id=1,
            preserved=True,
            modified_date=days_ago(15),
            created_date=days_ago(90),
            metadata_hash="new_hash",
            attachment_metadata_hash="",
        )
        outdated_package = MuseumPackage(
            sip_filename="fake_package.tar",
            object_modified_date=days_ago(50),
            downloaded=True,
            packaged=True,
            uploaded=True,
            metadata_hash="old_hash",
            attachment_metadata_hash="",
        )
        repackaged_object.packages.append(outdated_package)
        repackaged_object.latest_package = outdated_package
        assert repackaged_object.packages[0] == repackaged_object.latest_package

        session.add(repackaged_object)
        session.add(
            MuseumObject(
                id=2,
                created_date=utc_now(),
                modified_date=utc_now(),
                preserved=True,
            )
        )
        session.commit()

        # Only the second object is returned when pending ones are excluded
        not_pending = (
            session.query(MuseumObject)
            .with_transformation(MuseumObject.exclude_preservation_pending)
            .one()
        )
        assert not_pending.id == 2

        assert repackaged_object.preservation_pending
        pending = (
            session.query(MuseumObject)
            .with_transformation(MuseumObject.filter_preservation_pending)
            .one()
        )
        assert pending.id == 1

        # With the same modification date as the package recorded, no
        # preservation is needed
        repackaged_object.modified_date = days_ago(50)
        assert not repackaged_object.preservation_pending

        session.commit()

        assert_preservation_pending_count(session.query(MuseumObject), 0)
    def test_preservation_pending_preserved(self, session):
        """
        Verify that an unpreserved object without packages is pending
        preservation when it was created 31 days ago, and that it is no
        longer pending once its creation date is reset to the current time.
        """
        def utc_now():
            return datetime.datetime.now(datetime.timezone.utc)

        # No packages and more than 30 days old: should be preserved
        old_object = MuseumObject(
            id=1,
            preserved=False,
            modified_date=utc_now(),
            created_date=utc_now() - datetime.timedelta(days=31),
            metadata_hash="",
            attachment_metadata_hash="",
        )
        recent_object = MuseumObject(
            id=2,
            modified_date=utc_now(),
            created_date=utc_now(),
            metadata_hash="",
            attachment_metadata_hash="",
        )

        # One object pending preservation and one that is not
        session.add(old_object)
        session.add(recent_object)
        session.commit()

        # Excluding pending objects leaves only the recent one
        not_pending = (
            session.query(MuseumObject)
            .with_transformation(MuseumObject.exclude_preservation_pending)
            .one()
        )
        assert not_pending.id == 2

        expected_id = old_object.id

        old_object = (
            session.query(MuseumObject)
            .with_transformation(MuseumObject.filter_preservation_pending)
            .one()
        )

        assert old_object.id == expected_id
        assert old_object.preservation_pending

        # An object that is too recent (newer than 30 days) won't be
        # preserved, even if it has no packages yet
        old_object.created_date = utc_now()
        assert not old_object.preservation_pending

        session.commit()

        assert_preservation_pending_count(session.query(MuseumObject), 0)
    def test_packages(self, session):
        """
        Verify that packages attached to a MuseumObject are listed on the
        object, that deleting a package removes it from that list, and that
        a remaining package still refers back to its object.
        """
        owner = MuseumObject(id=1337, preserved=True)

        # The second package gets a creation date ten minutes in the future
        ten_minutes_ahead = (
            datetime.datetime.now(datetime.timezone.utc)
            + datetime.timedelta(minutes=10)
        )
        first_package = MuseumPackage(
            sip_filename="test_one.tar",
            museum_object=owner,
        )
        second_package = MuseumPackage(
            sip_filename="test_two.tar",
            museum_object=owner,
            created_date=ten_minutes_ahead,
        )
        session.add_all([first_package, second_package])
        session.commit()

        owner = (
            session.query(MuseumObject)
            .filter_by(id=1337)
            .first()
        )

        assert owner.packages[0].sip_filename == "test_one.tar"
        assert owner.packages[1].sip_filename == "test_two.tar"
        assert len(owner.packages) == 2

        session.delete(second_package)
        session.commit()

        assert len(owner.packages) == 1

        assert owner.packages[0].museum_object.id == 1337
    def func(**kwargs):
        """
        Create a MuseumObject from the given keyword arguments, add it to
        the session and commit.

        `created_date` and `modified_date` are replaced with TEST_DATE when
        they are missing or falsy (an explicit None is overridden as well).

        :returns: The committed MuseumObject
        """
        for date_field in ("created_date", "modified_date"):
            if not kwargs.get(date_field):
                kwargs[date_field] = TEST_DATE

        created = MuseumObject(**kwargs)
        session.add(created)
        session.commit()

        return created
    def func(**kwargs):
        """
        Create a MuseumObject from the given keyword arguments, add it to
        the session and commit.

        `created_date` and `modified_date` default to PLACEHOLDER_DATE only
        when the key is absent; an explicitly passed value (including None)
        is kept as is.

        :returns: The committed MuseumObject
        """
        kwargs.setdefault("created_date", PLACEHOLDER_DATE)
        kwargs.setdefault("modified_date", PLACEHOLDER_DATE)

        created = MuseumObject(**kwargs)
        session.add(created)
        session.commit()

        return created
    def test_museum_object_freeze_source(self, session):
        """
        Verify that MuseumObjects can be filtered by their freeze source.
        """
        session.add(
            MuseumObject(
                id=1337,
                frozen=True,
                freeze_source=FreezeSource.USER,
            )
        )

        def count_frozen_by(source):
            # Number of objects whose freeze source equals `source`
            return (
                session.query(MuseumObject)
                .filter(MuseumObject.freeze_source == source)
                .count()
            )

        assert count_frozen_by(FreezeSource.USER) == 1
        assert count_frozen_by(FreezeSource.AUTOMATIC) == 0
    def test_museum_object(self, session):
        """
        Verify that a MuseumObject added to the session can be queried back
        by its ID with its `preserved` flag set.
        """
        session.add(MuseumObject(id=1337, preserved=True))

        # Read the object back through a query
        retrieved = (
            session.query(MuseumObject)
            .filter_by(id=1337)
            .first()
        )

        assert retrieved.preserved
    def test_preservation_pending_preserved_no_date(self, session):
        """
        Verify that an unpreserved object created without any dates is
        reported as pending preservation.
        """
        # No creation or modification date is given to the object
        undated_object = MuseumObject(
            id=1,
            preserved=False,
            metadata_hash="",
            attachment_metadata_hash="",
        )

        session.add(undated_object)
        session.commit()

        assert undated_object.preservation_pending
        assert_preservation_pending_count(session.query(MuseumObject), 1)
def test_sync_processed_sips_accepted(session, museum_packages_dir, sftp_dir,
                                      redis, sftp_package_factory,
                                      sync_processed_sips):
    """
    Run 'sync_processed_sips' with two accepted SIPs for the same object on
    the mocked SFTP server and check the command output, the downloaded
    ingest reports, the database entry, the status file and the enqueued
    confirmation job.
    """
    object_dir = museum_packages_dir / "123456"
    logs_dir = object_dir / "logs"
    reports_dir = object_dir / "sip" / "reports"

    # Create local package directory
    logs_dir.mkdir(parents=True)
    reports_dir.mkdir(parents=True)

    # Two accepted SIPs are created on the mocked SFTP server; the
    # assertions below expect the report of the one whose ingest report
    # has the newer modification time.
    # Fields: SIP ID, transfer ID, report content, report age in seconds,
    # ingest report filename
    sftp_packages = (
        ("AABBCC2", "aabbcc", "New report", 600,
         "20190102_Object_123456-AABBCC2.tar-aabbcc-ingest-report.xml"),
        ("CCBBAA2", "ccbbaa", "Old report", 1200,
         "20190102_Object_123456-CCBBAA2.tar-ccbbaa-ingest-report.xml"),
    )
    for sip_id, transfer_id, content, age, report_name in sftp_packages:
        package_dir = sftp_package_factory(
            status="accepted",
            date=datetime.datetime(2019, 5, 28),
            object_id=123456,
            sip_id=sip_id,
            transfer_id=transfer_id,
            content=content,
        )
        os.utime(
            package_dir / report_name,
            (time.time() - age, time.time() - age),
        )

    # Object.xml is required to load MuseumObjectPackage locally
    shutil.copyfile(
        Path(__file__).parent.resolve() / "data" / "Object.xml",
        reports_dir / "Object.xml",
    )

    stored_object = MuseumObject(
        id=123456,
        created_date=TEST_DATE,
        modified_date=TEST_DATE,
    )
    stored_object.latest_package = MuseumPackage(
        sip_filename="20190102_Object_123456-AABBCC2.tar",
        sip_id="AABBCC2",
        object_modified_date=TEST_DATE,
        downloaded=True,
        packaged=True,
        uploaded=True,
        museum_object=stored_object,
    )

    session.add(stored_object)
    session.commit()

    with freezegun.freeze_time("2019-06-01"):
        result = sync_processed_sips(["--days", "7"])

    for expected_message in (
            "Found 2 on 2019-05-28",
            "Found 2 accepted SIPs",
            "Found 0 rejected SIPs"):
        assert expected_message in result.stdout

    # Ingest reports are downloaded
    xml_report = logs_dir / "ingest-report.xml"
    html_report = logs_dir / "ingest-report.html"
    assert xml_report.read_text() == "<xml><content>New report</content></xml>"
    assert html_report.read_text() == "<html><body>New report</body></html>"

    # Museum package is updated
    stored_package = (
        session.query(MuseumPackage)
        .filter_by(sip_filename="20190102_Object_123456-AABBCC2.tar")
        .one()
    )
    assert stored_package.preserved

    # Status file is created
    status_file = object_dir / "20190102_Object_123456-AABBCC2.tar.status"
    assert status_file.read_text() == "accepted"

    # RQ task is enqueued
    confirm_job = get_queue(QueueType.CONFIRM_SIP).jobs[0]
    assert confirm_job.id == "confirm_sip_123456"
    assert confirm_job.kwargs == {"object_id": 123456, "sip_id": "AABBCC2"}
Exemple #13
0
async def sync_objects(offset=0, limit=None, save_progress=False):
    """
    Synchronize object metadata from MuseumPlus to determine which
    objects have changed and need to be updated in the DPRES service. This
    is followed by 'sync_hashes'.

    Objects are fetched and written to the database in chunks of at most
    CHUNK_SIZE entries. A heartbeat is submitted after each chunk. The
    MuseumPlus session is closed when the function exits, including when
    synchronization fails with an exception.

    :param int offset: Offset to start synchronizing from
    :param int limit: How many objects to sync before stopping.
        Default is None, meaning all available objects are synchronized.
    :param bool save_progress: Whether to save synchronization progress
                               and continue from the last run. Offset and limit
                               are ignored if enabled.
    """
    modify_date_gte = None

    if save_progress:
        limit = None

        sync_status = get_sync_status("sync_objects")
        offset = sync_status.offset
        # Start synchronization from objects that changed since the last
        # sync
        modify_date_gte = sync_status.prev_start_sync_date
        print(f"Continuing synchronization from {offset}")

    museum_session = await get_museum_session()

    try:
        object_iter = iterate_objects(session=museum_session,
                                      offset=offset,
                                      modify_date_gte=modify_date_gte)
        index = offset
        processed = 0

        while True:
            results = []

            # Remains True only if the iterator yields nothing this round
            all_iterated = True
            async for result in object_iter:
                all_iterated = False
                results.append(result)
                index += 1

                if len(results) >= CHUNK_SIZE:
                    break

            objects = {result["id"]: result for result in results}
            object_ids = list(objects.keys())

            inserts, updates = 0, 0

            with scoped_session() as db:
                existing_object_ids = {
                    row.id
                    for row in db.query(MuseumObject)
                    .options(load_only("id"))
                    .filter(MuseumObject.id.in_(object_ids))
                }

                object_id2attachment_id = defaultdict(set)
                attachment_ids = set()

                update_params = []

                # Update existing objects, create the rest
                for result in objects.values():
                    object_id = int(result["id"])
                    title = result["title"]
                    modified_date = result["modified_date"]
                    created_date = result["created_date"]
                    multimedia_ids = result["multimedia_ids"]
                    xml_hash = result["xml_hash"]

                    object_id2attachment_id[object_id].update(multimedia_ids)
                    attachment_ids.update(multimedia_ids)

                    if object_id in existing_object_ids:
                        # Don't run the update query instantly; instead,
                        # set the parameters and run them all together later
                        # in bulk
                        update_params.append({
                            "_id": object_id,
                            "_title": title,
                            "_modified_date": modified_date,
                            "_metadata_hash": xml_hash
                        })
                        updates += 1
                    else:
                        # Create
                        mus_object = MuseumObject(
                            id=object_id,
                            title=title,
                            modified_date=modified_date,
                            created_date=created_date,
                            metadata_hash=xml_hash)
                        db.add(mus_object)
                        inserts += 1

                    processed += 1

                    if limit is not None and processed == limit:
                        all_iterated = True
                        break

                if update_params:
                    # Perform updates in bulk.
                    # Title and metadata hash are always overwritten...
                    stmt_a = (
                        MuseumObject.__table__.update()
                        .where(MuseumObject.id == bindparam("_id"))
                        .values({
                            "title": bindparam("_title"),
                            "metadata_hash": bindparam("_metadata_hash"),
                        })
                    )
                    # ...while the modification date is only written if it
                    # is unset or older than the incoming one
                    stmt_b = (
                        MuseumObject.__table__.update()
                        .where(
                            and_(
                                MuseumObject.id == bindparam("_id"),
                                or_(
                                    MuseumObject.modified_date.is_(None),
                                    MuseumObject.modified_date <
                                    bindparam("_modified_date"))))
                        .values(
                            {"modified_date": bindparam("_modified_date")})
                    )
                    db.execute(stmt_a, update_params)
                    db.execute(stmt_b, update_params)

                # Create/update MuseumAttachments with references
                # to the newly updated MuseumObjects.
                # For performance reasons update references for a batch
                # of objects at once.
                #
                # Only objects handled in the loop above are re-linked.
                # When 'limit' stops the loop partway through a chunk, the
                # remaining objects have no entry in
                # 'object_id2attachment_id'; re-linking them as well would
                # replace their existing attachments with an empty list.
                processed_ids = list(object_id2attachment_id)
                museum_objects = db.query(MuseumObject).filter(
                    MuseumObject.id.in_(processed_ids))
                attachments = bulk_create_or_get(db, MuseumAttachment,
                                                 attachment_ids)
                attachments_by_id = {
                    attachment.id: attachment
                    for attachment in attachments
                }

                for museum_object in museum_objects:
                    museum_object.attachments = [
                        attachments_by_id[attachment_id] for attachment_id in
                        object_id2attachment_id[museum_object.id]
                    ]

            print(f"Updated, {inserts} inserts, {updates} "
                  f"updates. Updating from offset: {index}")

            # Submit heartbeat after each successful iteration instead of
            # once at the end. This is because this script is designed to be
            # stopped before it has finished iterating everything.
            submit_heartbeat(HeartbeatSource.SYNC_OBJECTS)

            if save_progress:
                update_offset("sync_objects", offset=index)

            if all_iterated:
                if save_progress:
                    finish_sync_progress("sync_objects")

                break
    finally:
        # Release the MuseumPlus session even if synchronization failed
        await museum_session.close()