Esempio n. 1
0
 def setUpClass(cls):
     """Build the shared AnatomyTool fixtures and harvest all seeds once per test case."""
     super().setUpClass()
     AnatomyToolOAIPMHFactory.create_common_anatomy_tool_responses()
     cls.set_spec = "anatomy_tool"
     cls.begin_of_time = make_aware(datetime(year=1970, month=1, day=1))
     cls.seeds = get_harvest_seeds(
         Repositories.ANATOMY_TOOL, cls.set_spec, cls.begin_of_time
     )
Esempio n. 2
0
 def test_handle_upsert_seeds(self):
     """Upserting active seeds into a fresh collection creates one document per seed."""
     command = self.get_command_instance()
     collection = Collection.objects.create(
         name=self.spec_set,
         dataset_version=DatasetVersion.objects.last(),
         referee="external_id"
     )
     all_seeds = get_harvest_seeds(self.repository, self.spec_set,
                                   make_aware(datetime(year=1970, month=1, day=1)))
     upserts = [seed for seed in all_seeds if seed.get("state", "active") == "active"]
     documents_count = command.handle_upsert_seeds(collection, upserts)
     # When dealing with an entirely new Dataset
     # Then the document count should equal output of handle_upsert_seeds
     self.assertEqual(collection.document_set.count(), documents_count)
     for document in collection.document_set.all():
         self.assertEqual(document.reference, document.properties["external_id"])
         metadata_pipeline = document.pipeline["metadata"]
         self.assertEqual(metadata_pipeline["resource"], self.repository.lower())
         self.assertIsInstance(metadata_pipeline["id"], int)
         self.assertTrue(metadata_pipeline["success"])
         # Only this particular reference is expected to carry an extension.
         if "5af0e26f-c4d2-4ddd-94ab-7dd0bd531751" in document.reference:
             self.assertIsNotNone(document.extension)
         else:
             self.assertIsNone(document.extension)
Esempio n. 3
0
 def test_get_complete_set_without_deletes(self):
     """The full Edurep set minus deleted records should hold 14 intact seeds."""
     seeds = get_harvest_seeds(
         Repositories.EDUREP, self.set_spec, self.begin_of_time, include_deleted=False
     )
     self.assertEqual(len(seeds), 14)
     self.check_seed_integrity(seeds, include_deleted=False)
Esempio n. 4
0
 def test_get_partial_set(self):
     """Harvesting from a mid-stream timestamp should yield only the 6 delta seeds."""
     cutoff = make_aware(datetime(year=2020, month=2, day=10, hour=22, minute=22))
     seeds = get_harvest_seeds(Repositories.EDUREP, self.set_spec, cutoff)
     self.assertEqual(len(seeds), 6)
     self.check_seed_integrity(seeds)
Esempio n. 5
0
def sync_sharekit_metadata():
    """
    Synchronize Sharekit metadata deltas into the current DatasetVersion.

    Selects COMPLETE Sharekit harvests belonging to the latest active Dataset,
    takes a lock through the ``is_syncing`` flag, fetches metadata changed since
    each harvest's ``latest_update_at``, applies the parsed seeds to the matching
    Collection in batches, advances ``latest_update_at`` and releases the lock.
    Returns None; exits early when there is no active Dataset or the lock is held.
    """
    # Select which data to sync this run
    latest_active_dataset = Dataset.objects.filter(is_active=True).last()
    if not latest_active_dataset:
        return
    dataset_version = DatasetVersion.objects.get_current_version()
    harvest_queryset = Harvest.objects.filter(
        dataset=latest_active_dataset,
        source__repository=Repositories.SHAREKIT,
        stage=HarvestStages.COMPLETE  # prevents syncing materials half way a full harvest
    )
    # First we acquire a permanent lock on Harvests,
    # because if latest_update_at is a while ago this command will run a long time.
    # We don't want to keep all those syncing changes waiting in that one transaction.
    try:
        with atomic():
            harvest_queryset.filter(is_syncing=False).select_for_update(
                nowait=True).update(is_syncing=True)
    except DatabaseError:
        logger.warning(
            "Did not acquire lock on Harvester when syncing Sharekit metadata")
        return
    # Now that we're the only ones starting the sync we execute it
    for harvest in harvest_queryset.filter(is_syncing=True):
        # Check that a non-valid harvest source didn't slip through the lock
        if harvest.stage != HarvestStages.COMPLETE:
            # Fixed: was `logging.warning` (root logger); use the module logger
            # consistently with the lock-failure warning above.
            logger.warning(
                "Encountered a non-complete harvest source during sync")
            continue
        # Recording which time will become latest_update_at
        current_time = make_aware(datetime.now())
        # Getting metadata from Sharekit and stop immediately if anything went wrong
        send_config = create_config("http_resource", {
            "resource": harvest.source.repository,
            "continuation_limit": 10000,
        })
        set_specification = harvest.source.spec
        scc, err = send(set_specification,
                        f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
                        config=send_config,
                        method="get")
        if len(err) or not len(scc):
            continue
        # Now parse the metadata and update current Collection for this Harvest
        seeds = get_harvest_seeds(Repositories.SHAREKIT,
                                  set_specification,
                                  harvest.latest_update_at,
                                  include_no_url=True)
        collection = dataset_version.collection_set.filter(
            name=harvest.source.spec).last()
        for seeds_batch in ibatch(seeds, batch_size=32):
            collection.update(seeds_batch, "external_id")
        # Last but not least we update the harvest update time to get a different delta later
        harvest.latest_update_at = current_time
        harvest.save()
    # And we release the syncing lock
    with atomic():
        harvest_queryset.filter(is_syncing=True).select_for_update().update(
            is_syncing=False)
Esempio n. 6
0
 def preprocess_seeds(self, harvest_queryset):
     """Harvest seeds for every source and bucket them into upserts/deletes per collection."""
     self.logger.start("preprocess")
     seeds_by_collection = defaultdict(list)
     source_count = harvest_queryset.count()
     for harvest in harvest_queryset:
         set_specification = harvest.source.spec
         upserts, deletes = [], []
         harvested = get_harvest_seeds(harvest.source.repository,
                                       harvest.source.spec,
                                       harvest.latest_update_at,
                                       include_no_url=True)
         for seed in harvested:
             # Seeds default to "active" when no state is present.
             target = upserts if seed.get("state", "active") == "active" else deletes
             target.append(seed)
         collection_key = (harvest.source.repository, harvest.source.spec)
         # Append the two buckets as separate entries under the collection key.
         seeds_by_collection[collection_key].extend([upserts, deletes])
         self.logger.progress(f"preprocess.{set_specification}",
                              source_count,
                              success=len(upserts) + len(deletes))
     self.logger.end("preprocess")
     return seeds_by_collection
Esempio n. 7
0
 def setUpClass(cls):
     """Prepare shared Edurep delta fixtures and harvest the full seed list once."""
     super().setUpClass()
     EdurepOAIPMHFactory.create_common_edurep_responses(include_delta=True)
     cls.set_spec = "surfsharekit"
     cls.begin_of_time = make_aware(datetime(year=1970, month=1, day=1))
     cls.seeds = get_harvest_seeds(Repositories.EDUREP, cls.set_spec, cls.begin_of_time)
Esempio n. 8
0
 def test_get_copyright(self):
     """Copyright fields should be parsed and invalid-copyright records filtered out."""
     seeds = self.seeds
     self.assertEqual(len(seeds), 200, "Expected get_harvest_seeds to filter differently based on copyright")
     self.assertEqual(seeds[0]["copyright"], "open-access")
     self.assertIsNone(seeds[2]["copyright"], "Expected deleted record to have no copyright")
     filtered = get_harvest_seeds(Repositories.HAN, SET_SPECIFICATION, self.begin_of_time, include_deleted=False)
     self.assertEqual(len(filtered), 67, "Expected get_harvest_seeds to delete invalid copyright")
     self.assertEqual(filtered[1]["copyright"], "open-access")
Esempio n. 9
0
 def test_get_copyright(self):
     """HVA copyright values should be normalized; invalid ones dropped without deletes."""
     self.assertEqual(len(self.seeds), 20)
     self.assertEqual(self.seeds[0]["copyright"], "yes")
     self.assertEqual(self.seeds[3]["copyright"], "open-access")
     remaining = get_harvest_seeds(Repositories.HVA, SET_SPECIFICATION, self.begin_of_time, include_deleted=False)
     self.assertEqual(len(remaining), 6, "Expected get_harvest_seeds to delete invalid copyright")
     self.assertEqual(remaining[0]["copyright"], "open-access")
Esempio n. 10
0
 def test_get_files(self):
     """The first seed exposes its single PDF; seeds without a URL get an empty file list."""
     expected_file = {
         "mime_type": "application/pdf",
         "url": "https://octo.hku.nl/octo/repository/getfile?id=zZQC1ZBu8c4",
         "hash": "9ac373e877133f0c00173bd02d82b1861c9934a2",
         "title": "Budapest2005.pdf"
     }
     self.assertEqual(self.seeds[0]["files"], [expected_file])
     all_seeds = get_harvest_seeds(Repositories.HKU, SET_SPECIFICATION, self.begin_of_time, include_no_url=True)
     self.assertEqual(all_seeds[21]["files"], [])
Esempio n. 11
0
 def test_get_partial_set_without_deletes(self):
     """A delta harvest excluding deleted records should return 4 intact seeds."""
     cutoff = make_aware(datetime(year=2020, month=2, day=10, hour=22, minute=22))
     seeds = get_harvest_seeds(
         Repositories.EDUREP, self.set_spec, cutoff, include_deleted=False
     )
     self.assertEqual(len(seeds), 4)
     self.check_seed_integrity(seeds, include_deleted=False)
Esempio n. 12
0
 def test_handle_deletion_seeds(self):
     """Deletion seeds whose targets never existed should be a harmless no-op."""
     collection = Collection.objects.create(
         name=self.spec_set,
         dataset_version=DatasetVersion.objects.last()
     )
     command = self.get_command_instance()
     all_seeds = get_harvest_seeds(self.repository, self.spec_set,
                                   make_aware(datetime(year=1970, month=1, day=1)))
     deletes = [seed for seed in all_seeds if seed.get("state", "active") != "active"]
     # Basically we're testing that deletion seeds are not triggering errors when their targets do not exist.
     command.handle_deletion_seeds(collection, deletes)
     self.assertEqual(collection.document_set.count(), 0)
Esempio n. 13
0
 def test_handle_upsert_seeds(self):
     """Upserting a delta into an existing collection should insert new documents,
     update changed ones in place and leave untouched documents unmodified."""
     dataset_version = DatasetVersion.objects.last()
     collection = Collection.objects.get(name=self.spec_set, dataset_version=dataset_version, referee="external_id")
     command = self.get_command_instance()
     # Checking the state before the test
     document_count = collection.document_set.count()
     # NOTE: this queryset is lazy; it gets re-evaluated after the upsert below
     # to prove the old title is gone.
     vortex_queryset = collection.documents.filter(properties__title="Using a Vortex | Wageningen UR")
     self.assertEqual(vortex_queryset.count(), 1,
                      "Expected the start state to contain 'Using a Vortex'")
     for doc in collection.documents.all():
         self.assertEqual(doc.created_at, doc.modified_at, f"Document is unexpectedly updated: {doc.id}")
     # Perform the test
     upserts = [
         seed
         for seed in get_harvest_seeds(self.repository, self.spec_set,
                                       make_aware(datetime(year=2019, month=12, day=31)))
         if seed.get("state", "active") == "active"
     ]
     command.handle_upsert_seeds(collection, upserts)
     # Checking the state after the test
     self.assertEqual(collection.document_set.count(), document_count+3)
     # Check video documents content updates
     vortex_updateset = collection.documents.filter(properties__title="Using a Vortex (responsibly) | Wageningen UR")
     self.assertEqual(vortex_updateset.count(), 1)
     self.assertEqual(vortex_queryset.count(), 0)
     # Check regular document content updates
     handson_insertset = collection.documents.filter(
         properties__title="Hands-off exercise based on WEKA - Tuning and Testing"
     )
     self.assertEqual(handson_insertset.count(), 1)
     processed_ids = set()
     # Updated documents must show modified_at > created_at and keep reference integrity.
     for update in vortex_updateset:
         self.assertNotEqual(update.created_at, update.modified_at,
                             f"Document is unexpectedly not updated: {update.id}")
         self.assertEqual(update.reference, update.properties["external_id"])
         processed_ids.add(update.id)
     # Freshly inserted documents have (near-)equal timestamps; microseconds are
     # stripped to tolerate sub-second drift between the two writes.
     for insert in handson_insertset:
         self.assertEqual(insert.created_at.replace(microsecond=0), insert.modified_at.replace(microsecond=0),
                          f"Document is unexpectedly not inserted: {insert.id}")
         self.assertEqual(insert.reference, insert.properties["external_id"])
         processed_ids.add(insert.id)
     # Everything the upsert did not touch must remain unmodified.
     not_updated = collection.documents.exclude(id__in=processed_ids)
     self.assertNotEqual(not_updated.count(), 0)
     for not_update in not_updated:
         self.assertEqual(
             not_update.created_at.replace(microsecond=0), not_update.modified_at.replace(microsecond=0),
             f"Document is unexpectedly updated after upsert: {not_update.id}"
         )
Esempio n. 14
0
 def test_get_copyright(self):
     seeds = self.seeds
     self.assertEqual(
         len(seeds), 100,
         "Expected get_harvest_seeds to filter differently based on copyright"
     )
     self.assertEqual(seeds[0]["copyright"], "open-access")
     self.assertEqual(seeds[1]["copyright"], "yes")
     seeds = get_harvest_seeds(Repositories.GREENI,
                               SET_SPECIFICATION,
                               self.begin_of_time,
                               include_deleted=False)
     self.assertEqual(
         len(seeds), 97,
         "Expected get_harvest_seeds to delete invalid copyright")
     self.assertEqual(seeds[1]["copyright"], "open-access")
Esempio n. 15
0
 def test_handle_deletion_seeds(self):
     """Processing deletion seeds should remove exactly one existing document."""
     if self.spec_set == "edusources":
         self.skipTest("Deletion not supported by Sharekit backend")
     collection = Collection.objects.get(
         name=self.spec_set,
         dataset_version=DatasetVersion.objects.last()
     )
     command = self.get_command_instance()
     document_count = collection.document_set.count()
     all_seeds = get_harvest_seeds(self.repository, self.spec_set,
                                   make_aware(datetime(year=2019, month=12, day=31)))
     deletes = [seed for seed in all_seeds if seed.get("state", "active") != "active"]
     document_deletes = command.handle_deletion_seeds(collection, deletes)
     self.assertEqual(document_deletes, 1)
     self.assertEqual(collection.document_set.count(), document_count - document_deletes)
Esempio n. 16
0
 def setUpClass(cls):
     """Register common HAN OAI-PMH responses and harvest all seeds once."""
     super().setUpClass()
     HanOAIPMHFactory.create_common_responses()
     cls.begin_of_time = make_aware(datetime(year=1970, month=1, day=1))
     cls.seeds = get_harvest_seeds(Repositories.HAN, SET_SPECIFICATION, cls.begin_of_time)
 def setUpClass(cls):
     """Prepare Sharekit delta responses and harvest the edusources seed list."""
     super().setUpClass()
     SharekitMetadataHarvestFactory.create_common_sharekit_responses(include_delta=True)
     cls.set_spec = "edusources"
     cls.begin_of_time = make_aware(datetime(year=1970, month=1, day=1))
     cls.seeds = get_harvest_seeds(Repositories.SHAREKIT, cls.set_spec, cls.begin_of_time)
 def test_analysis_allowed_property(self):
     """Seeds from the private channel must never allow analysis."""
     for seed in get_harvest_seeds(Repositories.SHAREKIT, "edusourcesprivate", self.begin_of_time):
         self.assertFalse(seed["analysis_allowed"])
 def test_is_restricted(self):
     """Every seed from the private channel must be flagged as restricted."""
     for seed in get_harvest_seeds(Repositories.SHAREKIT, "edusourcesprivate", self.begin_of_time):
         self.assertTrue(seed["is_restricted"])