def setUpClass(cls):
    """Provision Anatomy Tool OAI-PMH fixture responses and preload seeds once per class."""
    super().setUpClass()
    cls.set_spec = "anatomy_tool"
    cls.begin_of_time = make_aware(datetime(1970, 1, 1))
    # Fixture responses must exist before get_harvest_seeds can parse them.
    AnatomyToolOAIPMHFactory.create_common_anatomy_tool_responses()
    cls.seeds = get_harvest_seeds(Repositories.ANATOMY_TOOL, cls.set_spec, cls.begin_of_time)
def test_handle_upsert_seeds(self):
    """Upserting active seeds into a fresh Collection creates one Document per seed."""
    dataset_version = DatasetVersion.objects.last()
    collection = Collection.objects.create(
        name=self.spec_set,
        dataset_version=dataset_version,
        referee="external_id"
    )
    command = self.get_command_instance()
    since = make_aware(datetime(1970, 1, 1))
    active_seeds = [
        seed for seed in get_harvest_seeds(self.repository, self.spec_set, since)
        if seed.get("state", "active") == "active"
    ]
    documents_count = command.handle_upsert_seeds(collection, active_seeds)
    # When dealing with an entirely new Dataset
    # the document count should equal the output of handle_upsert_seeds.
    self.assertEqual(collection.document_set.count(), documents_count)
    for document in collection.document_set.all():
        self.assertEqual(document.reference, document.properties["external_id"])
        metadata_pipeline = document.pipeline["metadata"]
        self.assertEqual(metadata_pipeline["resource"], self.repository.lower())
        self.assertIsInstance(metadata_pipeline["id"], int)
        self.assertTrue(metadata_pipeline["success"])
        # Only this specific record is expected to carry an extension.
        if "5af0e26f-c4d2-4ddd-94ab-7dd0bd531751" in document.reference:
            self.assertIsNotNone(document.extension)
        else:
            self.assertIsNone(document.extension)
def test_get_complete_set_without_deletes(self):
    """Excluding deleted records from the full set should leave 14 seeds."""
    active_only = get_harvest_seeds(
        Repositories.EDUREP, self.set_spec, self.begin_of_time, include_deleted=False
    )
    self.assertEqual(len(active_only), 14)
    self.check_seed_integrity(active_only, include_deleted=False)
def test_get_partial_set(self):
    """Harvesting from a mid-history cutoff should only return the delta of 6 seeds."""
    cutoff = make_aware(datetime(2020, 2, 10, hour=22, minute=22))
    delta_seeds = get_harvest_seeds(Repositories.EDUREP, self.set_spec, cutoff)
    self.assertEqual(len(delta_seeds), 6)
    self.check_seed_integrity(delta_seeds)
def sync_sharekit_metadata():
    """Incrementally sync Sharekit metadata into the current DatasetVersion.

    Only COMPLETE harvests of the latest active Dataset are considered.
    The ``is_syncing`` flag acts as a lock so that concurrent runs do not
    process the same sources; if the lock can't be acquired this run bails out.
    """
    # Select which data to sync this run
    latest_active_dataset = Dataset.objects.filter(is_active=True).last()
    if not latest_active_dataset:
        return
    dataset_version = DatasetVersion.objects.get_current_version()
    harvest_queryset = Harvest.objects.filter(
        dataset=latest_active_dataset,
        source__repository=Repositories.SHAREKIT,
        stage=HarvestStages.COMPLETE  # prevents syncing materials half way a full harvest
    )
    # First we acquire a permanent lock on Harvests,
    # because if latest_update_at is a while ago this command will run a long time.
    # We don't want to keep all those syncing changes waiting in that one transaction.
    try:
        with atomic():
            harvest_queryset.filter(is_syncing=False).select_for_update(nowait=True).update(is_syncing=True)
    except DatabaseError:
        logger.warning("Did not acquire lock on Harvester when syncing Sharekit metadata")
        return
    # Now that we're the only ones starting the sync we execute it
    for harvest in harvest_queryset.filter(is_syncing=True):
        # Check that a non-valid harvest source didn't slip through the lock.
        # Fix: use the module-level logger (was the root "logging" module),
        # keeping output consistent with the other log calls in this function.
        if harvest.stage != HarvestStages.COMPLETE:
            logger.warning("Encountered a non-complete harvest source during sync")
            continue
        # Recording which time will become latest_update_at
        current_time = make_aware(datetime.now())
        # Getting metadata from Sharekit and stop immediately if anything went wrong
        send_config = create_config("http_resource", {
            "resource": harvest.source.repository,
            "continuation_limit": 10000,
        })
        set_specification = harvest.source.spec
        scc, err = send(
            set_specification, f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
            config=send_config, method="get"
        )
        if len(err) or not len(scc):
            continue
        # Now parse the metadata and update current Collection for this Harvest
        seeds = get_harvest_seeds(Repositories.SHAREKIT, set_specification, harvest.latest_update_at,
                                  include_no_url=True)
        collection = dataset_version.collection_set.filter(name=harvest.source.spec).last()
        for seeds_batch in ibatch(seeds, batch_size=32):
            collection.update(seeds_batch, "external_id")
        # Last but not least we update the harvest update time to get a different delta later
        harvest.latest_update_at = current_time
        harvest.save()
    # And we release the syncing lock
    with atomic():
        harvest_queryset.filter(is_syncing=True).select_for_update().update(is_syncing=False)
def preprocess_seeds(self, harvest_queryset):
    """Split each harvest's seeds into upserts and deletes, grouped per (repository, spec)."""
    self.logger.start("preprocess")
    seeds_by_collection = defaultdict(list)
    source_count = harvest_queryset.count()
    for harvest in harvest_queryset:
        set_specification = harvest.source.spec
        upserts, deletes = [], []
        for seed in get_harvest_seeds(harvest.source.repository, harvest.source.spec,
                                      harvest.latest_update_at, include_no_url=True):
            # A missing "state" defaults to "active" and counts as an upsert.
            bucket = upserts if seed.get("state", "active") == "active" else deletes
            bucket.append(seed)
        seeds_by_collection[(harvest.source.repository, harvest.source.spec)] += (upserts, deletes,)
        self.logger.progress(f"preprocess.{set_specification}", source_count,
                             success=len(upserts) + len(deletes))
    self.logger.end("preprocess")
    return seeds_by_collection
def setUpClass(cls):
    """Load Edurep delta fixtures and preload the surfsharekit seeds once for the class."""
    super().setUpClass()
    cls.set_spec = "surfsharekit"
    cls.begin_of_time = make_aware(datetime(1970, 1, 1))
    # Fixture responses (including the delta) must exist before parsing seeds.
    EdurepOAIPMHFactory.create_common_edurep_responses(include_delta=True)
    cls.seeds = get_harvest_seeds(Repositories.EDUREP, cls.set_spec, cls.begin_of_time)
def test_get_copyright(self):
    """Copyright values survive parsing; invalid-copyright records drop with include_deleted=False."""
    all_seeds = self.seeds
    self.assertEqual(len(all_seeds), 200, "Expected get_harvest_seeds to filter differently based on copyright")
    self.assertEqual(all_seeds[0]["copyright"], "open-access")
    self.assertIsNone(all_seeds[2]["copyright"], "Expected deleted record to have no copyright")
    active_seeds = get_harvest_seeds(Repositories.HAN, SET_SPECIFICATION, self.begin_of_time,
                                     include_deleted=False)
    self.assertEqual(len(active_seeds), 67, "Expected get_harvest_seeds to delete invalid copyright")
    self.assertEqual(active_seeds[1]["copyright"], "open-access")
def test_get_copyright(self):
    """HVA copyright parsing: full set keeps raw values, filtered set drops invalid ones."""
    all_seeds = self.seeds
    self.assertEqual(len(all_seeds), 20)
    self.assertEqual(all_seeds[0]["copyright"], "yes")
    self.assertEqual(all_seeds[3]["copyright"], "open-access")
    active_seeds = get_harvest_seeds(Repositories.HVA, SET_SPECIFICATION, self.begin_of_time,
                                     include_deleted=False)
    self.assertEqual(len(active_seeds), 6, "Expected get_harvest_seeds to delete invalid copyright")
    self.assertEqual(active_seeds[0]["copyright"], "open-access")
def test_get_files(self):
    """File metadata is parsed per seed; a seed without a URL gets an empty files list."""
    expected_file = {
        "mime_type": "application/pdf",
        "url": "https://octo.hku.nl/octo/repository/getfile?id=zZQC1ZBu8c4",
        "hash": "9ac373e877133f0c00173bd02d82b1861c9934a2",
        "title": "Budapest2005.pdf"
    }
    self.assertEqual(self.seeds[0]["files"], [expected_file])
    all_seeds = get_harvest_seeds(Repositories.HKU, SET_SPECIFICATION, self.begin_of_time,
                                  include_no_url=True)
    self.assertEqual(all_seeds[21]["files"], [])
def test_get_partial_set_without_deletes(self):
    """A mid-history cutoff combined with include_deleted=False should yield 4 seeds."""
    cutoff = make_aware(datetime(2020, 2, 10, hour=22, minute=22))
    delta_seeds = get_harvest_seeds(Repositories.EDUREP, self.set_spec, cutoff, include_deleted=False)
    self.assertEqual(len(delta_seeds), 4)
    self.check_seed_integrity(delta_seeds, include_deleted=False)
def test_handle_deletion_seeds(self):
    """Deletion seeds whose targets do not exist should be a harmless no-op."""
    dataset_version = DatasetVersion.objects.last()
    collection = Collection.objects.create(name=self.spec_set, dataset_version=dataset_version)
    command = self.get_command_instance()
    since = make_aware(datetime(1970, 1, 1))
    deletion_seeds = [
        seed for seed in get_harvest_seeds(self.repository, self.spec_set, since)
        if seed.get("state", "active") != "active"
    ]
    # Basically we're testing that deletion seeds don't trigger errors when their targets are absent.
    command.handle_deletion_seeds(collection, deletion_seeds)
    self.assertEqual(collection.document_set.count(), 0)
def test_handle_upsert_seeds(self):
    """A delta upsert inserts new documents, updates changed ones and leaves the rest untouched."""
    dataset_version = DatasetVersion.objects.last()
    collection = Collection.objects.get(
        name=self.spec_set, dataset_version=dataset_version, referee="external_id"
    )
    command = self.get_command_instance()
    # State before the test: one vortex video present, nothing modified yet.
    document_count = collection.document_set.count()
    vortex_queryset = collection.documents.filter(properties__title="Using a Vortex | Wageningen UR")
    self.assertEqual(vortex_queryset.count(), 1, "Expected the start state to contain 'Using a Vortex'")
    for existing_doc in collection.documents.all():
        self.assertEqual(existing_doc.created_at, existing_doc.modified_at,
                         f"Document is unexpectedly updated: {existing_doc.id}")
    # Perform the test with the active seeds of the delta harvest.
    since = make_aware(datetime(2019, 12, 31))
    upserts = [
        seed for seed in get_harvest_seeds(self.repository, self.spec_set, since)
        if seed.get("state", "active") == "active"
    ]
    command.handle_upsert_seeds(collection, upserts)
    # State after the test: three new documents.
    self.assertEqual(collection.document_set.count(), document_count + 3)
    # Check video documents content updates: the old title is replaced by the new one.
    vortex_updateset = collection.documents.filter(
        properties__title="Using a Vortex (responsibly) | Wageningen UR"
    )
    self.assertEqual(vortex_updateset.count(), 1)
    self.assertEqual(vortex_queryset.count(), 0)
    # Check regular document content updates
    handson_insertset = collection.documents.filter(
        properties__title="Hands-off exercise based on WEKA - Tuning and Testing"
    )
    self.assertEqual(handson_insertset.count(), 1)
    processed_ids = set()
    for updated_doc in vortex_updateset:
        self.assertNotEqual(updated_doc.created_at, updated_doc.modified_at,
                            f"Document is unexpectedly not updated: {updated_doc.id}")
        self.assertEqual(updated_doc.reference, updated_doc.properties["external_id"])
        processed_ids.add(updated_doc.id)
    for inserted_doc in handson_insertset:
        self.assertEqual(inserted_doc.created_at.replace(microsecond=0),
                         inserted_doc.modified_at.replace(microsecond=0),
                         f"Document is unexpectedly not inserted: {inserted_doc.id}")
        self.assertEqual(inserted_doc.reference, inserted_doc.properties["external_id"])
        processed_ids.add(inserted_doc.id)
    # Everything we didn't touch must keep identical timestamps.
    untouched = collection.documents.exclude(id__in=processed_ids)
    self.assertNotEqual(untouched.count(), 0)
    for untouched_doc in untouched:
        self.assertEqual(
            untouched_doc.created_at.replace(microsecond=0),
            untouched_doc.modified_at.replace(microsecond=0),
            f"Document is unexpectedly updated after upsert: {untouched_doc.id}"
        )
def test_get_copyright(self):
    """GREENI copyright parsing: full set keeps raw values, filtered set drops invalid ones."""
    all_seeds = self.seeds
    self.assertEqual(
        len(all_seeds), 100,
        "Expected get_harvest_seeds to filter differently based on copyright"
    )
    self.assertEqual(all_seeds[0]["copyright"], "open-access")
    self.assertEqual(all_seeds[1]["copyright"], "yes")
    active_seeds = get_harvest_seeds(Repositories.GREENI, SET_SPECIFICATION, self.begin_of_time,
                                     include_deleted=False)
    self.assertEqual(len(active_seeds), 97, "Expected get_harvest_seeds to delete invalid copyright")
    self.assertEqual(active_seeds[1]["copyright"], "open-access")
def test_handle_deletion_seeds(self):
    """A delta containing one deletion seed should remove exactly one document."""
    if self.spec_set == "edusources":
        self.skipTest("Deletion not supported by Sharekit backend")
    dataset_version = DatasetVersion.objects.last()
    collection = Collection.objects.get(name=self.spec_set, dataset_version=dataset_version)
    command = self.get_command_instance()
    document_count = collection.document_set.count()
    since = make_aware(datetime(2019, 12, 31))
    deletion_seeds = [
        seed for seed in get_harvest_seeds(self.repository, self.spec_set, since)
        if seed.get("state", "active") != "active"
    ]
    document_deletes = command.handle_deletion_seeds(collection, deletion_seeds)
    self.assertEqual(document_deletes, 1)
    self.assertEqual(collection.document_set.count(), document_count - document_deletes)
def setUpClass(cls):
    """Provision HAN OAI-PMH fixture responses and preload seeds once for the class."""
    super().setUpClass()
    cls.begin_of_time = make_aware(datetime(1970, 1, 1))
    # Fixture responses must exist before get_harvest_seeds can parse them.
    HanOAIPMHFactory.create_common_responses()
    cls.seeds = get_harvest_seeds(Repositories.HAN, SET_SPECIFICATION, cls.begin_of_time)
def setUpClass(cls):
    """Load Sharekit delta fixtures and preload the edusources seeds once for the class."""
    super().setUpClass()
    cls.set_spec = "edusources"
    cls.begin_of_time = make_aware(datetime(1970, 1, 1))
    # Fixture responses (including the delta) must exist before parsing seeds.
    SharekitMetadataHarvestFactory.create_common_sharekit_responses(include_delta=True)
    cls.seeds = get_harvest_seeds(Repositories.SHAREKIT, cls.set_spec, cls.begin_of_time)
def test_analysis_allowed_property(self):
    """Seeds from the private set must never allow analysis."""
    private_seeds = get_harvest_seeds(Repositories.SHAREKIT, "edusourcesprivate", self.begin_of_time)
    for private_seed in private_seeds:
        self.assertFalse(private_seed["analysis_allowed"])
def test_is_restricted(self):
    """Seeds from the private set must always be marked as restricted."""
    private_seeds = get_harvest_seeds(Repositories.SHAREKIT, "edusourcesprivate", self.begin_of_time)
    for private_seed in private_seeds:
        self.assertTrue(private_seed["is_restricted"])