def test_creation_of_dataset_from_graph(self):
    ds = DataSet.get_dataset_from_graph(
        dataset_graph_uri=self.dataset_graph_uri,
        store=self.store
    )
    self.assertIsNotNone(ds)
    self.assertIsInstance(ds, DataSet)
    self.assertEqual(
        ds.named_graph,
        URIRef(self.dataset_graph_uri)
    )
    self.assertIn(
        "Ton Smits Huis",
        [group.name for group in ds.groups.all()]
    )
    ds_from_db = DataSet.objects.get(named_graph=self.dataset_graph_uri)
    self.assertEqual(
        ds_from_db.spec,
        str(ds.spec)
    )
    self.assertTrue(
        self.store.ask(
            named_graph=self.dataset_graph_uri,
            query="""where {{?s <http://schemas.delving.eu/narthex/terms/synced> true }}"""
        )
    )
def purge_deleted_datasets(store):
    """Find datasets which are deleted and purge all their information in Nave."""
    nr_deleted, datasets_uris = find_datasets_by_sync_or_deleted_status(store, deleted=True)
    if nr_deleted == 0:
        logger.info("No deleted datasets found.")
        return 0
    for dataset_uri in datasets_uris:
        # pass both arguments by keyword; every other call site uses
        # get_dataset_from_graph(dataset_graph_uri=..., store=...), so the
        # positional (store, dataset_uri) order here was swapped
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_uri, store=store)
        delete_dataset_with_all_triples(ds, store)
        ds.delete()
    logger.info("Purged {} datasets from Nave and Narthex: {}".format(nr_deleted, datasets_uris))
    return nr_deleted
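# A minimal usage sketch for a manual purge run, assuming the same rdfstore
# helper used by schedule_out_of_sync_datasets below; how this is wired into
# a periodic job is deployment-specific and not shown here.
store = rdfstore.get_rdfstore(False)
purged = purge_deleted_datasets(store)
logger.info("purge run removed {} datasets".format(purged))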
def test_context_graph(self):
    ds = DataSet.get_dataset_from_graph(
        dataset_graph_uri=self.dataset_graph_uri,
        store=self.store
    )
    es_actions = []
    edm_record, es_action = tasks.synchronise_record(
        graph_uri="http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph",
        ds=ds,
        store=self.store,
        es_actions=es_actions
    )
    context_graph, nr_levels = edm_record.get_context_graph(
        store=self.store,
        named_graph=edm_record.named_graph
    )
    self.assertIsNotNone(context_graph)
    self.assertIsInstance(context_graph, Graph)
    predicates = set(context_graph.predicates())
    assert URIRef('http://www.openarchives.org/ore/terms/aggregates') in predicates
def test_graph_indexing(self):
    ds = DataSet.get_dataset_from_graph(
        dataset_graph_uri=self.dataset_graph_uri,
        store=self.store
    )
    es_actions = []
    edm_record, es_action = tasks.synchronise_record(
        graph_uri="http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph",
        ds=ds,
        store=self.store,
        es_actions=es_actions
    )
    self.assertTrue(
        edm_record.hub_id.endswith("ton-smits-huis_454")
    )
    action = edm_record.create_es_action(
        index=self.index_name,
        record_type="Aggregation",
        store=self.store,
        exclude_fields=['dc_rights']
    )
    self.assertIsNotNone(action)
    assert 'dc_rights' not in action['_source']
    assert action['_source']['system']['delving_recordType'] == "Aggregation"
    required_fields = [
        "_op_type", "_index", "_type", "_id", "_source"
    ]
    # "graph", "slug", "delving_hubId", "delving_spec", "delving_recordType"
    assert set(es_action.keys()).issuperset(required_fields)
    assert 'about' in es_action['_source']
    assert 'edm_object' in es_action['_source']
    assert 'rdf' in es_action['_source']
    assert 'system' in es_action['_source']
    subjects = es_action['_source']['dc_subject']
    assert 'dc_rights' in es_action['_source']
    inline_id = 'http://data.cultureelerfgoed.nl/semnet/7403e26d-cf33-4372-ad72-a2f9fcf8f63b'
    inlined_example = next(
        subject for subject in subjects
        if subject.get('id') == inline_id
    )
    assert inlined_example
    assert inlined_example['id'] == inline_id
    assert inlined_example['value'] == "bomen"
    assert inlined_example['lang'] == "nl"
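# Sketch of the bulk-action shape these assertions imply; the field values
# below are illustrative placeholders, not output captured from a real run.
example_es_action = {
    "_op_type": "index",
    "_index": "test_index",
    "_type": "void_edmrecord",        # doc_type asserted in test_synchronise_dataset
    "_id": "<orgid>_ton-smits-huis_454",  # prefix varies; tests only check the suffix
    "_source": {
        "about": {},                  # resource-level summary fields
        "edm_object": {},             # EDM object link (e.g. a thumbnail)
        "rdf": {},                    # serialised source graph
        "system": {"delving_recordType": "Aggregation"},
        "dc_subject": [
            # inlined SKOS-style label resolved from the context graph
            {"id": "http://data.cultureelerfgoed.nl/semnet/...", "value": "bomen", "lang": "nl"},
        ],
    },
}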
def test_synchronise_record(self):
    ds = DataSet.get_dataset_from_graph(
        dataset_graph_uri=self.dataset_graph_uri,
        store=self.store
    )
    graph_list = tasks.get_out_of_sync_dataset_record_graph_uris(
        self.dataset_graph_uri,
        self.store
    )
    es_actions = []
    edm_record, es_action = tasks.synchronise_record(
        graph_uri=graph_list[0],
        ds=ds,
        store=self.store,
        es_actions=es_actions
    )
    self.assertIsNotNone(edm_record)
    self.assertIsNotNone(es_action)
    self.assertEqual(
        EDMRecord.objects.count(),
        1,
        "Only one record should be saved"
    )
    self.assertEqual(
        edm_record.dataset,
        ds
    )
    self.assertTrue(
        edm_record.hub_id.endswith("ton-smits-huis_454")
    )
    self.assertRegex(
        edm_record.hub_id,
        "(.*?)_(.*?)_(.*?)"
    )
    assert edm_record.document_uri == 'http://localhost:8000/resource/aggregation/ton-smits-huis/454'
    self.assertEqual(
        edm_record.named_graph,
        URIRef('http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph')
    )
def schedule_out_of_sync_datasets(acceptance=False, store=None):
    """Find all out of sync datasets and schedule synchronisation tasks for each."""
    if not store:
        store = rdfstore.get_rdfstore(acceptance)
    nr_datasets, datasets = find_datasets_with_records_out_of_sync(store)
    if nr_datasets == 0:
        return 0
    logger.info("Found {} datasets that have records that are out of sync".format(nr_datasets))
    scheduled_for_indexing = 0
    for dataset_uri in datasets:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_uri, store=store)
        ds.records_in_sync = False
        if ds.can_be_synchronised:
            logger.info("status: {}, {}, {}".format(ds.stay_in_sync, ds.sync_in_progress, ds.has_sync_error))
            process_key = str(uuid.uuid1())
            ds.process_key = process_key
            ds.save()
            synchronise_dataset_records.apply_async(
                kwargs={'store': store, 'ds': ds},
                task_id=process_key
            )
            scheduled_for_indexing += 1
            logger.info("Scheduled {} for indexing with {} records".format(ds.spec, ds.valid))
    return scheduled_for_indexing
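# Because the dataset's process_key doubles as the Celery task_id above, a
# scheduled run can later be inspected (or revoked) from the key stored on the
# model. A minimal sketch, assuming a standard Celery result backend:
from celery.result import AsyncResult

result = AsyncResult(ds.process_key)
logger.info("sync task for {} is in state {}".format(ds.spec, result.state))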
def synchronise_dataset_metadata(store, dataset_graph_uri):
    """Synchronise the metadata of the dataset between Narthex and Nave."""
    ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_graph_uri, store=store)
    ds.save()
    return ds
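# Usage sketch: refresh only a dataset's metadata without touching its
# records (record synchronisation is handled separately by
# synchronise_dataset_records below); the graph URI is the one used in the
# tests above and is illustrative.
ds = synchronise_dataset_metadata(
    store,
    "http://localhost:8000/resource/dataset/ton-smits-huis/graph"
)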
def test_synchronise_dataset(self):
    from search import get_es_client
    client = get_es_client()
    s = Search(client).index(self.index_name)
    client.delete_by_query(index=self.index_name, q="*:*")
    es_response = s.execute()
    self.assertEqual(
        es_response.hits.total,
        0
    )
    self.assertEqual(
        EDMRecord.objects.count(),
        0
    )
    assert self.store.ask(
        query="""where {{
            GRAPH <http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph>
            {{?s <http://schemas.delving.eu/narthex/terms/synced> false}}
        }}"""
    )
    response = tasks.synchronise_dataset_records(
        dataset_graph_uri=self.dataset_graph_uri,
        store=self.store,
        index=self.index_name
    )
    # self.assertTrue(response.successful)
    # self.assertEqual(response.result, 1)
    self.assertEqual(
        EDMRecord.objects.count(),
        1
    )
    time.sleep(2)
    es_response = s.execute()
    self.assertEqual(
        es_response.hits.total,
        1,
        "there should be one record in the test index"
    )
    record = es_response.hits[0]
    self.assertEqual(
        record.meta.doc_type,
        "void_edmrecord"
    )
    self.assertEqual(
        "_".join(record.meta.id.split('_')[1:]),
        "ton-smits-huis_454"
    )
    # test if switch is flipped
    assert self.store.ask(
        query="""where {{
            GRAPH <http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph>
            {{?s <http://schemas.delving.eu/narthex/terms/synced> true}}
        }}"""
    )
    # test if dataset is deleted from index
    ds = DataSet.get_dataset_from_graph(
        dataset_graph_uri=self.dataset_graph_uri,
        store=self.store
    )
    ds.delete_from_index(self.index_name)
    es_response = s.execute()
    self.assertEqual(
        es_response.hits.total,
        0,
        "there should be no records in the test index after the dataset is deleted"
    )
    rdf_store_response = ds.delete_from_triple_store(self.store)
    assert rdf_store_response
    assert not self.store.ask(
        query="""where {{
            GRAPH <http://localhost:8000/resource/dataset/ton-smits-huis/graph>
            {{?s ?p ?o}}
        }}"""
    )
def synchronise_dataset_records(store, dataset_graph_uri=None, ds=None, index=settings.SITE_NAME):
    """Iterate over all records that are out of sync for a dataset and update
    them in the index and database.
    """
    if not ds and dataset_graph_uri:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_graph_uri, store=store)
    elif ds and not dataset_graph_uri:
        dataset_graph_uri = ds.document_uri
    elif not dataset_graph_uri and not ds:
        raise ValueError("Unable to find dataset due to missing value in dataset_graph_uri and/or ds")
    logger.info("Graph uri to synchronise: {}".format(dataset_graph_uri))
    # materialize nodes
    # ore:aggregates + remove ore:isAggregatedBy
    graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)
    if not ds.stay_in_sync:
        logger.warning("Should not start synchronisation for {} when marked as not stay in sync".format(ds.spec))
        return 0
    elif ds.has_sync_error:
        logger.warning("Can't start synchronisation of {} due to previous sync error.".format(ds.spec))
        return 0
    ds.has_sync_error = False
    ds.sync_error_message = None
    ds.records_in_sync = False
    ds.processed_records = 0
    ds.save()
    records_processed = 0
    # bind these before the try block so the except clause below can always
    # reference them, even when the failure happens before the first iteration
    graph_uri = None
    valid_records = ds.valid
    try:
        while len(graph_list) > 0:
            actions = []
            # todo: use the graphs instead of the URIs
            for graph_uri in graph_list:
                synchronise_record(graph_uri, ds, store, actions, index=index)
            # index actions
            logger.info("number of actions scheduled: {}".format(len(actions)))
            response = helpers.bulk(client=get_es(), actions=actions, stats_only=True)
            records_processed += len(graph_list)
            logger.info("processed {}/{} for {}".format(records_processed, valid_records, ds.spec))
            logger.debug("ElasticSearch bulk update: {}".format(response))
            update_switch = [
                QueryType.remove_insert.format(
                    named_graph=g,
                    remove="?s <http://schemas.delving.eu/narthex/terms/synced> false",
                    insert="?s <http://schemas.delving.eu/narthex/terms/synced> true"
                ) for g in graph_list
            ]
            response = store.update(query="\n".join(update_switch))
            logger.debug("SPARQL update succeeded: {}".format(response))
            ds = DataSet.objects.get(id=ds.id)
            ds.processed_records = records_processed
            ds.save()
            graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)
        ds.process_key = None
        ds.records_in_sync = True
        ds.dataset_type = DataSetType.aggregated
        if not ds.oai_pmh.real > 0:
            ds.oai_pmh = OaiPmhPublished.none
        ds.save()
        logger.info("Finished synchronising {} records from dataset: {}".format(records_processed, dataset_graph_uri))
    except Exception as e:
        logger.error("Unable to index all records for dataset {} due to {} at record {}.".format(ds.spec, e, graph_uri))
        ds.sync_error_message = "{} with error: {}".format(graph_uri, e)
        ds.has_sync_error = True
        ds.process_key = None
        ds.save()
        logger.warning("Only indexed {} of {} valid records for dataset {}".format(records_processed, valid_records, ds.spec))
    return records_processed
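# Rough shape of the per-graph update built from QueryType.remove_insert
# above. The exact template lives in QueryType and is not shown in this
# section, so this rendering is an assumption for illustration only; the
# graph URI is the one used in the tests above.
example_update = """
WITH <http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph>
DELETE { ?s <http://schemas.delving.eu/narthex/terms/synced> false }
INSERT { ?s <http://schemas.delving.eu/narthex/terms/synced> true }
WHERE  { ?s <http://schemas.delving.eu/narthex/terms/synced> false }
"""
# One such block is generated per synchronised graph and the blocks are joined
# with newlines into a single SPARQL update request, flipping the synced flag
# for a whole batch in one round trip to the store.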