def get_list_empty_datasets(self):
    """Return the URIs of public datasets whose catalog record has no
    dcterms:modified value in the triple store.

    :rtype: list[str]|None -- None on failure (after logging)
    """
    from ckanext.ecportal.virtuoso.utils_triplestore_crud_helpers import TripleStoreCRUDHelpers
    try:
        tsch = TripleStoreCRUDHelpers()
        # The OPTIONAL/!bound pattern keeps only datasets whose catalog
        # record is missing a modification date.
        sparql_query = """
            select ?dataset from <dcatapop-public>
            where
            {
                ?dataset a <http://www.w3.org/ns/dcat#Dataset>.
                optional{
                    ?cr <http://xmlns.com/foaf/0.1/primaryTopic> ?dataset .
                    ?cr <http://purl.org/dc/terms/modified> ?o .
                }
                filter (!bound(?o))
            }
        """
        result = tsch.execute_select_query_auth(sparql_query)
        list_datasets = [ds.get('dataset').get('value') for ds in result]
        return list_datasets
    except BaseException:
        # Fixed: traceback.print_exc(e) returned None (so nothing was
        # logged) and its first positional argument is a line limit,
        # not an exception. format_exc() captures the real stack trace.
        log.error(traceback.format_exc())
        log.error('[Rollback Dataset] [get_list_empty_datasets] [Failed]')
def get_list_incorrect_datasets_issued_date(self):
    """Return the URIs of datasets whose catalog record has an issued
    date equal to its modified date and later than 2019-07-10T12:59:20
    (i.e. records whose issued date looks overwritten).

    :rtype: list[str]|None -- None on failure (after logging)
    """
    from ckanext.ecportal.virtuoso.utils_triplestore_crud_helpers import TripleStoreCRUDHelpers
    try:
        tsch = TripleStoreCRUDHelpers()
        sparql_query = """
            PREFIX dcat: <http://www.w3.org/ns/dcat#>
            select ?dataset ?cr ?issued ?modified
            from <dcatapop-public> from <dcatapop-private>
            where
            {
                ?cr a <http://www.w3.org/ns/dcat#CatalogRecord> .
                ?cr <http://xmlns.com/foaf/0.1/primaryTopic> ?dataset .
                ?cr <http://purl.org/dc/terms/issued> ?issued .
                ?cr <http://purl.org/dc/terms/modified> ?modified
                filter ((?issued = ?modified) && (str(?issued) > "2019-07-10T12:59:20"))
            }
            order by ?modified
        """
        result = tsch.execute_select_query_auth(sparql_query)
        list_datasets = [ds.get('dataset').get('value') for ds in result]
        return list_datasets
    except BaseException:
        # Fixed: traceback.print_exc(e) logged None (print_exc returns
        # None; its first argument is a limit, not an exception).
        log.error(traceback.format_exc())
        log.error('[get_list_incorrect_datasets_issued_date] [Failed]')
def save_to_ts(self):
    """
    To insert or update the description of the Catalog in the TS.
    All the existing description in TS will be removed.

    Also refreshes the pickled copy in the Redis cache when
    'ckan.cache.active' is enabled.

    :rtype: Boolean
    """
    try:
        tsch = TripleStoreCRUDHelpers()
        source_graph = self.graph_name
        target_graph_to_save = DCATAPOP_PUBLIC_GRAPH_NAME
        ttl_ds_from_ts = self.ttl_as_in_ts
        ttl_ds_last_version = self.build_the_graph().serialize(format="nt")
        # Replace the old description with the freshly serialized one.
        r = tsch.execute_update_without_condition(
            source_graph, target_graph_to_save,
            ttl_ds_from_ts, ttl_ds_last_version)
        log.info("[Catalog]. Save catalog successful [{0}]".format(
            self.catalog_uri))
        # Keep the in-memory snapshot in sync with what is now in the TS.
        self.ttl_as_in_ts = ttl_ds_last_version
        active_cache = config.get('ckan.cache.active', 'false')
        if active_cache == 'true':
            redis_cache.set_value_in_cache(self.cache_id,
                                           pickle.dumps(self), 864000,
                                           pool=redis_cache.MISC_POOL)
        return r
    except BaseException:
        log.error("[Catalog]. Save catalog failed [{0}]".format(
            self.catalog_uri))
        # Fixed: traceback.print_exc(e) logged None; format_exc()
        # actually returns the formatted stack trace.
        log.error(traceback.format_exc())
        return False
def _get_label_from_mdr(
        uri,
        property_uri="<http://www.w3.org/2004/02/skos/core#prefLabel>"):
    '''
    Get the English label of the URI from the MDR vocabulary graphs.

    :param str uri: resource whose label is wanted
    :param str property_uri: label predicate, wrapped in angle brackets
    :return: the label, or the input uri itself when the lookup fails
    '''
    resource_uri = "<{0}>".format(uri)
    # Vocabulary graphs searched for the label, in order.
    graph_names = [
        "http://eurovoc.europa.eu",
        "http://publications.europa.eu/resource/authority/data-theme",
        "http://publications.europa.eu/resource/authority/corporate-body"
    ]
    from ckanext.ecportal.virtuoso.utils_triplestore_crud_helpers import TripleStoreCRUDHelpers
    # Fall back to the raw URI when no label can be resolved.
    label = uri
    tsch = TripleStoreCRUDHelpers()
    try:
        label_structure = tsch.get_all_different_text_value_by_language_or_without(
            graph_names, resource_uri, property_uri, 'en')
        label = label_structure[0].get("value").get("value")
    except BaseException:
        log.error("get_label_from_mdr failed for {0}".format(uri))
        # Fixed: traceback.print_exc(e) logged None (wrong API usage);
        # format_exc() returns the stack trace as a string.
        log.error(traceback.format_exc())
    return label
def save_revision_to_ts(self):
    '''
    Optimized saving of the revision by creating the TTL manually:
    the rdflib serializer takes a long time for a huge literal, so the
    contentDataset triple is appended as a hand-built N-Triples line.

    :raises BaseException: re-raised after logging on any failure
    '''
    try:
        from ckanext.ecportal.virtuoso.utils_triplestore_crud_helpers import TripleStoreCRUDHelpers
        content = self.contentDataset_revision.get(
            '0', ResourceValue).value_or_uri
        # Python 2 codec: escape the (possibly huge) literal so it can
        # be embedded in a quoted N-Triples string.
        content = content.encode('string_escape')
        # Drop the big payload before serializing the rest of the graph.
        self.contentDataset_revision = {}
        tripleStoreCRUDHelpers = TripleStoreCRUDHelpers()
        # Fixed: the graph was built twice (convert_to_graph_ml() was
        # called again for serialization); reuse the first result.
        g = self.convert_to_graph_ml()
        ttl_before_content = g.serialize(format='nt')
        # Build the content triple manually to avoid the slow serializer.
        content_triple = ' <{0}> <{1}> "{2}" .'.format(
            self.uri,
            "http://data.europa.eu/88u/revision#contentDataset",
            content)
        ttl_revision = ttl_before_content + content_triple
        r = tripleStoreCRUDHelpers.set_all_properties_values_as_ttl(
            self.graph_name, "<" + self.uri + ">", ttl_revision)
    except BaseException as e:
        log.error("Revision can not be saved. uri: {0}".format(
            self.isRevisionOf_revision.get('0').uri))
        # Fixed: traceback.print_exc(e) logged None; format_exc() logs
        # the real stack trace.
        log.error(traceback.format_exc())
        raise e
def get_list_datasets_update_number_download():
    """Return the CKAN names (last URI segment) of every dataset in the
    public and private graphs.

    :rtype: list[str]|None -- None on failure (after logging)
    """
    from ckanext.ecportal.virtuoso.utils_triplestore_crud_helpers import TripleStoreCRUDHelpers
    try:
        tsch = TripleStoreCRUDHelpers()
        sparql_query = """
            select ?dataset from <dcatapop-public> from <dcatapop-private>
            where
            {
                ?dataset a <http://www.w3.org/ns/dcat#Dataset>
            }
        """
        result = tsch.execute_select_query_auth(sparql_query)
        list_datasets = [ds.get('dataset').get('value') for ds in result]
        log.info(
            '[Dataset] [get_list_datasets_update_number_download] [SUCCESS] [Number of datasets: {0}]'
            .format(len(list_datasets)))
        # Keep only the dataset name (last segment of the URI).
        list_datasets = [ds.split('/')[-1] for ds in list_datasets]
        return list_datasets
    except BaseException:
        # Fixed: traceback.print_exc(e) logged None; format_exc()
        # returns the formatted stack trace.
        log.error(traceback.format_exc())
        log.error(
            '[Dataset] [get_list_datasets_update_number_download] [Failed]'
        )
class TripleStoreIngestionCore(VirtuosoCRUDCore):
    """Thin ingestion facade that inserts serialized graphs into Virtuoso
    through a TripleStoreCRUDHelpers instance."""

    def __init__(self):
        # Single CRUD helper reused for every ingestion call.
        self.crud_helper = TripleStoreCRUDHelpers()

    def ingest_graph_to_virtuoso(self, graph_name, graph_object):
        """Serialize graph_object as N-Triples and insert it into the
        named graph.

        :param str graph_name: target graph in the triple store
        :param graph_object: an rdflib Graph (anything with .serialize)
        :rtype: bool -- True on success, False on failure
        """
        try:
            # TODO: Optimize with Turtle if possible
            ttl_formated = graph_object.serialize(format='nt')
            self.crud_helper.execute_insert_ttl(graph_name, ttl_formated)
            # Fixed: the success path previously fell through and
            # returned None (falsy), indistinguishable from failure.
            return True
        except BaseException:
            # Fixed: the exception was silently swallowed; log it so
            # ingestion failures are diagnosable.
            log.error(traceback.format_exc())
            return False
def test_find_any_for_where_clauses(self):
    """The helper must return a non-empty result set for a query over
    the corporate-body graph with two where-clause triplets."""
    helper = TripleStoreCRUDHelpers()
    where_clauses = [
        Triplet(predicate=IN_SCHEME_PREDICATE, object=GRAPH_CORPORATE_BODY),
        Triplet(predicate=AUTHORITY_CODE_PREDICATE),
    ]
    result_fields = SUBJECT_WITH_SPACES + OBJECT_WITH_SPACES
    found = helper.find_any_for_where_clauses(
        GRAPH_CORPORATE_BODY, where_clauses, result_fields)
    assert found is not None
    assert len(found) > 0
def retrieve_all_datasets_titles(self, c):
    """Return a dict mapping each dataset URI to its title in the
    language selected by c.language, across the public and private
    graphs."""
    where_clauses = [
        Triplet(predicate=NAMESPACE_DCATAPOP.rdf + 'type',
                object=NAMESPACE_DCATAPOP.dcat + 'Dataset'),
        Triplet(predicate=NAMESPACE_DCATAPOP.dcterms + 'title',
                filter="FILTER(lang(?o) = '{0}')".format(c.language)),
    ]
    rows = TripleStoreCRUDHelpers().find_any_in_graphs_for_where_clauses(
        [DCATAPOP_PUBLIC_GRAPH_NAME, DCATAPOP_PRIVATE_GRAPH_NAME],
        where_clauses,
        SUBJECT_WITH_SPACES + OBJECT_WITH_SPACES)
    # ?s is the dataset URI, ?o the title literal.
    return dict((row.get('s').get(VIRTUOSO_VALUE_KEY),
                 row.get('o').get(VIRTUOSO_VALUE_KEY)) for row in rows)
def retrieve_authority_codes(controlled_vocabulary):
    """Return a dict mapping authority codes to their subject URIs for
    the given controlled-vocabulary graph.

    :param controlled_vocabulary: graph of the vocabulary to query
    :rtype: dict
    """
    triplet_list = [Triplet(predicate=AUTHORITY_CODE_PREDICATE)]
    properties_values = TripleStoreCRUDHelpers().find_any_for_where_clauses(
        controlled_vocabulary, triplet_list)
    # Transform the helper rows into a dict mapping each authority code
    # to its URI. (Renamed the local: the original shadowed the builtin
    # 'dict'.)
    codes_to_uris = build_dict_mapping_object_to_subject(properties_values)
    return codes_to_uris
def retrieve_all_frequencies():
    """Return a dict mapping frequency identifiers to their URIs."""
    where_clauses = [Triplet(predicate=IDENTIFIER_URI)]
    helper = TripleStoreCRUDHelpers()
    rows = helper.find_any_for_where_clauses(
        Controlled_Vocabulary.frequency, where_clauses)
    # Fold the raw rows into the identifier -> uri mapping.
    return build_dict_mapping_object_to_subject(rows)
def retrieve_all_datasets_status():
    """Return a dict mapping dataset-status authority codes to their
    URIs."""
    where_clauses = [Triplet(predicate=AUTHORITY_CODE_PREDICATE)]
    helper = TripleStoreCRUDHelpers()
    rows = helper.find_any_for_where_clauses(
        Controlled_Vocabulary.dataset_status, where_clauses)
    # Fold the raw rows into the code -> uri mapping.
    return build_dict_mapping_object_to_subject(rows)
def get_list_catalogs():
    '''
    Get the list of catalogs in the triple store, each loaded with its
    description.

    :rtype: dict[str, CatalogDcatApOp]|None -- None on failure
    '''
    try:
        tsch = TripleStoreCRUDHelpers()
        list_uris_catalogs = tsch.get_list_resources_by_class(
            DCATAPOP_PUBLIC_GRAPH_NAME, CATALOG_CLASS_URI)
        list_catalogs = {}
        for uri_catalog in list_uris_catalogs:
            catalog = CatalogDcatApOp(uri_catalog)
            catalog.get_description_from_ts()
            list_catalogs[uri_catalog] = catalog
        return list_catalogs
    except BaseException:
        # The original handler dropped the stack trace entirely; keep
        # the message but log the trace too.
        log.error(traceback.format_exc())
        log.error("Can not get the list of catalogs")
def get_map_vocabulary(graph_name, vocabulary_class_uri=DEFAULT_VOCABULARY_CLASS):
    '''
    Get the vocabulary concept schemes of the given class from the MDR
    triple store, each loaded with its description.
    (Docstring fixed: it previously said "list of catalogs".)

    :param str graph_name: graph holding the vocabulary
    :param str vocabulary_class_uri: class URI of the vocabulary entries
    :rtype: dict[str, ConceptSchemeSchemaSkosWrapper]|None -- None on failure
    '''
    ts_host = config.get(MDR_HOST_NAME)
    ts_host_auth = config.get(MDR_HOST_NAME_AUTHENTICATED)
    try:
        tsch = TripleStoreCRUDHelpers(ts_host, ts_host_auth)
        list_uris_vocabulary = tsch.get_list_resources_by_class(
            graph_name, vocabulary_class_uri)
        map_vocabulary = {}
        for uri_vocabulary in list_uris_vocabulary:
            vocabulary = ConceptSchemeSchemaSkosWrapper(uri_vocabulary, graph_name)
            vocabulary.get_description_from_ts()
            map_vocabulary[uri_vocabulary] = vocabulary
        return map_vocabulary
    except BaseException:
        # The original handler dropped the stack trace entirely.
        log.error(traceback.format_exc())
        log.error("Can not get the list of vocabulary")
def retrieve_all_file_types():
    """Return a dict mapping legacy file-type codes to the URI of their
    parent (OP-mapped) node."""
    where_clauses = [
        Triplet(subject=BLANKNODE_VARIABLE, predicate=LEGACY_CODE),
        Triplet(predicate=OP_MAPPED_CODE, object=BLANKNODE_VARIABLE),
    ]
    result_fields = SUBJECT_WITH_SPACES + OBJECT_WITH_SPACES
    helper = TripleStoreCRUDHelpers()
    rows = helper.find_any_for_where_clauses(
        Controlled_Vocabulary.file_types, where_clauses, result_fields)
    # Fold the raw rows into the legacy-code -> parent-uri mapping.
    return build_dict_mapping_object_to_subject(rows)
def retrieve_all_publishers():
    """Return a dict mapping authority codes to corporate-body values
    for the publishers vocabulary."""
    where_clauses = [
        Triplet(predicate=IN_SCHEME_PREDICATE,
                object=Controlled_Vocabulary.publishers),
        Triplet(predicate=AUTHORITY_CODE_PREDICATE),
    ]
    result_fields = SUBJECT_WITH_SPACES + OBJECT_WITH_SPACES
    helper = TripleStoreCRUDHelpers()
    rows = helper.find_any_for_where_clauses(
        Controlled_Vocabulary.publishers, where_clauses, result_fields)
    # Fold the raw rows into the authority-code -> corporate-body mapping.
    return build_dict_mapping_object_to_subject(rows)
def save_to_ts(self, graph_name=None):
    """
    Insert or update the description of the current schema in the TS.
    All the existing description in TS will be removed.

    :param str graph_name: target graph; defaults to self.graph_name
    :rtype: bool|None -- True on success, None on failure
    """
    try:
        tripleStoreCRUDHelpers = TripleStoreCRUDHelpers()
        gn = graph_name
        if graph_name is None:
            gn = self.graph_name
        g = self.convert_to_graph_ml()
        ttl_schema = g.serialize(format='nt')
        tripleStoreCRUDHelpers.set_all_properties_values_as_ttl(
            gn, "<" + self.uri + ">", ttl_schema)
        return True
    except BaseException:
        log.error("Save schema to ts failed. [URI:<{0}>]".format(self.uri))
        # Fixed: traceback.print_exc(e) logged None (wrong API usage);
        # format_exc() returns the formatted stack trace.
        log.error(traceback.format_exc())
        return None
def convert_to_json_ld(self, graph_name=None):
    """
    Serialize the schema as JSON-LD and persist it to the triple store.

    NOTE(review): despite its name, this method also writes the result
    to the TS (mirroring save_to_ts), it does not merely convert.

    :param str graph_name: target graph; defaults to self.graph_name
    :rtype: bool|None -- True on success, None on failure
    """
    try:
        tripleStoreCRUDHelpers = TripleStoreCRUDHelpers()
        gn = graph_name
        if graph_name is None:
            gn = self.graph_name
        g = self.convert_to_graph_ml()
        context = {"@vocab": "https://schema.org/Dataset"}
        ttl_schema = g.serialize(format='json-ld', context=context, indent=4)
        tripleStoreCRUDHelpers.set_all_properties_values_as_ttl(
            gn, "<" + self.uri + ">", ttl_schema)
        return True
    except BaseException:
        log.error("Save schema to ts failed. [uri {0}]".format(self.uri))
        # The original handler dropped the stack trace entirely.
        log.error(traceback.format_exc())
        return None
def delete_from_ts(self):
    """
    To delete the Catalog from the TS. On success the schema is reset
    and the cached copy (if caching is active) is evicted.

    :rtype: Boolean|None -- True on success, False when the TS refused
        the delete, None on exception
    """
    try:
        tsch = TripleStoreCRUDHelpers()
        source_graph = self.graph_name
        ttl_ds_from_ts = self.ttl_as_in_ts
        status = tsch.execute_delete_ttl(source_graph, ttl_ds_from_ts)
        if status:
            # Re-initialize the schema: the TS no longer holds anything
            # for this catalog.
            self.schema = CatalogSchemaDcatApOp(self.catalog_uri,
                                                self.graph_name)
            self.ttl_as_in_ts = ""
            active_cache = config.get('ckan.cache.active', 'false')
            if active_cache == 'true':
                redis_cache.delete_value_from_cache(self.cache_id)
            log.info(
                "[Catalog]. [delete Catalog from ts] successful [uri: {0}]"
                .format(self.catalog_uri))
            return True
        else:
            log.info(
                "[Catalog]. [delete Catalog from ts] failed [uri: {0}]. [Message from TS: {1}]"
                .format(self.catalog_uri, tsch.get_virtuoso_query_return()))
            return False
    except BaseException:
        log.error("[Catalog]. [delete Catalog from ts] failed. [uri: {0}]".
                  format(self.catalog_uri))
        # Fixed: traceback.print_exc(e) logged None; format_exc() logs
        # the real stack trace.
        log.error(traceback.format_exc())
        return None
def migrate_delta(configuration_file_path):
    """Migrate to Virtuoso every active package that is not yet present
    as a dataset in the public or private graph, and return the names of
    the datasets that were already migrated."""
    where_clauses = [
        Triplet(predicate=TYPE_PREDICATE,
                object=NAMESPACE_DCATAPOP.dcat + DATASET)
    ]
    rows = TripleStoreCRUDHelpers().find_any_in_graphs_for_where_clauses(
        [DCATAPOP_PUBLIC_GRAPH_NAME, DCATAPOP_PRIVATE_GRAPH_NAME],
        where_clauses,
        result_clause=SUBJECT_WITH_SPACES)
    # Keep only the dataset name (last URI segment) of each subject.
    migrated_datasets = [row.get("s").get("value").split('/')[-1]
                         for row in rows]

    condition = Package.state == "active"
    packages = postgresql_helper.find_any_in_database(
        config_file_path=configuration_file_path,
        condition=condition,
        table=Package,
        result_clause=[Package.name])
    package_names = [pkg.name for pkg in packages]

    # Anything active in the database but absent from the TS still needs
    # migrating: drop any stale remnants, then migrate it.
    for missing_name in set(package_names) - set(migrated_datasets):
        dataset = DatasetDcatApOp(DATASET_URI_PREFIX + missing_name)
        DatasetDcatApOp.delete_from_ts(dataset)
        migrate_with_package_name_to_virtuoso(configuration_file_path,
                                              missing_name)
    return migrated_datasets
def __get_description_from_ts_current_level(self, graph_name=None):
    """
    Get the description of the schema from TS limited to the current level.

    Fetches every (property, value) pair of self.uri from the triple
    store, wraps each value in a model object (schema class, generic
    schema, or ResourceValue depending on the SPARQL value type), groups
    the values per property under string keys "0", "1", ... and sets the
    matching member attribute on self.

    :param graph_name: graph to read from; defaults to self.graph_name
    :return: dict mapping property URI -> {key: model value}, or None on
        failure
    """
    from ckanext.ecportal.model.schemas import MAPPER_RDF_TYPE_CLASS as MAPPER_RDF_TYPE_CLASS
    final_description_dict = dict()
    try:
        tripleStoreCRUDHelpers = TripleStoreCRUDHelpers()
        gn = graph_name
        if graph_name is None:
            gn = self.graph_name
        crud_uri = "<" + self.uri + ">"
        resource_description_from_TS = tripleStoreCRUDHelpers.get_all_properties_value(
            gn, crud_uri)
        dict_prop_values = dict()  # type: Dict[str,list]
        value_of_property = dict()  # type : dict[str,str]
        for desc in resource_description_from_TS:
            # Reset the per-row working variables.
            model_value = None
            prop = None
            value_type = None
            value_of_property = None
            value = None
            # Each row carries the property URI and a Virtuoso value
            # structure ({'type': ..., 'value': ..., ...}).
            prop = desc['property']['value']
            value_of_property = desc['value']
            value_type = value_of_property['type']
            value = value_of_property['value']
            if value_type == 'uri':
                # URI values become nested schema objects; a registered
                # rdf:type gets its concrete class, otherwise a generic
                # schema wrapper is used.
                schema = MAPPER_RDF_TYPE_CLASS.get(value, None)
                if schema:
                    model_value = schema(uri=value,
                                         graph_name=self.graph_name)
                    model_value.type_rdf = {}
                else:
                    model_value = SchemaGeneric(resource_uri=value,
                                                graph_name=self.graph_name)
                key_special = "0"
            if value_type == 'literal':
                datatype = 'string'
                # Plain literal: keep the language tag when present.
                if value_of_property.has_key('xml:lang'):
                    lang = value_of_property['xml:lang']
                    key_special = "0"
                    model_value = ResourceValue(value, lang=lang)
                else:
                    key_special = '0'
                    model_value = ResourceValue(value)
            if value_type == 'typed-literal':
                # Typed literal: carry the datatype URI along.
                datatype = value_of_property['datatype']
                key_special = '0'
                model_value = ResourceValue(value, type='typed-literal',
                                            datatype=datatype)
            dict_final_pv = dict()
            # add as member of the class
            member_name = NAMESPACE_DCATAPOP.get_member_name(prop)
            if final_description_dict.has_key(prop):
                # Property already seen: append under the next numeric
                # string key ("1", "2", ...).
                key = "{0}{1}".format(
                    "", final_description_dict[prop].__len__())
                dict_final_pv[key] = model_value
                final_description_dict[prop].update(dict_final_pv)
                if member_name:
                    setattr(self, member_name,
                            final_description_dict[prop])
            else:
                # First value for this property: start its dict at the
                # key chosen above (always "0").
                dict_final_pv[key_special] = model_value
                final_description_dict[prop] = dict_final_pv
                if member_name:
                    setattr(self, member_name, dict_final_pv)
        return final_description_dict
    except BaseException as e:
        import traceback
        # NOTE(review): print_exc() returns None, so the first log line
        # records nothing useful — left unchanged here; e.message is
        # Python-2-only.
        log.error(traceback.print_exc())
        log.error("Error. {0}".format(e.message))
        log.error(
            "[Dataset]. get_description_from_ts_current_level failed {0}".
            format(self.uri))
        return None
def build_embargo_datasets_from_string_content(self,
                                               rdf_string_content,
                                               dataset_description_map,
                                               format_input="xml",
                                               doi_flag=True):
    """
    To build a dict of datasets in embargo mode. The key of the dict is
    the uri, the value is the dataset object.

    The RDF payload is loaded once into a dedicated embargo graph (named
    after the MD5 of the content), then each requested dataset is read
    back from that graph; DOIs are generated on demand.

    :param unicode rdf_string_content: the RDF payload to ingest
    :param map dataset_description_map: dataset uri -> description object
        (with a .generate_doi attribute)
    :param str format_input: RDF serialization of the payload ("xml", ...)
    :param bool doi_flag: master switch for DOI generation
    :rtype: dict[str, DatasetDcatApOp] -- or None on any failure
    """
    def create_name_of_graph(rdf_string_content):
        """
        :param rdf_string_content: string that represents a rdf.
        :return: None if failure, a graph name otherwise.
        """
        try:
            # Python 2: normalise unicode vs bytes before hashing so the
            # same content always yields the same graph name.
            if isinstance(rdf_string_content, unicode):
                content_md5 = hashlib.md5(
                    rdf_string_content.encode('utf8')).hexdigest()
            else:
                content_md5 = hashlib.md5(
                    rdf_string_content.decode('utf8').encode(
                        'utf8')).hexdigest()
            graph_name = DCATAPOP_EMBARGO_NAMESPACE + content_md5
            return graph_name
        except BaseException as e:
            import traceback
            log.error(traceback.print_exc())
            log.error("Create name of graph failed")
            return None

    try:
        name_ingestion_graph = create_name_of_graph(rdf_string_content)
        list_embargo_datasets = {}
        if name_ingestion_graph:
            # Create the embargo graph for the current job of ingestion;
            # remove any leftover graph of the same name first.
            tripleStoreCRUDHelpers = TripleStoreCRUDHelpers()
            tripleStoreCRUDHelpers.graph_remove(name_ingestion_graph)
            tripleStoreCRUDHelpers.graph_create(name_ingestion_graph)
            # load one time the content of rdf to virtuoso.
            if self.ingest_graph_from_string(name_ingestion_graph,
                                             rdf_string_content,
                                             format_input):
                for dataset_uri, dataset_description in dataset_description_map.items(
                ):
                    embargo_dataset = DatasetDcatApOp(
                        dataset_uri, DCATAPOP_INGESTION_DATASET,
                        name_ingestion_graph)
                    embargo_dataset.privacy_state = DCATAPOP_INGESTION_DATASET
                    if embargo_dataset.get_description_from_ts():
                        # Generate DOI if requested
                        list_embargo_datasets[
                            dataset_uri] = embargo_dataset
                        if dataset_description.generate_doi and doi_flag:
                            doi = generate_doi_for_dataset(
                                embargo_dataset,
                                dataset_description.generate_doi)
                            if doi:
                                embargo_dataset.set_doi(doi)
                    else:
                        # One unreadable dataset aborts the whole batch.
                        log.error(
                            "Ingest dataset from string error. Can not extract embargo dataset from graph. graph name"
                            " [{0}]. dataset uri [{1}]".format(
                                name_ingestion_graph, dataset_uri))
                        return None
                return list_embargo_datasets
            else:
                log.error(
                    "Ingest dataset from string failed. The ingestion to the embargo graph failed. graph "
                    "name [{0}]. content: [{1}]".format(
                        name_ingestion_graph, rdf_string_content))
                return None
        else:
            # NOTE(review): this branch falls through without an explicit
            # return, so it returns None implicitly like the others.
            log.error(
                "Ingest dataset from string failed. Can not create a the name of the embargo graph. Content "
                "[{0}]".format(rdf_string_content.encode('utf-8')))
    except BaseException as e:
        import traceback
        log.error(traceback.print_exc())
        log.error(
            u"Ingest dataset from string failed. Exception {0}".format(
                str(e)))
        log.error(
            u"Ingest dataset from string failed. file content: [{0}]".
            format(rdf_string_content))
        return None
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # # contact: <https://publications.europa.eu/en/web/about-us/contact> import logging, codecs from ckanext.ecportal.test.virtuoso.test_with_virtuoso_configuration import TestWithVirtuosoConfiguration logging.basicConfig(level=logging.DEBUG) from ckanext.ecportal.virtuoso.utils_triplestore_crud_core import VirtuosoCRUDCore from ckanext.ecportal.virtuoso.utils_triplestore_crud_helpers import TripleStoreCRUDHelpers vsc = VirtuosoCRUDCore() vsh = TripleStoreCRUDHelpers() # build the data for the test class TestVirtuosoCRUDCore(TestWithVirtuosoConfiguration): def setUp(self): vsc = VirtuosoCRUDCore() a = 10 vsc = VirtuosoCRUDCore() sparql_query = """ drop silent graph <graphToDrop> drop silent graph <graphpublic> drop silent graph <testGraph> drop silent graph <createdGraph> drop silent graph <testPerformance>
def __init__(self):
    # Single CRUD helper instance reused by all triple-store operations
    # of this object.
    self.crud_helper = TripleStoreCRUDHelpers()