def collect(details):
    """Register every ConceptScheme found at a SPARQL endpoint as a vocabulary.

    For this source, one SPARQL endpoint is given for a series of vocabs
    which are all separate ConceptSchemes. Example configuration::

        'ga-jena-fuseki': {
            'source': VocabSource.SPARQL,
            'sparql_endpoint': 'http://dev2.nextgen.vocabs.ga.gov.au/fuseki/vocabs',
            'sparql_username': '******',  # Optional username for SPARQL endpoint
            'sparql_password': '******',  # Optional password for SPARQL endpoint
            # Regular expression to filter vocabulary URIs, e.g.
            #   '.*'                                            - Everything
            #   '^http(s?)://pid.geoscience.gov.au/def/voc/ga/' - GA
            #   '^https://gcmdservices.gsfc.nasa.gov'           - GCMD
            'uri_filter_regex': '^http(s?)://resource.geosciml.org/',  # CGI
        },

    Args:
        details: Source configuration mapping. ``'sparql_endpoint'`` is
            required; ``'sparql_username'``, ``'sparql_password'`` and
            ``'uri_filter_regex'`` are optional.

    Raises:
        AssertionError: If the ConceptScheme query returns ``None``.

    Side effects:
        Rebinds ``g.VOCABS`` to a new dict containing the previous entries
        plus the vocabularies found here; entries found here win on id clashes.
    """
    logging.debug("SPARQL collect()...")

    # Get all the ConceptSchemes from the SPARQL endpoint
    # Interpret each CS as a Vocab
    q = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT * WHERE {{
    {{ GRAPH ?g {{
        ?cs a skos:ConceptScheme .
        OPTIONAL {{ ?cs skos:prefLabel ?title .
            FILTER(lang(?title) = "{language}" || lang(?title) = "") }}
        OPTIONAL {{ ?cs dcterms:created ?created }}
        OPTIONAL {{ ?cs dcterms:issued ?issued }}
        OPTIONAL {{ ?cs dcterms:modified ?modified }}
        OPTIONAL {{ ?cs owl:versionInfo ?version }}
        OPTIONAL {{ ?cs skos:definition ?description .
            FILTER(lang(?description) = "{language}" || lang(?description) = "") }}
    }} }}
    UNION
    {{
        ?cs a skos:ConceptScheme .
        OPTIONAL {{ ?cs skos:prefLabel ?title .
            FILTER(lang(?title) = "{language}" || lang(?title) = "") }}
        OPTIONAL {{ ?cs dcterms:created ?created }}
        OPTIONAL {{ ?cs dcterms:issued ?issued }}
        OPTIONAL {{ ?cs dcterms:modified ?modified }}
        OPTIONAL {{ ?cs owl:versionInfo ?version }}
        OPTIONAL {{ ?cs skos:definition ?description .
            FILTER(lang(?description) = "{language}" || lang(?description) = "") }}
    }}
}}
ORDER BY ?title""".format(language=DEFAULT_LANGUAGE)

    # Credentials are optional in the configuration, so read them once with
    # .get() and reuse the same values for the query and for each Vocabulary.
    sparql_endpoint = details["sparql_endpoint"]
    sparql_username = details.get("sparql_username")
    sparql_password = details.get("sparql_password")
    uri_filter_regex = details.get("uri_filter_regex")

    # record just the IDs & title for the VocPrez in-memory vocabs list
    concept_schemes = Source.sparql_query(
        sparql_endpoint,
        q,
        sparql_username=sparql_username,
        sparql_password=sparql_password,
    )
    # Explicit raise instead of ``assert`` so the check survives ``python -O``;
    # the exception type is kept for existing callers.
    if concept_schemes is None:
        raise AssertionError("Unable to query conceptSchemes")

    def _value(row, name):
        """Return the ``value`` of binding ``name`` in ``row``, or None if unbound."""
        binding = row.get(name)
        return None if binding is None else binding.get("value")

    def _date(row, name):
        """Return binding ``name`` parsed as a datetime, or None if unbound."""
        raw = _value(row, name)
        return None if raw is None else dateutil.parser.parse(raw)

    sparql_vocabs = {}
    for cs in concept_schemes:
        cs_uri = cs["cs"]["value"]
        vocab_id = cs_uri.replace("/conceptScheme", "").split("/")[-1]

        # TODO: Investigate putting regex into SPARQL query
        if uri_filter_regex and not re.search(uri_filter_regex, cs_uri):
            logging.debug("Skipping vocabulary %s", vocab_id)
            continue

        # handling CS URIs that end with '/': the last path segment is then
        # empty, so fall back to the one before it
        if len(vocab_id) < 2:
            vocab_id = cs_uri.split("/")[-2]

        sparql_vocabs[vocab_id] = Vocabulary(
            vocab_id,
            cs_uri,
            # Need string value for sorting, not None
            (cs.get("title") or {}).get("value") or vocab_id,
            _value(cs, "description"),
            None,  # none of these SPARQL vocabs have creator info yet
            # TODO: add creator info to GSQ vocabs
            _date(cs, "created"),
            # dct:issued is queried but not passed on: not in Vocabulary
            _date(cs, "modified"),
            _value(cs, "version"),  # versionInfo
            config.VocabSource.SPARQL,
            cs_uri,
            sparql_endpoint=sparql_endpoint,
            sparql_username=sparql_username,
            sparql_password=sparql_password,
        )

    g.VOCABS = {**g.VOCABS, **sparql_vocabs}
    logging.debug("SPARQL collect() complete.")
def get_top_concepts(self):
    """Return this vocabulary's top concepts as ``(uri, prefLabel)`` pairs.

    The vocabulary's SPARQL endpoint is asked for concepts linked to the
    concept scheme through ``skos:hasTopConcept`` / ``skos:topConceptOf``.
    If that yields nothing, a broader query (``skos:hasTopConcept`` /
    ``skos:inScheme``) is run against ``self.gr``.

    Only the first concept seen for each prefLabel is kept. This prevents
    Concepts with sameAs properties appearing twice.

    Returns:
        A list of ``(uri, prefLabel)`` tuples ordered as the queries returned
        them, or ``None`` when the endpoint query returns ``None``.
    """
    vocab = g.VOCABS[self.vocab_id]

    # The primary and fallback queries differ only in the predicate that
    # links a concept back to the scheme, so both share one template.
    query_template = """
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        SELECT DISTINCT ?tc ?pl
        WHERE {{
            {{ GRAPH ?g {{
                {{
                    <{concept_scheme_uri}> skos:hasTopConcept ?tc .
                }}
                UNION
                {{
                    ?tc {membership_predicate} <{concept_scheme_uri}> .
                }}
                {{
                    ?tc skos:prefLabel ?pl .
                    FILTER(lang(?pl) = "{language}" || lang(?pl) = "")
                }}
            }} }}
            UNION
            {{
                {{
                    <{concept_scheme_uri}> skos:hasTopConcept ?tc .
                }}
                UNION
                {{
                    ?tc {membership_predicate} <{concept_scheme_uri}> .
                }}
                {{
                    ?tc skos:prefLabel ?pl .
                    FILTER(lang(?pl) = "{language}" || lang(?pl) = "")
                }}
            }}
        }}
        ORDER BY ?pl
        """

    def _build_query(membership_predicate):
        """Fill the shared template for one scheme-membership predicate."""
        return query_template.format(
            concept_scheme_uri=vocab.concept_scheme_uri,
            language=self.language,
            membership_predicate=membership_predicate,
        )

    def _uri_and_label(row):
        """Extract ``(?tc, ?pl)`` from either result-row shape.

        Rows read elsewhere from ``Source.sparql_query`` are dict bindings
        (``row["cs"]["value"]``), which cannot be indexed by position;
        positional rows such as those from ``self.gr.query`` are still
        accepted unchanged.
        """
        if isinstance(row, dict):
            return row["tc"]["value"], row["pl"]["value"]
        return row[0], row[1]

    endpoint_rows = Source.sparql_query(
        vocab.sparql_endpoint,
        _build_query("skos:topConceptOf"),
        vocab.sparql_username,
        vocab.sparql_password,
    )
    if endpoint_rows is None:
        return None

    # cache prefLabels and do not add duplicates
    seen_labels = set()
    tcs = []

    def _add_unseen(rows):
        """Append rows whose prefLabel has not been recorded yet."""
        for row in rows:
            uri, label = _uri_and_label(row)
            if label not in seen_labels:
                seen_labels.add(label)
                tcs.append((uri, label))

    _add_unseen(endpoint_rows)

    if not tcs:
        # NOTE(review): the fallback runs against self.gr rather than the
        # SPARQL endpoint used above; where self.gr is populated is not
        # visible in this file - confirm this is intended.
        _add_unseen(self.gr.query(_build_query("skos:inScheme")))

    return tcs
def collect(details):
    """Register every ConceptScheme found at a SPARQL endpoint as a vocabulary.

    For this source, one SPARQL endpoint is given for a series of vocabs
    which are all separate ConceptSchemes. Example configuration::

        'gsq-graphdb': {
            'source': VocabSource.SPARQL,
            'sparql_endpoint': 'http://graphdb.gsq.digital:7200/repositories/GSQ_Vocabularies_core'
        },

    Args:
        details: Source configuration mapping. ``'sparql_endpoint'`` is
            required; ``'sparql_username'``, ``'sparql_password'`` and
            ``'uri_filter_regex'`` are optional.

    Side effects:
        Rebinds ``g.VOCABS`` to a new dict containing the previous entries
        plus the vocabularies found here; entries found here win on id clashes.
        A falsy query result is treated as "no concept schemes".
    """
    logging.debug('SPARQL collect()...')

    # Get all the ConceptSchemes from the SPARQL endpoint
    # Interpret each CS as a Vocab
    # Results are ordered by ?title, the only label variable this query binds.
    q = '''
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        SELECT * WHERE {{
            GRAPH ?g {{
                ?cs a skos:ConceptScheme .
                OPTIONAL {{ ?cs skos:prefLabel ?title .
                    FILTER(lang(?title) = "{language}" || lang(?title) = "") }}
                OPTIONAL {{ ?cs dcterms:created ?created }}
                OPTIONAL {{ ?cs dcterms:issued ?issued }}
                OPTIONAL {{ ?cs dcterms:modified ?modified }}
                OPTIONAL {{ ?cs skos:definition ?description .
                    FILTER(lang(?description) = "{language}" || lang(?description) = "") }}
            }}
        }}
        ORDER BY ?title
        '''.format(language=DEFAULT_LANGUAGE)

    # Credentials are optional (the example configuration above has none), so
    # read them once with .get() and reuse them for the query and each Vocabulary.
    sparql_endpoint = details['sparql_endpoint']
    sparql_username = details.get('sparql_username')
    sparql_password = details.get('sparql_password')
    uri_filter_regex = details.get('uri_filter_regex')

    # record just the IDs & title for the VocPrez in-memory vocabs list
    concept_schemes = Source.sparql_query(
        sparql_endpoint,
        q,
        sparql_username=sparql_username,
        sparql_password=sparql_password,
    ) or []

    def _value(row, name):
        """Return the ``value`` of binding ``name`` in ``row``, or None if unbound."""
        binding = row.get(name)
        return None if binding is None else binding.get('value')

    def _date(row, name):
        """Return binding ``name`` parsed as a datetime, or None if unbound."""
        raw = _value(row, name)
        return None if raw is None else dateutil.parser.parse(raw)

    sparql_vocabs = {}
    for cs in concept_schemes:
        cs_uri = cs['cs']['value']
        vocab_uri = cs_uri.replace('/conceptScheme', '')
        vocab_id = vocab_uri.split('/')[-1]

        if uri_filter_regex and not re.search(uri_filter_regex, cs_uri):
            logging.debug('Skipping vocabulary %s', vocab_id)
            continue

        # handling CS URIs that end with '/': the last path segment is then
        # empty, so fall back to the one before it
        if len(vocab_id) < 2:
            vocab_id = cs_uri.split('/')[-2]

        sparql_vocabs[vocab_id] = Vocabulary(
            vocab_id,
            vocab_uri,
            # Need string value for sorting, not None
            (cs.get('title') or {}).get('value') or vocab_id,
            _value(cs, 'description'),
            None,  # none of these SPARQL vocabs have creator info yet
            # TODO: add creator info to GSQ vocabs
            _date(cs, 'created'),
            # dct:issued is queried but not passed on: not in Vocabulary
            _date(cs, 'modified'),
            None,  # versionInfo
            config.VocabSource.SPARQL,
            cs_uri,
            sparql_endpoint=sparql_endpoint,
            sparql_username=sparql_username,
            sparql_password=sparql_password,
        )

    g.VOCABS = {**g.VOCABS, **sparql_vocabs}
    logging.debug('SPARQL collect() complete.')