def _get_curie_prefix(self, uri, existing_curies):
    ns_count = 0

    from curies import CURIES

    # TODO: replace this with a once-per-run update CURIES function
    def get_curie_online(uri):
        import requests

        try:
            r = requests.get(
                "http://prefix.cc/reverse",
                params={"uri": uri, "format": "txt"}
            )
            if r.status_code == 200:
                # primitive check to see if it really is prefix.cc replying with a text/plain response
                if r.headers["Content-Type"] == "text/plain":
                    return r.text.split("\t")[0]
                else:
                    return None
            else:
                return None
        except requests.exceptions.ConnectionError:
            # presumably this module can't access the internet or prefix.cc is down
            return None

    def get_curie_from_namespace(uri, existing_curies, ns_count):
        # strip off trailing hash or slash and return last path segment
        c = uri.rstrip("#/").split("/")[-1]

        # prevent CURIE collision: return nsX (X an int) if we already have this one
        if c in existing_curies:
            ns_count += 1
            return "ns" + str(ns_count)

        return c

    # attempt to look up the well-known CURIE for this namespace in the http://prefix.cc dump
    for k, v in CURIES.items():
        if v == uri:
            return k

    # attempt to look up the well-known CURIE for this namespace using http://prefix.cc online (more up to date)
    c = get_curie_online(uri)
    if c is not None:
        return c

    # can't find a CURIE online so make one up
    c = get_curie_from_namespace(uri, existing_curies, ns_count)
    return c if c is not None else ""
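# Note: the _get_curie_prefix variants in this section assume a CURIES mapping of
# prefix -> namespace URI built from the http://prefix.cc dump (and, in the second
# variant below, an EXTRA_CURIES mapping of the same shape). The entries here are an
# illustrative sketch of that shape only, not the real contents of curies.py.
CURIES = {
    "owl": "http://www.w3.org/2002/07/owl#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "dcterms": "http://purl.org/dc/terms/",
}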
def _get_curie_prefix(uri, ns):
    ns_count = 0

    from curies import CURIES, EXTRA_CURIES

    # TODO: replace this with a once-per-run update CURIES function
    def get_curie_online(uri):
        import requests

        try:
            r = requests.get(
                'http://prefix.cc/reverse',
                params={'uri': uri, 'format': 'txt'}
            )
            if r.status_code == 200:
                return r.text.split('\t')[0]
            else:
                return None
        except requests.exceptions.ConnectionError:
            # presumably this module can't access the internet or prefix.cc is down
            return None

    def get_curie_from_namespace(uri, ns_count):
        # strip off trailing hash or slash and return last path segment
        c = uri.rstrip('#/').split('/')[-1]

        # prevent CURIE collision: return nsX (X an int) if we already have this one
        for k, v in ns.items():
            if c == v:
                ns_count += 1
                return 'ns' + str(ns_count)

        return c

    # attempt to look up the well-known CURIE for this namespace in the http://prefix.cc dump
    for k, v in CURIES.items():
        if v == uri:
            return k

    for k, v in EXTRA_CURIES.items():
        if v == uri:
            return k

    # attempt to look up the well-known CURIE for this namespace using http://prefix.cc online (more up to date)
    c = get_curie_online(uri)
    if c is not None:
        return c

    # can't find a CURIE online so make one up
    c = get_curie_from_namespace(uri, ns_count)
    return c if c is not None else ''
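# A minimal usage sketch of the module-level _get_curie_prefix(uri, ns) variant above,
# assuming the curies module is importable and that network access to prefix.cc may
# fail (in which case the fallback derives a prefix from the URI itself). The file
# name "ontology.ttl" and the looked-up namespace are illustrative only.
from rdflib import Graph

g = Graph().parse("ontology.ttl", format="turtle")

# prefixes already declared in the source file, keyed by namespace URI
declared = {str(uri): prefix for prefix, uri in g.namespaces()}

# returns a well-known prefix such as "dcterms" if the namespace is in the prefix.cc
# data, otherwise an online result or a made-up "nsX" / last-path-segment prefix
print(_get_curie_prefix("http://purl.org/dc/terms/", declared))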
def _extract_namespaces(self):
    """
    First we get the namespaces from rdflib.

    Then we cycle through all the URIs in the graph (all s, p & o), create a set of them,
    extract their base URIs (i.e. a de-duplicated list of namespaces), see if they are
    already declared and, if not, generate a CURIE prefix for them and add them to the
    namespaces.
    """
    # get declared namespaces, keyed by URI
    ns = {}
    uri_bases = set()
    for k, v in self.G.namespaces():
        ns[str(v)] = k

    # get other namespaces by extracting base URIs from all URIs
    for s, p, o in self.G:
        # exclude certain annotation URIs and individuals (SDO.identifier)
        # exclude known annoying URIs (ORCID)
        if (
            p == OWL.versionIRI
            or p == OWL.imports
            or p == SDO.identifier
            or str(o).startswith("https://orcid")
        ):
            pass
        else:
            # add only URI subjects (not Blank Nodes)
            if type(s) == URIRef:
                uri_bases.add(self._get_namespace_from_uri(str(s)))

            # predicates are always URIs so add them all
            uri_bases.add(self._get_namespace_from_uri(str(p)))

            # add only URI objects (not Blank Nodes or Literals), exclude emails
            if type(o) == URIRef and "@" not in str(o):
                uri_bases.add(self._get_namespace_from_uri(str(o)))

    # for the de-duplicated URIs, if the uri_base is not in namespaces, get its CURIE and add it
    for uri_base in uri_bases:
        if ns.get(uri_base) is None:
            found = False

            # try to match uri_base to stored CURIES first
            if self.use_curies_stored:
                from pylode.curies import CURIES

                try:
                    ns[uri_base] = list(CURIES.keys())[list(CURIES.values()).index(uri_base)]
                    found = True
                except ValueError:
                    pass

            if not found:
                if self.get_curies_online:
                    print(f"getting CURIE for {uri_base} online")
                    uri_prefix = self._get_curie_prefix(uri_base, [x for x in ns.values()])
                    ns[uri_base] = uri_prefix
                    found = True

            # if not found:
            #     ns[uri_base] = "ns" + str(len(ns))

    # invert the key/values into instance storage
    for k, v in sorted(ns.items(), key=lambda x: x[1]):
        if v == "":
            # ":" stands in for the empty (default) prefix
            self.NAMESPACES[":"] = k
        else:
            self.NAMESPACES[v] = k

    del self.NAMESPACES["xml"]  # that bloody XML namespace has to go!
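# _extract_namespaces() relies on a self._get_namespace_from_uri() helper that is not
# shown in this section. The function below is only an assumed sketch of its likely
# behaviour, not the actual pyLODE implementation: return everything up to and
# including a '#' if one is present, otherwise everything up to and including the
# last '/'.
def _get_namespace_from_uri_sketch(uri):
    if "#" in uri:
        return uri.split("#")[0] + "#"
    segments = uri.split("/")
    if len(segments) > 1:
        return "/".join(segments[:-1]) + "/"
    return uri

# e.g. "http://www.w3.org/2002/07/owl#Class"  -> "http://www.w3.org/2002/07/owl#"
#      "http://purl.org/dc/terms/creator"     -> "http://purl.org/dc/terms/"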