Example #1
0
    def _get_curie_prefix(self, uri, existing_curies):
        """Return a CURIE prefix for the namespace ``uri``.

        The lookup is attempted in this order:

        1. the bundled ``CURIES`` table (prefix -> namespace URI),
        2. the prefix.cc reverse-lookup web service,
        3. a made-up prefix: the last path segment of ``uri`` or, if that
           is already taken, the first unused ``nsX`` (X = 1, 2, ...).

        Args:
            uri: the namespace URI to find a prefix for.
            existing_curies: container of prefixes already in use; only
                membership tests (``in``) are performed on it.

        Returns:
            The prefix as a string.
        """
        from curies import CURIES

        # seconds to wait for prefix.cc before falling back to a made-up prefix
        online_timeout = 10

        # TODO: replace this with a once-per run update CURIES function
        def get_curie_online(uri):
            """Ask prefix.cc for the prefix of ``uri``; None if unavailable."""
            import requests

            try:
                r = requests.get(
                    "http://prefix.cc/reverse",
                    params={"uri": uri, "format": "txt"},
                    # without a timeout a stalled connection would block forever
                    timeout=online_timeout,
                )
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
            ):
                # presumably this module can't access the internet or prefix.cc is down
                return None

            if r.status_code != 200:
                return None

            # primitive check to see if it really is prefix.cc replying with a
            # text/plain response; the header may be missing or may carry
            # parameters (e.g. "; charset=utf-8"), so compare the media type only
            media_type = r.headers.get("Content-Type", "").split(";")[0]
            if media_type.strip().lower() != "text/plain":
                return None

            # the reply is "<prefix>\t<uri>"; keep the prefix
            return r.text.split("\t")[0]

        def get_curie_from_namespace(uri, existing_curies):
            """Make up a prefix for ``uri`` that is not in ``existing_curies``."""
            # strip off trailing hash or slash and return last path segment
            c = uri.rstrip("#/").split("/")[-1]
            if c not in existing_curies:
                return c

            # prevent CURIE collision = return the first nsX (X int) not yet used
            ns_count = 1
            while "ns" + str(ns_count) in existing_curies:
                ns_count += 1
            return "ns" + str(ns_count)

        # attempt to look up the well-known curie for this Namespace in http://prefix.cc dump
        for k, v in CURIES.items():
            if v == uri:
                return k

        # attempt to look up the well-known CURIE for this Namespace using http://prefix.cc online (more up-to-date)
        c = get_curie_online(uri)
        if c is not None:
            return c

        # can't find CURIE online so make up one
        c = get_curie_from_namespace(uri, existing_curies)
        return c if c is not None else ""
Example #2
0
def _get_curie_prefix(uri, ns):
    """Return a CURIE prefix for the namespace ``uri``.

    The lookup is attempted in this order:

    1. the bundled ``CURIES`` and then ``EXTRA_CURIES`` tables
       (prefix -> namespace URI),
    2. the prefix.cc reverse-lookup web service,
    3. a made-up prefix: the last path segment of ``uri`` or, if that is
       already a value of ``ns``, the first unused ``nsX`` (X = 1, 2, ...).

    Args:
        uri: the namespace URI to find a prefix for.
        ns: mapping whose *values* are the prefixes already in use
            (keys are presumably the namespace URIs; verify against caller).

    Returns:
        The prefix as a string.
    """
    from curies import CURIES, EXTRA_CURIES

    # seconds to wait for prefix.cc before falling back to a made-up prefix
    online_timeout = 10

    # TODO: replace this with a once-per run update CURIES function
    def get_curie_online(uri):
        """Ask prefix.cc for the prefix of ``uri``; None if unavailable."""
        try:
            r = requests.get(
                'http://prefix.cc/reverse',
                params={'uri': uri, 'format': 'txt'},
                # without a timeout a stalled connection would block forever
                timeout=online_timeout,
            )
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ):
            # presumably this module can't access the internet or prefix.cc is down
            return None

        if r.status_code != 200:
            return None

        # the reply is "<prefix>\t<uri>"; keep the prefix
        return r.text.split('\t')[0]

    def get_curie_from_namespace(uri):
        """Make up a prefix for ``uri`` that is not yet a value of ``ns``."""
        used = ns.values()

        # strip off trailing hash or slash and return last path segment
        c = uri.rstrip('#/').split('/')[-1]
        if c not in used:
            return c

        # prevent CURIE collision = return the first nsX (X int) not yet used
        ns_count = 1
        while 'ns' + str(ns_count) in used:
            ns_count += 1
        return 'ns' + str(ns_count)

    # attempt to look up the well-known curie for this Namespace in http://prefix.cc dump
    for table in (CURIES, EXTRA_CURIES):
        for k, v in table.items():
            if v == uri:
                return k

    # attempt to look up the well-known CURIE for this Namespace using http://prefix.cc online (more up-to-date)
    c = get_curie_online(uri)
    if c is not None:
        return c

    # can't find CURIE online so make up one
    c = get_curie_from_namespace(uri)
    return c if c is not None else ''
Example #3
0
    def _extract_namespaces(self):
        """
        Populate ``self.NAMESPACES`` (prefix -> namespace URI) from the graph.

        First we get the namespaces declared in the rdflib graph ``self.G``.

        Then we cycle through all the URIs in the graph (all s, p & o),
            create a set of their base URIs (i.e. a non-duplicative list of them),
            see if they are in the namespaces,
                if not, look up their CURIE prefix - in the stored CURIES table
                when ``self.use_curies_stored`` is set, otherwise/then via
                ``self._get_curie_prefix`` when ``self.get_curies_online`` is
                set - and add them to the namespaces.

        Base URIs for which no prefix is obtained are left out.  An empty
        prefix is stored under the key ``":"`` and the ``xml`` prefix is
        removed from the result.
        """
        # get declared namespaces, keyed by URI
        ns = {str(uri): prefix for prefix, uri in self.G.namespaces()}

        # get other namespaces by extracting base URIs from all URIs
        uri_bases = set()
        for s, p, o in self.G:
            # exclude certain annotation URIs
            # and individuals (SDO.identifier)
            # exclude known annoying URIs (ORCID)
            if (
                p == OWL.versionIRI
                or p == OWL.imports
                or p == SDO.identifier
                or str(o).startswith("https://orcid")
            ):
                continue

            # add only URI subjects (not Blank Nodes)
            if isinstance(s, URIRef):
                uri_bases.add(self._get_namespace_from_uri(str(s)))

            # predicates are always URIs so add them all
            uri_bases.add(self._get_namespace_from_uri(str(p)))

            # add only URI objects (not Blank Nodes or Literals), exclude emails
            if isinstance(o, URIRef) and "@" not in str(o):
                uri_bases.add(self._get_namespace_from_uri(str(o)))

        # the de-duplicated base URIs that have no declared prefix yet
        missing_bases = [base for base in uri_bases if ns.get(base) is None]

        # build the URI -> prefix view of the stored CURIES once, instead of
        # re-scanning the table for every missing namespace; setdefault keeps
        # the first prefix listed for a URI
        # (assumes CURIES values are plain hashable strings - TODO confirm)
        stored_prefixes = {}
        if missing_bases and self.use_curies_stored:
            from pylode.curies import CURIES

            for prefix, uri in CURIES.items():
                stored_prefixes.setdefault(uri, prefix)

        for uri_base in missing_bases:
            # try to match uri_base to stored CURIES first
            if uri_base in stored_prefixes:
                ns[uri_base] = stored_prefixes[uri_base]
            elif self.get_curies_online:
                print(f"getting CURIE for {uri_base} online")
                ns[uri_base] = self._get_curie_prefix(uri_base, list(ns.values()))
            # otherwise no prefix is assigned and the namespace is skipped

        # invert the key/values in instances
        for uri, prefix in sorted(ns.items(), key=lambda x: x[1]):
            if prefix == "":  # can't use empty dict keys in Python
                self.NAMESPACES[":"] = uri
            else:
                self.NAMESPACES[prefix] = uri

        # that bloody XML namespace has to go! (tolerate it not being there)
        self.NAMESPACES.pop("xml", None)