Ejemplo n.º 1
0
    def curiefy_result(self, result):
        """
        Convert subject, predicate and object IRIs to their respective CURIEs, where applicable.

        The ``result`` dictionary is modified in place and is also returned.

        Parameters
        ----------
        result: dict
            A dictionary with 'subject', 'predicate' and 'object' entries, each of
            which is itself a dictionary holding a 'value' (and, for subject and
            object, a 'type')

        Returns
        -------
        dict
            The same ``result`` dictionary, with values replaced by CURIEs where
            ``make_curie`` produced something different from the original value

        """
        # Subject and object are treated identically: only entries typed as 'uri'
        # are candidates for contraction. An unchanged value coming back from
        # make_curie is taken to mean that no CURIE could be produced.
        for term in ('subject', 'object'):
            entry = result[term]
            if entry['type'] != 'uri':
                continue
            curie = make_curie(entry['value'])
            if curie != entry['value']:
                entry['value'] = curie
                entry['type'] = 'curie'
            else:
                logging.warning("Could not CURIEfy {}".format(entry['value']))

        # The predicate is contracted unconditionally and its type is always (re)set
        predicate = result['predicate']
        predicate_curie = make_curie(predicate['value'])
        if predicate_curie != predicate['value']:
            predicate['value'] = predicate_curie
            predicate['type'] = 'curie'
        else:
            predicate['type'] = 'uri'

        return result
Ejemplo n.º 2
0
    def add_edge_attribute(self, subject_iri: Union[URIRef, str],
                           object_iri: URIRef, predicate_iri: URIRef, key: str,
                           value: str) -> None:
        """
        Record an attribute on an edge, honouring whether the attribute is
        single- or multi-valued (the actual storing is delegated to
        `_add_attribute`).

        The key is first resolved to a property name: if its lower-cased form
        is a known property in `is_property_multivalued` that form is used,
        otherwise the key is looked up (as a rdflib.URIRef) in
        `property_mapping`. When no property name can be resolved nothing
        is done.

        The edge is located in the graph through the CURIE forms of
        subject_iri and object_iri, and an edge key generated from them and
        the label derived from predicate_iri.

        NOTE(review): this method only looks the edge up via
        `self.graph.get_edge_data`; it does not add nodes or edges itself.
        Confirm that callers add the edge beforehand.

        Parameters
        ----------
        subject_iri: [rdflib.URIRef, str]
            The IRI of the subject node of the edge
        object_iri: rdflib.URIRef
            The IRI of the object node of the edge
        predicate_iri: rdflib.URIRef
            The IRI of the predicate representing the edge
        key: str
            The name of the attribute. Can be a rdflib.URIRef or URI string
        value: str
            The value of the attribute

        """
        lowered_key = key.lower()
        if lowered_key in is_property_multivalued:
            key = lowered_key
        else:
            key_ref = key if isinstance(key, URIRef) else URIRef(key)
            key = property_mapping.get(key_ref)

        if key is None:
            # The key does not correspond to any known property
            return

        source = make_curie(subject_iri)
        target = make_curie(object_iri)
        label = process_iri(predicate_iri)
        if is_curie(label):
            label = curie_lookup(label)
        attributes = self.graph.get_edge_data(
            source, target, key=generate_edge_key(source, label, target))
        self._add_attribute(attributes, key, value)
Ejemplo n.º 3
0
    def add_node(self, iri: URIRef) -> str:
        """
        Add a node for the given IRI to the graph, unless a node with the same
        CURIE identifier is already present.

        A newly added node is identified by the CURIE form of the IRI, carries
        the original IRI as its 'iri' property and, when the graph metadata
        has one, a 'provided_by' property.

        Parameters
        ----------
        iri : rdflib.URIRef
            IRI of a node

        Returns
        -------
        str
            The CURIE identifier of the node

        """
        node_attributes = {'iri': str(iri)}
        if 'provided_by' in self.graph_metadata:
            node_attributes.update(
                provided_by=self.graph_metadata['provided_by'])

        curie = make_curie(iri)
        if curie in self.graph:
            # Already known; the existing node is left untouched
            return curie

        self.graph.add_node(curie, **node_attributes)
        return curie
Ejemplo n.º 4
0
    def add_edge(self, subject_iri: URIRef, object_iri: URIRef, predicate_iri: URIRef) -> Tuple[str, str, str]:
        """
        Add an edge to the graph for the given triple, unless an edge with the
        same generated key already exists between the two nodes.

        The subject and object nodes are added through `add_node`, so their
        identifiers are CURIEs. The `edge_label` is derived from the predicate
        IRI: a leading `self.BIOLINK` prefix is removed, and a label that is
        itself a CURIE is replaced by its `curie_lookup` mapping, or by
        `self.DEFAULT_EDGE_LABEL` when there is no mapping.

        Parameters
        ----------
        subject_iri: rdflib.URIRef
            Subject IRI for the subject in a triple
        object_iri: rdflib.URIRef
            Object IRI for the object in a triple
        predicate_iri: rdflib.URIRef
            Predicate IRI for the predicate in a triple

        Returns
        -------
        Tuple[str, str, str]
            A 3-nary tuple (subject CURIE, object CURIE, processed edge_label)
            that represents the edge

        """
        s = self.add_node(subject_iri)
        o = self.add_node(object_iri)
        relation = make_curie(predicate_iri)
        edge_label = process_iri(predicate_iri)
        if ' ' in edge_label:
            # NOTE(review): the message announces a replacement, but no replacement
            # is performed in this method; the label is used as-is.
            logging.debug("predicate IRI '{}' yields edge_label '{}' that not in snake_case form; replacing ' ' with '_'".format(predicate_iri, edge_label))
        if edge_label.startswith(self.BIOLINK):
            logging.debug("predicate IRI '{}' yields edge_label '{}' that starts with '{}'; removing IRI prefix".format(predicate_iri, edge_label, self.BIOLINK))
            # Drop only the leading prefix; str.replace would also remove any
            # later occurrence of the same text inside the label.
            edge_label = edge_label[len(self.BIOLINK):]

        if PrefixManager.is_curie(edge_label):
            name = curie_lookup(edge_label)
            if name:
                logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; Using its mapping instead: {}".format(predicate_iri, edge_label, name))
                edge_label = name
            else:
                logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; defaulting back to {}".format(predicate_iri, edge_label, self.DEFAULT_EDGE_LABEL))
                edge_label = self.DEFAULT_EDGE_LABEL

        kwargs = {
            'subject': s,
            'predicate': predicate_iri,
            'object': o,
            'relation': relation,
            'edge_label': edge_label
        }
        if 'provided_by' in self.graph_metadata:
            kwargs['provided_by'] = self.graph_metadata['provided_by']

        # The key makes the edge unique per (subject, edge_label, object)
        key = generate_edge_key(s, edge_label, o)
        if not self.graph.has_edge(s, o, key=key):
            self.graph.add_edge(s, o, key=key, **kwargs)

        return s, o, edge_label
Ejemplo n.º 5
0
    def _add_attribute(self, attr_dict: Dict, key: str, value: str) -> None:
        """
        Store a value under a property name in an attribute dictionary,
        according to whether `is_property_multivalued` marks that property as
        multi-valued.

        A multi-valued property holds a list without duplicates. A
        single-valued property keeps the first value it was given; a later
        value is ignored, except for 'name', where the later value is stored
        under 'synonym' instead.

        Keys that are None or unknown to `is_property_multivalued` are
        discarded with a warning.

        Parameters
        ----------
        attr_dict: dict
            Dictionary representing the attribute set of a node or an edge in a networkx graph
        key: str
            The name of the attribute
        value: str
            The value of the attribute

        """
        if key is None or key not in is_property_multivalued:
            logging.warning(
                "Discarding key {} as it is not a valid property.".format(key))
            return

        value = make_curie(process_iri(value))
        multivalued = is_property_multivalued[key]
        already_present = key in attr_dict

        if multivalued:
            if not already_present:
                attr_dict[key] = [value]
            elif value not in attr_dict[key]:
                attr_dict[key].append(value)
            return

        if not already_present:
            attr_dict[key] = value
        elif key == 'name':
            # A name is already set; the extra value is kept as a synonym
            self._add_attribute(attr_dict, 'synonym', value)
Ejemplo n.º 6
0
    def add_edge(self, subject_iri: URIRef, object_iri: URIRef,
                 predicate_iri: URIRef) -> Tuple[str, str, str]:
        """
        Add an edge to the graph for the given triple, unless an edge keyed by
        the same edge_label already exists between the two nodes.

        The subject and object nodes are added through `add_node`, so their
        identifiers are CURIEs. The edge_label is derived from the predicate
        IRI: a leading `self.BIOLINK` prefix is removed, and a label that still
        contains ':' is treated as a CURIE and replaced by
        `self.DEFAULT_EDGE_LABEL`.

        Parameters
        ----------
        subject_iri: rdflib.URIRef
            Subject IRI for the subject in a triple
        object_iri: rdflib.URIRef
            Object IRI for the object in a triple
        predicate_iri: rdflib.URIRef
            Predicate IRI for the predicate in a triple

        Returns
        -------
        Tuple[str, str, str]
            A 3-nary tuple (subject CURIE, object CURIE, processed edge_label)
            that represents the edge

        """
        s = self.add_node(subject_iri)
        o = self.add_node(object_iri)

        relation = make_curie(predicate_iri)
        edge_label = process_iri(predicate_iri)
        if ' ' in edge_label:
            # NOTE(review): the message announces a replacement, but no replacement
            # is performed in this method; the label is used as-is.
            logging.debug(
                "predicate IRI '{}' yields edge_label '{}' that not in snake_case form; replacing ' ' with '_'"
                .format(predicate_iri, edge_label))
        # TODO: shouldn't this move to the utilities function process_uri()
        if edge_label.startswith(self.BIOLINK):
            logging.debug(
                "predicate IRI '{}' yields edge_label '{}' that starts with '{}'; removing IRI prefix"
                .format(predicate_iri, edge_label, self.BIOLINK))
            # Drop only the leading prefix; str.replace would also remove any
            # later occurrence of the same text inside the label.
            edge_label = edge_label[len(self.BIOLINK):]

        # TODO: is there no way to get label of a CURIE?
        # TODO: this should also move to the utilities function
        # Any service? or preload required ontologies by prefix?
        if ':' in edge_label:
            # Logged once, naming the label that is actually applied
            logging.debug(
                "predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; defaulting back to {}"
                .format(predicate_iri, edge_label, self.DEFAULT_EDGE_LABEL))
            edge_label = self.DEFAULT_EDGE_LABEL

        kwargs = {'relation': relation, 'edge_label': edge_label}
        if 'provided_by' in self.graph_metadata:
            kwargs['provided_by'] = self.graph_metadata['provided_by']

        # The edge_label doubles as the key of the edge between s and o
        if self.graph.has_edge(s, o, key=edge_label):
            logging.debug("{} -- {} --> {} edge already exists".format(
                s, edge_label, o))
        else:
            self.graph.add_edge(s, o, key=edge_label, **kwargs)

        return s, o, edge_label
Ejemplo n.º 7
0
    "shortened life span",
    "http://purl.obolibrary.org/obo/CHEBI_23367": "molecular entity",
    "http://purl.obolibrary.org/obo/CHEBI_23888": "drug",
    "http://purl.obolibrary.org/obo/CHEBI_51086": "chemical role",
    "http://purl.obolibrary.org/obo/UPHENO_0001001": "phenotypic feature",
    "http://purl.obolibrary.org/obo/GO_0008150": "biological_process",
    "http://purl.obolibrary.org/obo/GO_0005575": "cellular component",
    "http://purl.obolibrary.org/obo/SO_0000704": "gene",
    "http://purl.obolibrary.org/obo/SO_0000110": "sequence feature",
    "http://purl.obolibrary.org/obo/GENO_0000536": "genotype",
}

# Collects, per key, the set of values associated with that key.
mapping = defaultdict(set)

# Seed the mapping from iri_mapping: each IRI is converted with make_curie
# and its value is added to the set stored under the resulting key.
for key, value in iri_mapping.items():
    mapping[make_curie(key)].add(value)

# Merge in the toolkit generator's mappings; each value is an iterable whose
# items are added to the set stored under the same key.
for key, value in toolkit.generator.mappings.items():
    mapping[key].update(value)


def walk(node, next_node_generator):
    to_visit = {node: 0}  # Dict[URIRef, Integer]
    visited = {}  # Dict[URIRef, Integer]

    while to_visit != {}:
        m, score = to_visit.popitem()
        visited[m] = score
        for t in next_node_generator(m):
            if isinstance(t, tuple) and len(t) > 1:
                n, s = t
Ejemplo n.º 8
0
    def load_edges(self,
                   association='bl:ChemicalToGeneAssociation',
                   limit=None):
        """
        Load the edges of the given association type from the SPARQL endpoint
        at `self.url`, one page of results at a time.

        A count query is run first to determine how many triples to expect.
        The edge query is then run repeatedly with a fixed page size of 1000.
        For every page the subject and object nodes are loaded through
        `load_nodes`, and each (subject, object, predicate) binding is added
        through `add_edge`. Once all pages are processed `set_categories`
        is called.

        Parameters
        ----------
        association: str
            Value substituted for 'association' in the count and edge query templates
        limit: Optional[int]
            When given, paging stops after the first page whose end offset is
            greater than this value

        """
        sparql = SPARQLWrapper(self.url)
        query = render(self.count_query, {'association': association})
        logging.debug(query)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        count = int(results['results']['bindings'][0]['triples']['value'])
        logging.info("Expected triples for query: {}".format(count))

        step = 1000
        start = 0
        # `end` is the offset just past the current page; it becomes the next `start`
        for end in range(step, count + step, step):
            query = render(self.edge_query, {
                'association': association,
                'offset': start,
                'limit': step
            })
            sparql.setQuery(query)
            bindings = sparql.query().convert()['results']['bindings']
            start = end

            # Load the nodes of this page before adding its edges
            node_list = set()
            for r in bindings:
                node_list.add("<{}>".format(r['subject']['value']))
                node_list.add("<{}>".format(r['object']['value']))
            self.load_nodes(node_list)

            logging.info("Fetching edges...")
            for r in bindings:
                s = r['subject']['value']
                p = r['predicate']['value']
                o = r['object']['value']
                self.add_edge(s, o, p)

            if limit is not None and end > limit:
                break

        self.set_categories()