Ejemplo n.º 1
0
def test_clique_merge8():
    """
    Clique merge where equivalence is declared twice over: through the
    ``same_as`` node property and through ``biolink:same_as`` edges.

    Two cliques are expected to collapse onto their HGNC leaders, leaving
    no edges behind.
    """
    ppm = {"biolink:Gene": ["HGNC", "NCBIGene", "ENSEMBL", "OMIM"]}

    # (node id, node attributes) in insertion order
    node_records = [
        ("HGNC:1", {"category": ["biolink:Gene"]}),
        ("OMIM:2", {"category": ["biolink:Gene"], "same_as": ["HGNC:1"]}),
        ("NCBIGene:3", {"category": ["biolink:NamedThing"]}),
        ("ENSEMBL:4", {"category": ["biolink:Gene"], "same_as": ["HGNC:1"]}),
        ("ENSEMBL:6", {"category": ["biolink:Gene"], "same_as": ["NCBIGene:8"]}),
        ("HGNC:7", {"category": ["biolink:Gene"]}),
        ("NCBIGene:8", {"category": ["biolink:Gene"]}),
    ]
    # (subject, object) pairs joined by a same_as edge, in insertion order
    same_as_pairs = [
        ("NCBIGene:3", "HGNC:1"),
        ("ENSEMBL:6", "NCBIGene:8"),
        ("HGNC:7", "NCBIGene:8"),
    ]

    g1 = NxGraph()
    for node_id, node_attrs in node_records:
        g1.add_node(node_id, **node_attrs)
    for subject_id, object_id in same_as_pairs:
        g1.add_edge(
            subject_id,
            object_id,
            edge_key=generate_edge_key(subject_id, "biolink:same_as", object_id),
            predicate="biolink:same_as",
            relation="owl:equivalentClass",
        )

    updated_graph, clique_graph = clique_merge(
        target_graph=g1, prefix_prioritization_map=ppm
    )

    # Only the two clique leaders survive and every same_as edge is gone.
    assert updated_graph.number_of_nodes() == 2
    assert updated_graph.number_of_edges() == 0
    for leader_id in ("HGNC:1", "HGNC:7"):
        assert updated_graph.has_node(leader_id)

    n1 = updated_graph.nodes()["HGNC:1"]
    for member_id in ("OMIM:2", "NCBIGene:3", "ENSEMBL:4"):
        assert member_id in n1["same_as"]

    n2 = updated_graph.nodes()["HGNC:7"]
    for member_id in ("ENSEMBL:6", "NCBIGene:8"):
        assert member_id in n2["same_as"]

    # Every non-leader clique member must have been removed from the graph.
    for merged_id in ("OMIM:2", "NCBIGene:3", "ENSEMBL:4", "ENSEMBL:6", "NCBIGene:8"):
        assert not updated_graph.has_node(merged_id)
Ejemplo n.º 2
0
    def load_edge(self, edge: Dict) -> None:
        """
        Add one edge record to ``self.graph``.

        .. Note::
            Reasoner Std API fields are first copied onto their Biolink Model
            counterparts: ``source_id`` -> ``subject``, ``target_id`` -> ``object``
            and the first element of ``relation_label`` -> ``edge_label``.
            The incoming dictionary is updated in place by this step.

        Parameters
        ----------
        edge : dict
            An edge

        """
        field_aliases = (('source_id', 'subject'), ('target_id', 'object'))
        for api_field, biolink_field in field_aliases:
            if api_field in edge:
                edge[biolink_field] = edge[api_field]
        if 'relation_label' in edge:
            edge['edge_label'] = edge['relation_label'][0]

        validated = self.validate_edge(edge)
        kwargs = PandasTransformer._build_kwargs(validated.copy())

        # An edge without both endpoints cannot be placed in the graph.
        if 'subject' not in kwargs or 'object' not in kwargs:
            logging.info(
                "Ignoring edge with either a missing 'subject' or 'object': {}"
                .format(kwargs))
            return

        subject_id = kwargs['subject']
        object_id = kwargs['object']
        edge_key = generate_edge_key(subject_id, kwargs['edge_label'], object_id)
        self.graph.add_edge(subject_id, object_id, edge_key, **kwargs)
Ejemplo n.º 3
0
    def load_edge(self, edge_record: List) -> Tuple:
        """
        Turn an edge record into the pieces needed to add an edge to a graph.

        The edge data (second element of the record) is updated in place with
        ``provided_by`` (taken from ``self.graph_metadata`` when the edge has
        none) and a generated ``id`` (when the edge has none) before it is
        validated and sanitized.

        Parameters
        ----------
        edge_record: List
            A 4-tuple edge record; elements 0, 1 and 2 are read as the
            subject node, the edge data and the object node

        Returns
        -------
        Tuple
            A tuple with subject ID, object ID, edge key, and edge data

        """
        subject_node, edge, object_node = edge_record[0], edge_record[1], edge_record[2]

        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge:
            edge['provided_by'] = self.graph_metadata['provided_by']
        if 'id' not in edge:
            edge['id'] = generate_uuid()

        # The key is derived from the raw predicate, before validation runs.
        subject_id = subject_node['id']
        predicate = edge['predicate']
        object_id = object_node['id']
        key = generate_edge_key(subject_id, predicate, object_id)

        validated = validate_edge(edge)
        edge = sanitize_import(validated.copy())
        self.edge_properties.update(edge.keys())
        return subject_id, object_id, key, edge
Ejemplo n.º 4
0
    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Validate and sanitize an edge, then describe it as a graph-ready tuple.

        Parameters
        ----------
        edge: Dict
            An edge

        Returns
        -------
        Optional[Tuple]
            A tuple that contains subject id, object id, edge key, and edge data;
            ``None`` when the edge does not pass ``self.check_edge_filter``

        """
        validated = validate_edge(edge)
        edge_data = sanitize_import(validated.copy())
        if 'id' not in edge_data:
            edge_data['id'] = generate_uuid()

        subject_id = edge_data['subject']
        object_id = edge_data['object']
        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
            edge_data['provided_by'] = self.graph_metadata['provided_by']

        key = generate_edge_key(subject_id, edge_data['predicate'], object_id)
        # Property names are recorded even for edges that are filtered out below.
        self.edge_properties.update(edge_data.keys())

        if not self.check_edge_filter(edge_data):
            return None

        # NOTE(review): edge keys are recorded in node_properties here; this
        # looks like it may have been meant for edge_properties -- confirm.
        self.node_properties.update(edge_data.keys())
        return subject_id, object_id, key, edge_data
Ejemplo n.º 5
0
    def load_edge(self, edge_record: List) -> Tuple:
        """
        Turn an edge record into the pieces needed to add an edge to a graph.

        Provenance is applied through ``self.set_edge_provenance`` and an ``id``
        is generated when missing; both happen on the record's edge data before
        validation.

        Parameters
        ----------
        edge_record: List
            A 4-tuple edge record; elements 0, 1 and 2 are read as the
            subject node, the edge data and the object node

        Returns
        -------
        Tuple
            A tuple with subject ID, object ID, edge key, and edge data,
            or an empty tuple when ``self.validate_edge`` returns a falsy value

        """
        subject_node, edge_data, object_node = (
            edge_record[0],
            edge_record[1],
            edge_record[2],
        )

        self.set_edge_provenance(edge_data)
        if "id" not in edge_data:
            edge_data["id"] = generate_uuid()

        # The key is built from the raw predicate, before validation runs.
        subject_id = subject_node["id"]
        predicate = edge_data["predicate"]
        object_id = object_node["id"]
        key = generate_edge_key(subject_id, predicate, object_id)

        validated = self.validate_edge(edge_data)
        if not validated:
            return ()

        sanitized = sanitize_import(validated.copy())
        self.edge_properties.update(sanitized.keys())
        return subject_id, object_id, key, sanitized
Ejemplo n.º 6
0
    def load_ontologies(self):
        """
        Parse every ontology in ``self.ontologies`` and fold it into
        ``self.ontology_graph``.

        For each ontology, ``rdfs:subClassOf`` triples become ``subclass_of``
        edges between contracted identifiers, and ``rdfs:label`` triples
        populate ``self.curie_map`` and the ``name`` attribute of the
        corresponding node (with spaces replaced by underscores).
        """
        for ontology_source in self.ontologies.values():
            rdfgraph = rdflib.Graph()
            rdfgraph.parse(
                ontology_source,
                format=rdflib.util.guess_format(ontology_source),
            )

            # Class hierarchy: one edge per subClassOf triple.
            subclass_pattern = (None, rdflib.RDFS.subClassOf, None)
            for child_iri, _, parent_iri in rdfgraph.triples(subclass_pattern):
                child = contract(child_iri)
                parent = contract(parent_iri)
                self.ontology_graph.add_node(child)
                self.ontology_graph.add_node(parent)
                edge_key = generate_edge_key(child, 'subclass_of', parent)
                self.ontology_graph.add_edge(
                    child,
                    parent,
                    edge_key,
                    edge_label='subclass_of',
                    relation='rdfs:subClassOf',
                )

            # Labels: remember a snake_case-like name for every labelled term.
            label_pattern = (None, rdflib.RDFS.label, None)
            for term_iri, _, label in rdfgraph.triples(label_pattern):
                curie = contract(term_iri)
                name = label.value.replace(' ', '_')
                self.curie_map[curie] = name
                self.ontology_graph.add_node(curie, name=name)
Ejemplo n.º 7
0
    def load_edge(self, edge: Dict) -> None:
        """
        Add one edge record to ``self.graph`` if it passes the edge filters.

        Edges rejected by ``self.check_edge_filter`` and edges lacking a
        ``subject`` or ``object`` after processing are logged and dropped.

        Parameters
        ----------
        edge : dict
            An edge

        """
        if not self.check_edge_filter(edge):
            logging.debug(f"Edge fails edge filters: {edge}")
            return

        validated = Transformer.validate_edge(edge)
        kwargs = PandasTransformer._build_kwargs(validated.copy())
        if 'subject' not in kwargs or 'object' not in kwargs:
            logging.info(
                "Ignoring edge with either a missing 'subject' or 'object': {}"
                .format(kwargs))
            return

        subject_id = kwargs['subject']
        object_id = kwargs['object']
        # Graph-level provenance only fills in; it never overrides the edge's own.
        if 'provided_by' in self.graph_metadata and 'provided_by' not in kwargs:
            kwargs['provided_by'] = self.graph_metadata['provided_by']

        edge_key = generate_edge_key(subject_id, kwargs['edge_label'], object_id)
        self.graph.add_edge(subject_id, object_id, edge_key, **kwargs)
        self._edge_properties.update(list(kwargs.keys()))
Ejemplo n.º 8
0
    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Validate and sanitize an edge, then describe it as a graph-ready tuple.

        Parameters
        ----------
        edge: Dict
            An edge

        Returns
        -------
        Optional[Tuple]
            A tuple that contains subject id, object id, edge key, and edge data;
            ``None`` when validation yields a falsy value or the edge does not
            pass ``self.check_edge_filter``

        """
        validated = self.validate_edge(edge)
        if not validated:
            return None

        edge_data = sanitize_import(validated.copy(), self.list_delimiter)
        if "id" not in edge_data:
            edge_data["id"] = generate_uuid()

        subject_id = edge_data["subject"]
        object_id = edge_data["object"]
        self.set_edge_provenance(edge_data)

        key = generate_edge_key(subject_id, edge_data["predicate"], object_id)
        # Property names are recorded even for edges that are filtered out below.
        self.edge_properties.update(edge_data.keys())

        if not self.check_edge_filter(edge_data):
            return None

        # NOTE(review): edge keys are recorded in node_properties here; this
        # looks like it may have been meant for edge_properties -- confirm.
        self.node_properties.update(edge_data.keys())
        return subject_id, object_id, key, edge_data
Ejemplo n.º 9
0
    def add_edge(self, subject_iri: URIRef, object_iri: URIRef, predicate_iri: URIRef) -> Tuple[str, str, str]:
        """
        This method should be used by all derived classes when adding an edge to the networkx.MultiDiGraph.
        This ensures that the `subject` and `object` identifiers are CURIEs, and that `edge_label` is in the correct form.

        Returns the CURIE identifiers used for the `subject` and `object` in the
        networkx.MultiDiGraph, and the processed `edge_label`.

        The `edge_label` is derived from the predicate IRI in these steps:

        1. spaces are replaced with underscores,
        2. a leading ``self.BIOLINK`` prefix is removed,
        3. if the result is still a CURIE, it is replaced by its looked-up name,
           or by ``self.DEFAULT_EDGE_LABEL`` when the lookup yields nothing.

        The edge is only added when no edge with the same key exists between
        the two nodes. The stored ``edge_label`` attribute carries a
        ``biolink:`` prefix, while the edge key and the returned label use the
        unprefixed form.

        Parameters
        ----------
        subject_iri: rdflib.URIRef
            Subject IRI for the subject in a triple
        object_iri: rdflib.URIRef
            Object IRI for the object in a triple
        predicate_iri: rdflib.URIRef
            Predicate IRI for the predicate in a triple

        Returns
        -------
        Tuple[str, str, str]
            A 3-nary tuple (of the form subject, object, predicate) that represents the edge

        """
        s = self.add_node(subject_iri)
        o = self.add_node(object_iri)
        relation = self.prefix_manager.contract(predicate_iri)
        edge_label = process_iri(predicate_iri)
        if ' ' in edge_label:
            logging.debug("predicate IRI '{}' yields edge_label '{}' that not in snake_case form; replacing ' ' with '_'".format(predicate_iri, edge_label))
            # Perform the replacement that the message above announces.
            edge_label = edge_label.replace(' ', '_')
        if edge_label.startswith(self.BIOLINK):
            logging.debug("predicate IRI '{}' yields edge_label '{}' that starts with '{}'; removing IRI prefix".format(predicate_iri, edge_label, self.BIOLINK))
            edge_label = edge_label.replace(self.BIOLINK, '')

        if PrefixManager.is_curie(edge_label):
            # A CURIE is not a usable label; try to resolve it to a name.
            name = curie_lookup(edge_label)
            if name:
                logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; Using its mapping instead: {}".format(predicate_iri, edge_label, name))
                edge_label = name
            else:
                logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; defaulting back to {}".format(predicate_iri, edge_label, self.DEFAULT_EDGE_LABEL))
                edge_label = self.DEFAULT_EDGE_LABEL

        kwargs = {
            'subject': s,
            'predicate': str(predicate_iri),
            'object': o,
            'relation': relation,
            'edge_label': f"biolink:{edge_label}"
        }
        if 'provided_by' in self.graph_metadata:
            kwargs['provided_by'] = self.graph_metadata['provided_by']

        # The key uses the unprefixed label; an existing edge is left untouched.
        key = generate_edge_key(s, edge_label, o)
        if not self.graph.has_edge(s, o, key=key):
            self.graph.add_edge(s, o, key=key, **kwargs)
        # TODO: support append
        return s, o, edge_label
Ejemplo n.º 10
0
    def write_edge(self, record: Dict) -> None:
        """
        Add an edge record to ``self.graph``.

        The record's own ``key`` is used as the edge key when present;
        otherwise a key is generated from its subject, predicate and object.
        The whole record is stored as the edge's attributes.

        Parameters
        ----------
        record: Dict
            An edge record

        """
        if 'key' not in record:
            edge_key = generate_edge_key(
                record['subject'], record['predicate'], record['object'])
        else:
            edge_key = record['key']

        source_id = record['subject']
        target_id = record['object']
        self.graph.add_edge(source_id, target_id, edge_key, **record)
Ejemplo n.º 11
0
    def add_edge_attribute(self, subject_iri: Union[URIRef, str],
                           object_iri: URIRef, predicate_iri: URIRef, key: str,
                           value: str) -> None:
        """
        Set an attribute on the edge identified by subject, predicate and object.

        The attribute name is resolved first: a ``key`` whose lower-cased form
        appears in ``is_property_multivalued`` is used lower-cased; any other
        ``key`` is converted to a ``rdflib.URIRef`` (if it is not one already)
        and looked up in ``property_mapping``. When no property name can be
        resolved, nothing is done.

        The attribute is then applied via ``self._add_attribute`` to the edge
        data fetched from ``self.graph``.

        NOTE(review): earlier documentation stated that missing nodes and a
        missing edge are created on demand; this method itself only looks the
        edge up, so confirm whether ``_add_attribute`` handles a missing edge.

        Parameters
        ----------
        subject_iri: [rdflib.URIRef, str]
            The IRI of the subject node of an edge in rdflib.Graph
        object_iri: rdflib.URIRef
            The IRI of the object node of an edge in rdflib.Graph
        predicate_iri: rdflib.URIRef
            The IRI of the predicate representing an edge in rdflib.Graph
        key: str
            The name of the attribute. Can be a rdflib.URIRef or URI string
        value: str
            The value of the attribute

        """
        lowered_key = key.lower()
        if lowered_key in is_property_multivalued:
            key = lowered_key
        else:
            key_iri = key if isinstance(key, URIRef) else URIRef(key)
            key = property_mapping.get(key_iri)

        if key is None:
            # Unknown property: there is no attribute name to store under.
            return

        subject_curie = make_curie(subject_iri)
        object_curie = make_curie(object_iri)

        edge_label = process_iri(predicate_iri)
        if is_curie(edge_label):
            edge_label = curie_lookup(edge_label)

        edge_key = generate_edge_key(subject_curie, edge_label, object_curie)
        attr_dict = self.graph.get_edge_data(
            subject_curie, object_curie, key=edge_key)
        self._add_attribute(attr_dict, key, value)
Ejemplo n.º 12
0
    def write_edge(self, record: Dict) -> None:
        """
        Add an edge record to ``self.graph``.

        The record's own ``key`` is used as the edge key when present;
        otherwise a key is generated from its subject, predicate and object.
        The whole record is stored as the edge's attributes.

        Parameters
        ----------
        record: Dict
            An edge record

        """
        edge_key = (
            record["key"]
            if "key" in record
            else generate_edge_key(
                record["subject"], record["predicate"], record["object"]
            )
        )
        source_id, target_id = record["subject"], record["object"]
        self.graph.add_edge(source_id, target_id, edge_key, **record)
Ejemplo n.º 13
0
    def load_edge(self, edge: Dict) -> None:
        """
        Add one edge record to ``self.graph``.

        The record is validated and converted to keyword arguments; an edge
        that ends up without a ``subject`` or ``object`` is logged and dropped.

        Parameters
        ----------
        edge : dict
            An edge

        """
        validated = Transformer.validate_edge(edge)
        kwargs = PandasTransformer._build_kwargs(validated.copy())

        # An edge without both endpoints cannot be placed in the graph.
        if 'subject' not in kwargs or 'object' not in kwargs:
            logging.info(
                "Ignoring edge with either a missing 'subject' or 'object': {}"
                .format(kwargs))
            return

        subject_id, object_id = kwargs['subject'], kwargs['object']
        edge_key = generate_edge_key(subject_id, kwargs['edge_label'], object_id)
        self.graph.add_edge(subject_id, object_id, edge_key, **kwargs)
Ejemplo n.º 14
0
    def load_edge(self, edge: Relationship) -> None:
        """
        Add a neo4jrestclient.client.Relationship to ``self.graph`` as an edge.

        Node identifiers come from each end node's ``id`` property when it has
        one, falling back to the node's ``id`` attribute. The relationship's
        properties become the edge attributes; ``subject``, ``object`` and
        ``edge_label`` are filled in from the relationship itself when the
        properties do not provide them. End nodes missing from the graph are
        loaded through ``self.load_node`` first.

        Parameters
        ----------
        edge: neo4jrestclient.client.Relationship
            An edge

        """
        start_node = edge.start
        properties = edge.properties
        end_node = edge.end

        if 'id' in start_node:
            subject_id = start_node['id']
        else:
            subject_id = start_node.id
        if 'id' in end_node:
            object_id = end_node['id']
        else:
            object_id = end_node.id

        attributes = {name: value for name, value in properties.items()}
        attributes.setdefault('subject', subject_id)
        attributes.setdefault('object', object_id)
        if 'edge_label' not in attributes:
            # Fall back to the relationship type when no label is stored.
            attributes['edge_label'] = edge.type

        if not self.graph.has_node(subject_id):
            self.load_node(start_node)
        if not self.graph.has_node(object_id):
            self.load_node(end_node)

        edge_key = generate_edge_key(
            subject_id, attributes['edge_label'], object_id)
        self.graph.add_edge(subject_id, object_id, edge_key, **attributes)
Ejemplo n.º 15
0
def consolidate_edges(
    target_graph: BaseGraph, clique_graph: nx.MultiDiGraph, leader_annotation: str
) -> BaseGraph:
    """
    Move all edges from nodes in a clique to the clique leader.

    Cliques are the strongly connected components of ``clique_graph``. For each
    clique that has an elected leader:

    - the leader annotation and ``election_strategy`` are copied from the clique
      graph onto the leader node in ``target_graph``;
    - every non-``SAME_AS`` edge touching a non-leader member is removed and
      re-added with the leader in the member's place (a ``SUBCLASS_OF`` edge that
      would become a self-loop is dropped instead);
    - every ``SAME_AS`` edge touching a non-leader member is removed and its
      endpoints (other than the leader) are collected as equivalent identifiers;
    - the collected identifiers, together with the leader's neighbors in
      ``clique_graph``, are stored as ``same_as`` on the leader and the
      corresponding nodes are removed from ``target_graph``.

    Original subject and object of an edge are preserved via ``ORIGINAL_SUBJECT_PROPERTY`` and ``ORIGINAL_OBJECT_PROPERTY``

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    clique_graph: networkx.MultiDiGraph
        The clique graph
    leader_annotation: str
        The field on a node that signifies that the node is the leader of a clique

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The target graph where all edges from nodes in a clique are moved to clique leader

    """
    cliques = list(nx.strongly_connected_components(clique_graph))
    log.info(f"Consolidating edges in {len(cliques)} cliques")
    for clique in cliques:
        log.debug(f"Processing clique: {clique}")
        leaders: List = [
            x
            for x in clique
            if leader_annotation in clique_graph.nodes()[x]
            and clique_graph.nodes()[x][leader_annotation]
        ]
        if len(leaders) == 0:
            log.debug(f"No leader elected for clique {clique}; skipping")
            continue
        # When several nodes are flagged, the first one found is used.
        leader: str = leaders[0]
        # update nodes in target graph
        target_graph.set_node_attributes(
            target_graph,
            {
                leader: {
                    leader_annotation: clique_graph.nodes()[leader].get(
                        leader_annotation
                    ),
                    "election_strategy": clique_graph.nodes()[leader].get(
                        "election_strategy"
                    ),
                }
            },
        )
        # Seed the equivalents with the leader's direct neighbors in the clique graph.
        leader_equivalent_identifiers = set(clique_graph.neighbors(leader))
        for node in clique:
            if node == leader:
                continue
            log.debug(f"Looking for in_edges for {node}")
            in_edges = target_graph.in_edges(node, keys=False, data=True)
            filtered_in_edges = [x for x in in_edges if x[2]["predicate"] != SAME_AS]
            equiv_in_edges = [x for x in in_edges if x[2]["predicate"] == SAME_AS]
            log.debug(f"Moving {len(in_edges)} in-edges from {node} to {leader}")
            for u, v, edge_data in filtered_in_edges:
                # Drop the edge under its old key, then re-point its object at the leader.
                key = generate_edge_key(u, edge_data["predicate"], v)
                target_graph.remove_edge(u, v, edge_key=key)
                edge_data[ORIGINAL_SUBJECT_PROPERTY] = edge_data["subject"]
                edge_data[ORIGINAL_OBJECT_PROPERTY] = edge_data["object"]
                edge_data["object"] = leader
                key = generate_edge_key(u, edge_data["predicate"], leader)
                if (
                    edge_data["subject"] == edge_data["object"]
                    and edge_data["predicate"] == SUBCLASS_OF
                ):
                    # A subclass_of self-loop on the leader is not re-added.
                    continue
                target_graph.add_edge(
                    edge_data["subject"], edge_data["object"], key, **edge_data
                )

            log.debug(f"Looking for out_edges for {node}")
            out_edges = target_graph.out_edges(node, keys=False, data=True)
            filtered_out_edges = [x for x in out_edges if x[2]["predicate"] != SAME_AS]
            equiv_out_edges = [x for x in out_edges if x[2]["predicate"] == SAME_AS]
            log.debug(f"Moving {len(out_edges)} out-edges from {node} to {leader}")
            for u, v, edge_data in filtered_out_edges:
                # Drop the edge under its old key, then re-point its subject at the leader.
                key = generate_edge_key(u, edge_data["predicate"], v)
                target_graph.remove_edge(u, v, edge_key=key)
                edge_data[ORIGINAL_SUBJECT_PROPERTY] = edge_data["subject"]
                edge_data[ORIGINAL_OBJECT_PROPERTY] = edge_data["object"]
                edge_data["subject"] = leader
                key = generate_edge_key(leader, edge_data["predicate"], v)
                if (
                    edge_data["subject"] == edge_data["object"]
                    and edge_data["predicate"] == SUBCLASS_OF
                ):
                    # A subclass_of self-loop on the leader is not re-added.
                    continue
                target_graph.add_edge(
                    edge_data["subject"], edge_data["object"], key, **edge_data
                )

            # Log the in-edges here: this loop consumes equiv_in_edges.
            log.debug(f"equiv in edges: {equiv_in_edges}")
            equivalent_identifiers = set()
            for u, v, edge_data in equiv_in_edges:
                if u != leader:
                    equivalent_identifiers.add(u)
                if v != leader:
                    equivalent_identifiers.add(v)
                target_graph.remove_edge(
                    u, v, edge_key=generate_edge_key(u, SAME_AS, v)
                )

            log.debug(f"equiv out edges: {equiv_out_edges}")
            for u, v, edge_data in equiv_out_edges:
                if u != leader:
                    log.debug(f"{u} is an equivalent identifier of leader {leader}")
                    equivalent_identifiers.add(u)
                if v != leader:
                    log.debug(f"{v} is an equivalent identifier of leader {leader}")
                    equivalent_identifiers.add(v)
                target_graph.remove_edge(
                    u, v, edge_key=generate_edge_key(u, SAME_AS, v)
                )

            leader_equivalent_identifiers.update(equivalent_identifiers)

        log.debug(
            f"setting same_as property to leader node with {leader_equivalent_identifiers}"
        )
        target_graph.set_node_attributes(
            target_graph, {leader: {"same_as": list(leader_equivalent_identifiers)}}
        )
        log.debug(
            f"removing equivalent nodes of leader: {leader_equivalent_identifiers}"
        )
        # The equivalents now live on the leader's same_as list; drop their nodes.
        for n in leader_equivalent_identifiers:
            target_graph.remove_node(n)
    return target_graph
Ejemplo n.º 16
0
def test_generate_edge_key():
    """
    The edge key is the subject, predicate and object joined by hyphens.
    """
    expected_key = 'S:CURIE-related_to-O:CURIE'
    assert generate_edge_key('S:CURIE', 'related_to', 'O:CURIE') == expected_key
Ejemplo n.º 17
0
def test_clique_merge7():
    """
    Clique merge where each clique contains one node whose category is
    disjoint from the rest of the clique; that node must be left out of the
    merge and stay in the graph.
    """
    ppm = {'biolink:Gene': ['HGNC', 'NCBIGene', 'ENSEMBL', 'OMIM']}

    # (node id, category) in insertion order
    node_categories = [
        ('HGNC:1', 'biolink:Gene'),
        ('OMIM:2', 'biolink:Disease'),
        ('NCBIGene:3', 'biolink:NamedThing'),
        ('ENSEMBL:4', 'biolink:Gene'),
        ('ENSEMBL:6', 'biolink:Gene'),
        ('HGNC:7', 'biolink:Disease'),
        ('NCBIGene:8', 'biolink:Gene'),
    ]
    # (subject, object) pairs joined by a same_as edge, in insertion order
    same_as_pairs = [
        ('ENSEMBL:4', 'HGNC:1'),
        ('NCBIGene:3', 'HGNC:1'),
        ('OMIM:2', 'HGNC:1'),
        ('ENSEMBL:6', 'NCBIGene:8'),
        ('HGNC:7', 'NCBIGene:8'),
    ]

    g1 = NxGraph()
    for node_id, category in node_categories:
        g1.add_node(node_id, category=[category])
    for subject_id, object_id in same_as_pairs:
        g1.add_edge(
            subject_id,
            object_id,
            edge_key=generate_edge_key(subject_id, 'biolink:same_as', object_id),
            predicate='biolink:same_as',
            relation='owl:equivalentClass',
        )

    updated_graph, clique_graph = clique_merge(target_graph=g1,
                                               prefix_prioritization_map=ppm)

    # Two leaders plus the two category-incompatible nodes remain.
    assert updated_graph.number_of_nodes() == 4
    assert updated_graph.number_of_edges() == 2
    assert updated_graph.has_node('HGNC:1')
    assert updated_graph.has_node('NCBIGene:8')

    n1 = updated_graph.nodes()['HGNC:1']
    for member_id in ('NCBIGene:3', 'ENSEMBL:4'):
        assert member_id in n1['same_as']
    assert 'OMIM:2' not in n1['same_as']

    n2 = updated_graph.nodes()['NCBIGene:8']
    assert 'ENSEMBL:6' in n2['same_as']

    assert updated_graph.has_node('OMIM:2')
    for merged_id in ('NCBIGene:3', 'ENSEMBL:4'):
        assert not updated_graph.has_node(merged_id)
    assert updated_graph.has_node('HGNC:7')
Ejemplo n.º 18
0
def test_clique_merge9():
    """
    Clique merge where same_as appears both as a node property and as an
    edge, and where an invalid node (disjoint category) also carries a
    same_as property and takes part in a same_as edge.
    """
    ppm = {'biolink:Gene': ['HGNC', 'NCBIGene', 'ENSEMBL', 'OMIM']}

    # (node id, node attributes) in insertion order
    node_records = [
        ('HGNC:1', {'category': ['biolink:Gene']}),
        ('OMIM:2', {'category': ['biolink:Disease'], 'same_as': ['HGNC:1']}),
        ('NCBIGene:3', {'category': ['biolink:NamedThing']}),
        ('ENSEMBL:4', {'category': ['biolink:Gene'], 'same_as': ['HGNC:1']}),
        ('ENSEMBL:6', {'category': ['biolink:Gene'], 'same_as': ['NCBIGene:8']}),
        ('HGNC:7', {'category': ['biolink:Gene']}),
        ('NCBIGene:8', {'category': ['biolink:Gene']}),
    ]
    # (subject, object) pairs joined by a same_as edge, in insertion order
    same_as_pairs = [
        ('X:00001', 'OMIM:2'),
        ('NCBIGene:3', 'HGNC:1'),
        ('ENSEMBL:6', 'NCBIGene:8'),
        ('HGNC:7', 'NCBIGene:8'),
    ]

    g1 = NxGraph()
    for node_id, node_attrs in node_records:
        g1.add_node(node_id, **node_attrs)
    for subject_id, object_id in same_as_pairs:
        g1.add_edge(
            subject_id,
            object_id,
            edge_key=generate_edge_key(subject_id, 'biolink:same_as', object_id),
            predicate='biolink:same_as',
            relation='owl:equivalentClass',
        )

    updated_graph, clique_graph = clique_merge(target_graph=g1,
                                               prefix_prioritization_map=ppm)

    assert updated_graph.number_of_nodes() == 4
    assert updated_graph.number_of_edges() == 1
    for leader_id in ('HGNC:1', 'HGNC:7'):
        assert updated_graph.has_node(leader_id)

    n1 = updated_graph.nodes()['HGNC:1']
    assert 'OMIM:2' not in n1['same_as']
    for member_id in ('NCBIGene:3', 'ENSEMBL:4'):
        assert member_id in n1['same_as']

    n2 = updated_graph.nodes()['HGNC:7']
    for member_id in ('ENSEMBL:6', 'NCBIGene:8'):
        assert member_id in n2['same_as']

    # The invalid node is not merged away.
    assert updated_graph.has_node('OMIM:2')
Ejemplo n.º 19
0
def test_generate_edge_key():
    """
    Check that generate_edge_key joins subject, predicate and object with hyphens.
    """
    subject_id, predicate, object_id = "S:CURIE", "related_to", "O:CURIE"
    generated = generate_edge_key(subject_id, predicate, object_id)
    assert generated == "S:CURIE-related_to-O:CURIE"
Ejemplo n.º 20
0
    def triple(self, s: URIRef, p: URIRef, o: URIRef) -> None:
        """
        Parse a triple.

        The triple is routed either to a node attribute or to an edge,
        depending on its subject, predicate and object. Once the edge cache
        holds at least ``CACHE_SIZE`` entries, pending reified nodes are
        dereified and the cached edges are validated, sanitized and emitted.

        .. Note::
            Despite the ``None`` return annotation this is a generator
            function: calling it returns a generator. It yields one
            ``(k[0], k[1], k[2], data)`` tuple per cached edge that passes
            the edge filter during a flush (``k`` being the cache key), and
            always yields a final ``None``.

        Parameters
        ----------
        s: URIRef
            Subject
        p: URIRef
            Predicate
        o: URIRef
            Object

        """
        self.count += 1
        (element_uri, canonical_uri, predicate, property_name) = self.process_predicate(p)
        # Choose the attribute key: first truthy of element URI, predicate, property name.
        if element_uri:
            prop_uri = element_uri
        elif predicate:
            prop_uri = predicate
        else:
            prop_uri = property_name

        s_curie = self.prefix_manager.contract(s)
        if s_curie.startswith('biolink') or s_curie.startswith('OBAN'):
            # subjects whose CURIE starts with 'biolink' or 'OBAN' are not loaded
            log.warning(f"Skipping {s} {p} {o}")
        elif s_curie in self.reified_nodes:
            # subject is already known to be a reified node
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif p in self.reification_predicates:
            # a reification predicate marks the subject as a reified node
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif property_name in {'subject', 'predicate', 'object', 'predicate', 'relation'}:
            # a core edge property marks the subject as a reified node
            # ('predicate' is listed twice in the set literal; this has no effect)
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif o in self.reification_types:
            # an object that is a reification type marks the subject as a reified node
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif element_uri and element_uri in self.node_property_predicates:
            # treating predicate as a node property
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (
            p in self.node_property_predicates
            or predicate in self.node_property_predicates
            or property_name in self.node_property_predicates
        ):
            # treating predicate as a node property
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif isinstance(o, rdflib.term.Literal):
            # a literal object is stored as a node attribute
            self.add_node_attribute(s, key=prop_uri, value=o)
        else:
            # treating predicate as an edge
            self.add_edge(s, o, p)

        # Flush once the edge cache has reached its size limit.
        if len(self.edge_cache) >= self.CACHE_SIZE:
            # Dereify every pending reified node, taking its data out of the node cache.
            while self.reified_nodes:
                n = self.reified_nodes.pop()
                data = self.node_cache.pop(n)
                try:
                    self.dereify(n, data)
                except ValueError as e:
                    # Dereification failed: log it and set the node aside.
                    log.info(e)
                    self._incomplete_nodes[n] = data

            # Put the set-aside nodes back so a later flush can retry them.
            for n in self._incomplete_nodes.keys():
                self.node_cache[n] = self._incomplete_nodes[n]
                self.reified_nodes.add(n)
            self._incomplete_nodes.clear()

            for k in self.edge_cache.keys():
                # An edge with neither 'id' nor 'association_id' gets its edge key as id.
                if 'id' not in self.edge_cache[k] and 'association_id' not in self.edge_cache[k]:
                    edge_key = generate_edge_key(
                        self.edge_cache[k]['subject'],
                        self.edge_cache[k]['predicate'],
                        self.edge_cache[k]['object'],
                    )
                    self.edge_cache[k]['id'] = edge_key
                data = self.edge_cache[k]
                data = validate_edge(data)
                data = sanitize_import(data)
                # Graph-level provenance only fills in; it never overrides the edge's own.
                if 'provided_by' in self.graph_metadata and 'provided_by' not in data.keys():
                    data['provided_by'] = self.graph_metadata['provided_by']
                if self.check_edge_filter(data):
                    self.edge_properties.update(data.keys())
                    yield k[0], k[1], k[2], data
            self.edge_cache.clear()
        # Always yield a trailing None, whether or not a flush happened.
        yield None
Ejemplo n.º 21
0
def test_clique_merge1():
    """
    Clique merge over two groups of ``biolink:Gene`` nodes joined by same_as edges.

    All seven nodes carry the same category. The assertions expect the
    merged graph to keep only ``HGNC:1`` and ``HGNC:7``, with every other
    node recorded in the ``same_as`` property of the node that remains.
    """
    ppm = {'biolink:Gene': ['HGNC', 'NCBIGene', 'ENSEMBL', 'OMIM']}
    node_ids = [
        'HGNC:1',
        'OMIM:2',
        'NCBIGene:3',
        'ENSEMBL:4',
        'ENSEMBL:6',
        'HGNC:7',
        'NCBIGene:8',
    ]
    # (subject, object) pairs, in the order the edges are inserted
    same_as_pairs = [
        ('ENSEMBL:4', 'HGNC:1'),
        ('NCBIGene:3', 'HGNC:1'),
        ('OMIM:2', 'HGNC:1'),
        ('ENSEMBL:6', 'NCBIGene:8'),
        ('HGNC:7', 'NCBIGene:8'),
    ]

    g1 = NxGraph()
    for node_id in node_ids:
        # a fresh category list is built for every node
        g1.add_node(node_id, **{'category': ['biolink:Gene']})
    for source, target in same_as_pairs:
        g1.add_edge(
            source,
            target,
            edge_key=generate_edge_key(source, 'biolink:same_as', target),
            **{'predicate': 'biolink:same_as', 'relation': 'owl:equivalentClass'}
        )

    updated_graph, _clique_graph = clique_merge(
        target_graph=g1, prefix_prioritization_map=ppm
    )
    print_graph(updated_graph)
    assert updated_graph.number_of_nodes() == 2
    assert updated_graph.number_of_edges() == 0
    for survivor in ('HGNC:1', 'HGNC:7'):
        assert updated_graph.has_node(survivor)

    first_leader = updated_graph.nodes()['HGNC:1']
    for alias in ('OMIM:2', 'NCBIGene:3', 'ENSEMBL:4'):
        assert alias in first_leader['same_as']

    second_leader = updated_graph.nodes()['HGNC:7']
    for alias in ('ENSEMBL:6', 'NCBIGene:8'):
        assert alias in second_leader['same_as']

    for merged in ('OMIM:2', 'NCBIGene:3', 'ENSEMBL:4', 'ENSEMBL:6', 'NCBIGene:8'):
        assert not updated_graph.has_node(merged)
Ejemplo n.º 22
0
def remap_node_identifier(graph: nx.MultiDiGraph, category: str, alternative_property: str, prefix=None) -> nx.MultiDiGraph:
    """
    Remap a node's 'id' attribute with value from a node's ``alternative_property`` attribute.

    Nodes that have a 'category' attribute not containing ``category`` are
    left alone. The graph is relabelled in place, and every edge whose
    endpoints no longer match its 'subject'/'object' attributes gets those
    attributes and its 'edge_key' attribute refreshed.

    Parameters
    ----------
    graph: networkx.MultiDiGraph
        The graph
    category: string
        category referring to nodes whose 'id' needs to be remapped
    alternative_property: string
        property name from which the new value is pulled from
    prefix: string
        signifies that the value for ``alternative_property`` is a list
        and the ``prefix`` indicates which value to pick from the list

    Returns
    -------
    networkx.MultiDiGraph
        The modified graph

    """
    mapping = {}
    for nid, node_data in graph.nodes(data=True):
        if 'category' in node_data and category not in node_data['category']:
            continue
        if alternative_property not in node_data:
            continue

        alternative_values = node_data[alternative_property]
        if isinstance(alternative_values, (list, set, tuple)):
            if prefix:
                # take the first occurring value that contains the given prefix
                candidates = (v for v in alternative_values if prefix in v)
            else:
                # no prefix defined; pick the 1st one (sets are not indexable,
                # so go through an iterator rather than ``[0]``)
                candidates = iter(alternative_values)
            new_id = next(candidates, None)
            # an empty collection, or no match for the prefix, leaves the node as is
            if new_id is not None:
                mapping[nid] = new_id
        elif isinstance(alternative_values, str):
            if not prefix or alternative_values.startswith(prefix):
                mapping[nid] = alternative_values
        else:
            logging.error(f"Cannot use {alternative_values} from alternative_property {alternative_property}")

    nx.set_node_attributes(graph, values=mapping, name='id')
    nx.relabel_nodes(graph, mapping, copy=False)

    # refresh 'subject'/'object'/'edge_key' on edges whose endpoints were relabelled
    update_edge_keys = {}
    updated_subject_values = {}
    updated_object_values = {}
    for u, v, k, edge_data in graph.edges(keys=True, data=True):
        # compare by value: identity (``is``) on strings depends on interning
        subject_changed = u != edge_data['subject']
        object_changed = v != edge_data['object']
        if subject_changed:
            updated_subject_values[(u, v, k)] = u
        if object_changed:
            updated_object_values[(u, v, k)] = v
        if subject_changed or object_changed:
            update_edge_keys[(u, v, k)] = generate_edge_key(u, edge_data['edge_label'], v)

    nx.set_edge_attributes(graph, values=updated_subject_values, name='subject')
    nx.set_edge_attributes(graph, values=updated_object_values, name='object')
    nx.set_edge_attributes(graph, values=update_edge_keys, name='edge_key')

    return graph
Ejemplo n.º 23
0
    def load_edge(self, edge: Dict) -> Generator:
        """
        Turn one edge record into node and edge records.

        The 'predicate_id' entry is removed from ``edge`` as a side effect.

        Parameters
        ----------
        edge : Dict
            An edge; read for 'subject_id', 'predicate_id' and 'object_id',
            plus any further properties

        Returns
        -------
        Generator
            Yields the results of ``load_node`` for the subject and the object,
            followed by a ``(subject, object, key, edge_data)`` tuple when the
            sanitized edge has both a 'subject' and an 'object'

        """
        element_uri, canonical_uri, predicate, property_name = process_predicate(
            self.prefix_manager, edge['predicate_id'], self.predicate_mapping
        )
        edge_predicate = element_uri or predicate or property_name
        if canonical_uri:
            edge_predicate = element_uri

        data = {
            'subject': edge['subject_id'],
            'predicate': edge_predicate,
            'object': edge['object_id'],
        }
        del edge['predicate_id']
        data = validate_edge(data)

        subject_node = {}
        object_node = {}
        for field, value in edge.items():
            if field not in SSSOM_NODE_PROPERTY_MAPPING:
                # not a node property; keep it on the edge
                data[field] = value
                continue
            if field.startswith('subject'):
                node_record = subject_node
            elif field.startswith('object'):
                node_record = object_node
            else:
                log.info(f"Ignoring {field} {value}")
                continue
            mapped_field = SSSOM_NODE_PROPERTY_MAPPING[field]
            if mapped_field == 'category' and not PrefixManager.is_curie(value):
                value = "biolink:OntologyClass"
            node_record[mapped_field] = value

        records = [self.load_node(subject_node), self.load_node(object_node)]

        for meta_key, meta_value in self.graph_metadata.items():
            if meta_key != 'curie_map':
                data[meta_key] = meta_value

        edge_data = sanitize_import(data.copy())
        if 'subject' in edge_data and 'object' in edge_data:
            if 'id' not in edge_data:
                edge_data['id'] = generate_uuid()
            subject_id = edge_data['subject']
            object_id = edge_data['object']
            if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data.keys():
                edge_data['provided_by'] = self.graph_metadata['provided_by']
            key = generate_edge_key(subject_id, edge_data['predicate'], object_id)
            self.edge_properties.update(list(edge_data.keys()))
            records.append((subject_id, object_id, key, edge_data))
        else:
            log.info(
                "Ignoring edge with either a missing 'subject' or 'object': {}".format(
                    edge_data
                )
            )

        for record in records:
            yield record
Ejemplo n.º 24
0
def test_generate_edge_key():
    """
    Check that generate_edge_key joins subject, predicate and object with '-'.
    """
    expected = 'S:CURIE-related_to-O:CURIE'
    assert generate_edge_key('S:CURIE', 'related_to', 'O:CURIE') == expected
Ejemplo n.º 25
0
def remap_node_identifier(
    graph: BaseGraph, category: str, alternative_property: str, prefix=None
) -> BaseGraph:
    """
    Remap a node's 'id' attribute with value from a node's ``alternative_property`` attribute.

    Nodes that have a 'category' attribute not containing ``category`` are
    left alone. After relabelling, every edge whose endpoints no longer match
    its 'subject'/'object' attributes gets those attributes and its
    'edge_key' attribute refreshed.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    category: string
        category referring to nodes whose 'id' needs to be remapped
    alternative_property: string
        property name from which the new value is pulled from
    prefix: string
        signifies that the value for ``alternative_property`` is a list
        and the ``prefix`` indicates which value to pick from the list

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The modified graph

    """
    mapping: Dict = {}
    for nid, node_data in graph.nodes(data=True):
        if "category" in node_data and category not in node_data["category"]:
            continue
        if alternative_property not in node_data:
            continue

        alternative_values = node_data[alternative_property]
        if isinstance(alternative_values, (list, set, tuple)):
            if prefix:
                # take the first occurring value that contains the given prefix
                candidates = (v for v in alternative_values if prefix in v)
            else:
                # no prefix defined; pick the 1st one
                candidates = iter(alternative_values)
            new_id = next(candidates, None)
            # an empty collection, or no match for the prefix, leaves the node as is
            if new_id is not None:
                mapping[nid] = {"id": new_id}
        elif isinstance(alternative_values, str):
            if not prefix or alternative_values.startswith(prefix):
                mapping[nid] = {"id": alternative_values}
        else:
            log.error(
                f"Cannot use {alternative_values} from alternative_property {alternative_property}"
            )

    graph.set_node_attributes(graph, attributes=mapping)
    graph.relabel_nodes(graph, {old: attrs["id"] for old, attrs in mapping.items()})

    # refresh 'subject'/'object'/'edge_key' on edges whose endpoints were relabelled
    update_edge_keys = {}
    updated_subject_values = {}
    updated_object_values = {}
    for u, v, k, edge_data in graph.edges(data=True, keys=True):
        # compare by value: identity (``is``) on strings depends on interning
        subject_changed = u != edge_data["subject"]
        object_changed = v != edge_data["object"]
        if subject_changed:
            updated_subject_values[(u, v, k)] = {"subject": u}
        if object_changed:
            updated_object_values[(u, v, k)] = {"object": v}
        if subject_changed or object_changed:
            update_edge_keys[(u, v, k)] = {
                "edge_key": generate_edge_key(u, edge_data["predicate"], v)
            }

    graph.set_edge_attributes(graph, attributes=updated_subject_values)
    graph.set_edge_attributes(graph, attributes=updated_object_values)
    graph.set_edge_attributes(graph, attributes=update_edge_keys)
    return graph
Ejemplo n.º 26
0
    def triple(self, s: URIRef, p: URIRef, o: URIRef) -> None:
        """
        Process a single triple and, when the edge cache is full, flush it.

        The triple is either skipped, stored as a node attribute on the
        subject, or stored as an edge. Despite the ``None`` annotation this
        is a generator: when the edge cache has reached ``CACHE_SIZE`` it
        yields ``(subject, object, key, data)`` for each cached edge that
        passes the edge filter, and it always yields ``None`` last.

        Parameters
        ----------
        s: URIRef
            Subject
        p: URIRef
            Predicate
        o: URIRef
            Object

        """
        self.count += 1
        element_uri, _canonical_uri, predicate, property_name = self.process_predicate(p)
        prop_uri = element_uri or predicate or property_name

        s_curie = self.prefix_manager.contract(s)
        if s_curie.startswith(("biolink", "OBAN")):
            log.warning(f"Skipping {s} {p} {o}")
        elif s_curie in self.reified_nodes:
            # subject is already known to be a reified node
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (
            p in self.reification_predicates
            or property_name in {"subject", "predicate", "object", "relation"}
            or o in self.reification_types
        ):
            # subject turns out to be a reified node; remember it
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (
            (element_uri and element_uri in self.node_property_predicates)
            or p in self.node_property_predicates
            or predicate in self.node_property_predicates
            or property_name in self.node_property_predicates
            or isinstance(o, rdflib.term.Literal)
        ):
            # predicate is a node property, or the object is a literal
            self.add_node_attribute(s, key=prop_uri, value=o)
        else:
            # anything else becomes an edge
            self.add_edge(s, o, p)

        if len(self.edge_cache) >= self.CACHE_SIZE:
            # try to dereify every pending reified node
            while self.reified_nodes:
                node_id = self.reified_nodes.pop()
                node_data = self.node_cache.pop(node_id)
                try:
                    self.dereify(node_id, node_data)
                except ValueError as err:
                    self.owner.log_error(
                        entity=str(node_data),
                        error_type=ErrorType.INVALID_EDGE_PROPERTY,
                        message=str(err),
                        message_level=MessageLevel.WARNING,
                    )
                    self._incomplete_nodes[node_id] = node_data

            # nodes that failed go back into the caches for a later attempt
            for node_id, node_data in self._incomplete_nodes.items():
                self.node_cache[node_id] = node_data
                self.reified_nodes.add(node_id)
            self._incomplete_nodes.clear()

            for edge_ref in self.edge_cache:
                cached = self.edge_cache[edge_ref]
                if "id" not in cached and "association_id" not in cached:
                    cached["id"] = generate_edge_key(
                        cached["subject"],
                        cached["predicate"],
                        cached["object"],
                    )
                edge_record = self.validate_edge(cached)
                edge_record = sanitize_import(edge_record)

                self.set_edge_provenance(edge_record)

                if self.check_edge_filter(edge_record):
                    self.edge_properties.update(edge_record.keys())
                    yield edge_ref[0], edge_ref[1], edge_ref[2], edge_record
            self.edge_cache.clear()
        yield None
Ejemplo n.º 27
0
    def add_edge(
        self,
        subject_iri: URIRef,
        object_iri: URIRef,
        predicate_iri: URIRef,
        data: Optional[Dict[Any, Any]] = None,
    ) -> Dict:
        """
        Create or update the cached edge described by a triple.

        Subject and object nodes are taken from the node cache, or created via
        ``add_node`` when absent. When ``data`` is a non-empty dict and the
        edge is new, that same dict is updated in place and stored.

        Parameters
        ----------
        subject_iri: rdflib.URIRef
            Subject IRI for the subject in a triple
        object_iri: rdflib.URIRef
            Object IRI for the object in a triple
        predicate_iri: rdflib.URIRef
            Predicate IRI for the predicate in a triple
        data: Optional[Dict[Any, Any]]
            Additional edge properties

        Returns
        -------
        Dict
            The edge data as stored in the edge cache

        """
        element_uri, _canonical_uri, predicate, property_name = self.process_predicate(
            predicate_iri
        )
        subject_curie = self.prefix_manager.contract(subject_iri)
        object_curie = self.prefix_manager.contract(object_iri)

        subject_node = (
            self.node_cache[subject_curie]
            if subject_curie in self.node_cache
            else self.add_node(subject_iri)
        )
        object_node = (
            self.node_cache[object_curie]
            if object_curie in self.node_cache
            else self.add_node(object_iri)
        )

        edge_predicate = element_uri or predicate or property_name

        if ' ' in edge_predicate:
            # NOTE(review): the message announces a replacement, but nothing
            # in this method actually replaces the spaces -- confirm intent.
            log.debug(
                f"predicate IRI '{predicate_iri}' yields edge_predicate '{edge_predicate}' that not in snake_case form; replacing ' ' with '_'"
            )

        edge_predicate_prefix = self.prefix_manager.get_prefix(edge_predicate)
        recognised_prefixes = {'biolink', 'rdf', 'rdfs', 'skos', 'owl'}
        if edge_predicate_prefix not in recognised_prefixes and PrefixManager.is_curie(
            edge_predicate
        ):
            # a CURIE under any other prefix falls back to the default predicate
            edge_predicate = DEFAULT_EDGE_PREDICATE

        subject_id = subject_node['id']
        object_id = object_node['id']
        edge_key = generate_edge_key(subject_id, edge_predicate, object_id)
        cache_key = (subject_id, object_id, edge_key)

        if cache_key in self.edge_cache:
            # edge already exists; merge the supplied properties into it
            edge_data = self.update_edge(subject_id, object_id, edge_key, data)
        else:
            # brand new edge
            edge_data = data if data else {}
            edge_data.update(
                {
                    'subject': subject_id,
                    'predicate': f"{edge_predicate}",
                    'object': object_id,
                }
            )
            edge_data.setdefault('relation', predicate)

            if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
                edge_data['provided_by'] = self.graph_metadata['provided_by']

        self.edge_cache[cache_key] = edge_data
        return edge_data
Ejemplo n.º 28
0
    def load_edge(self, edge: Dict) -> Generator:
        """
        Turn one edge record into node and edge records.

        The 'predicate_id' entry is removed from ``edge`` as a side effect.
        Nothing is yielded when edge validation returns a falsy value or when
        ``load_node`` returns a falsy value for either node.

        Parameters
        ----------
        edge : Dict
            An edge; read for 'subject_id', 'predicate_id' and 'object_id',
            plus any further properties

        Returns
        -------
        Generator
            Yields the loaded subject node and object node, followed by a
            ``(subject, object, key, edge_data)`` tuple when the sanitized
            edge has both a 'subject' and an 'object'

        """
        element_uri, canonical_uri, predicate, property_name = process_predicate(
            self.prefix_manager, edge["predicate_id"], self.predicate_mapping
        )
        edge_predicate = element_uri or predicate or property_name
        if canonical_uri:
            edge_predicate = element_uri

        data = {
            "subject": edge["subject_id"],
            "predicate": edge_predicate,
            "object": edge["object_id"],
        }
        del edge["predicate_id"]

        data = self.validate_edge(data)
        if not data:
            return  # ?

        subject_props = {}
        object_props = {}
        for field, value in edge.items():
            if field not in SSSOM_NODE_PROPERTY_MAPPING:
                # not a node property; keep it on the edge
                data[field] = value
                continue
            if field.startswith("subject"):
                node_props = subject_props
            elif field.startswith("object"):
                node_props = object_props
            else:
                log.info(f"Ignoring {field} {value}")
                continue
            mapped_field = SSSOM_NODE_PROPERTY_MAPPING[field]
            if mapped_field == "category" and not PrefixManager.is_curie(value):
                value = "biolink:OntologyClass"
            node_props[mapped_field] = value

        subject_node = self.load_node(subject_props)
        object_node = self.load_node(object_props)
        if not (subject_node and object_node):
            return  # ?
        records = [subject_node, object_node]

        for meta_key, meta_value in self.graph_metadata.items():
            if meta_key != "curie_map":
                data[meta_key] = meta_value

        edge_data = sanitize_import(data.copy())
        if "subject" in edge_data and "object" in edge_data:
            if "id" not in edge_data:
                edge_data["id"] = generate_uuid()
            subject_id = edge_data["subject"]
            object_id = edge_data["object"]

            self.set_edge_provenance(edge_data)

            key = generate_edge_key(subject_id, edge_data["predicate"], object_id)
            self.edge_properties.update(list(edge_data.keys()))
            records.append((subject_id, object_id, key, edge_data))
        else:
            self.owner.log_error(
                entity=str(edge_data),
                error_type=ErrorType.MISSING_NODE,
                message="Ignoring edge with either a missing 'subject' or 'object'",
                message_level=MessageLevel.WARNING,
            )

        for record in records:
            yield record
Ejemplo n.º 29
0
    def consolidate_edges(self) -> nx.MultiDiGraph:
        """
        Move all edges from nodes in a clique to the clique leader.

        Cliques are the connected components of ``self.clique_graph``. For
        each clique that has a node flagged with ``LEADER_ANNOTATION``, every
        other member has its non-``SAME_AS`` edges in ``self.target_graph``
        re-pointed at the leader, its ``SAME_AS`` edges removed, and the
        endpoints of those ``SAME_AS`` edges recorded as 'aliases' on the
        leader and then removed from the target graph. Cliques without a
        leader are skipped.

        Returns
        -------
        nx.MultiDiGraph
            The target graph where all edges from nodes in a clique are moved to clique leader

        """
        cliques = list(nx.connected_components(self.clique_graph))
        for clique in cliques:
            logging.info("processing clique: {}".format(clique))
            # members whose leader annotation is present and truthy
            leader = [x for x in clique if LEADER_ANNOTATION in self.clique_graph.nodes[x] and self.clique_graph.nodes[x][LEADER_ANNOTATION]]
            if len(leader) == 0:
                logging.debug("No leader for clique {}; skipping".format(clique))
                continue
            else:
                # if several members are flagged, only the first is used
                leader = leader[0]
            # copy the leader annotation and election strategy onto the target graph node
            nx.set_node_attributes(self.target_graph, {leader: {LEADER_ANNOTATION: self.clique_graph.nodes[leader].get(LEADER_ANNOTATION), 'election_strategy': self.clique_graph.nodes[leader].get('election_strategy')}})
            for node in clique:
                if node == leader:
                    continue
                # incoming edges of this member, split on whether they are SAME_AS edges
                in_edges = self.target_graph.in_edges(node, True)
                filtered_in_edges = [x for x in in_edges if x[2]['edge_label'] != SAME_AS]
                equiv_in_edges = [x for x in in_edges if x[2]['edge_label'] == SAME_AS]
                logging.debug("Moving {} in-edges from {} to {}".format(len(in_edges), node, leader))
                for u, v, edge_data in filtered_in_edges:
                    # NOTE(review): assumes the edge was stored under the key that
                    # generate_edge_key produces for (u, edge_label, v) -- confirm
                    key = generate_edge_key(u, edge_data['edge_label'], v)
                    self.target_graph.remove_edge(u, v, key=key)
                    # keep the pre-merge endpoints on the edge, then re-point its object
                    edge_data['_original_subject'] = edge_data['subject']
                    edge_data['_original_object'] = edge_data['object']
                    edge_data['object'] = leader
                    key = generate_edge_key(u, edge_data['edge_label'], leader)
                    self.target_graph.add_edge(edge_data['subject'], edge_data['object'], key, **edge_data)

                # outgoing edges are read after the in-edges above have been moved
                out_edges = self.target_graph.out_edges(node, True)
                filtered_out_edges = [x for x in out_edges if x[2]['edge_label'] != SAME_AS]
                equiv_out_edges = [x for x in out_edges if x[2]['edge_label'] == SAME_AS]
                logging.debug("Moving {} out-edges from {} to {}".format(len(out_edges), node, leader))
                for u, v, edge_data in filtered_out_edges:
                    key = generate_edge_key(u, edge_data['edge_label'], v)
                    self.target_graph.remove_edge(u, v, key=key)
                    # keep the pre-merge endpoints on the edge, then re-point its subject
                    edge_data['_original_subject'] = edge_data['subject']
                    edge_data['_original_object'] = edge_data['object']
                    edge_data['subject'] = leader
                    key = generate_edge_key(leader, edge_data['edge_label'], v)
                    self.target_graph.add_edge(edge_data['subject'], edge_data['object'], key, **edge_data)

                # start from the aliases already stored on the leader, if any
                aliases = self.target_graph.nodes[leader].get('aliases') if 'aliases' in self.target_graph.nodes[leader] else []

                # every non-leader endpoint of a SAME_AS edge becomes an alias;
                # the list is not de-duplicated
                for u, v, edge_data in equiv_in_edges:
                    if u != leader:
                        aliases.append(u)
                    if v != leader:
                        aliases.append(v)
                    self.target_graph.remove_edge(u, v, key=generate_edge_key(u, SAME_AS, v))

                logging.debug("equiv out edges: {}".format(equiv_out_edges))
                for u, v, edge_data in equiv_out_edges:
                    if u != leader:
                        logging.debug("{} is an alias of leader {}".format(u, leader))
                        aliases.append(u)
                    if v != leader:
                        logging.debug("{} is an alias of leader {}".format(v, leader))
                        aliases.append(v)
                    self.target_graph.remove_edge(u, v, key=generate_edge_key(u, SAME_AS, v))

                # set aliases for leader
                nx.set_node_attributes(self.target_graph, {leader: {'aliases': aliases}})
                # remove all node instances of aliases
                self.target_graph.remove_nodes_from(aliases)

        return self.target_graph
Ejemplo n.º 30
0
def test_clique_merge7():
    """
    Clique merge where each same_as group contains a node of a different category.

    ``OMIM:2`` and ``HGNC:7`` are ``biolink:Disease`` while their same_as
    neighbours are genes. The assertions expect both to stay in the graph as
    separate nodes, with two edges remaining after the merge.
    """
    ppm = {"biolink:Gene": ["HGNC", "NCBIGene", "ENSEMBL", "OMIM"]}
    node_categories = [
        ("HGNC:1", "biolink:Gene"),
        ("OMIM:2", "biolink:Disease"),
        ("NCBIGene:3", "biolink:NamedThing"),
        ("ENSEMBL:4", "biolink:Gene"),
        ("ENSEMBL:6", "biolink:Gene"),
        ("HGNC:7", "biolink:Disease"),
        ("NCBIGene:8", "biolink:Gene"),
    ]
    # (subject, object) pairs, in the order the edges are inserted
    same_as_pairs = [
        ("ENSEMBL:4", "HGNC:1"),
        ("NCBIGene:3", "HGNC:1"),
        ("OMIM:2", "HGNC:1"),
        ("ENSEMBL:6", "NCBIGene:8"),
        ("HGNC:7", "NCBIGene:8"),
    ]

    g1 = NxGraph()
    for node_id, node_category in node_categories:
        g1.add_node(node_id, **{"category": [node_category]})
    for source, target in same_as_pairs:
        g1.add_edge(
            source,
            target,
            edge_key=generate_edge_key(source, "biolink:same_as", target),
            **{"predicate": "biolink:same_as", "relation": "owl:equivalentClass"}
        )

    updated_graph, _clique_graph = clique_merge(
        target_graph=g1, prefix_prioritization_map=ppm
    )
    assert updated_graph.number_of_nodes() == 4
    assert updated_graph.number_of_edges() == 2
    for leader in ("HGNC:1", "NCBIGene:8"):
        assert updated_graph.has_node(leader)

    first_leader = updated_graph.nodes()["HGNC:1"]
    for alias in ("NCBIGene:3", "ENSEMBL:4"):
        assert alias in first_leader["same_as"]
    assert "OMIM:2" not in first_leader["same_as"]

    second_leader = updated_graph.nodes()["NCBIGene:8"]
    assert "ENSEMBL:6" in second_leader["same_as"]

    assert updated_graph.has_node("OMIM:2")
    for merged in ("NCBIGene:3", "ENSEMBL:4"):
        assert not updated_graph.has_node(merged)
    assert updated_graph.has_node("HGNC:7")