Beispiel #1
0
    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Validate and sanitize an edge record and prepare it for loading.

        Parameters
        ----------
        edge: Dict
            An edge

        Returns
        -------
        Optional[Tuple]
            A tuple that contains subject id, object id, edge key, and edge data,
            or ``None`` when the edge does not pass ``check_edge_filter``

        """
        edge = validate_edge(edge)
        # sanitize_import is handed a copy of the validated record.
        edge_data = sanitize_import(edge.copy())
        if 'id' not in edge_data:
            edge_data['id'] = generate_uuid()
        s = edge_data['subject']
        o = edge_data['object']
        # Fall back to the graph-level 'provided_by' when the edge has none.
        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
            edge_data['provided_by'] = self.graph_metadata['provided_by']
        key = generate_edge_key(s, edge_data['predicate'], o)
        # Edge property names are recorded for every edge, whether or not it
        # passes the filter below. They belong in edge_properties only; edge
        # keys must not be added to node_properties.
        self.edge_properties.update(edge_data.keys())
        if not self.check_edge_filter(edge_data):
            return None
        return s, o, key, edge_data
Beispiel #2
0
    def load_edge(self, edge_record: List) -> Tuple:
        """
        Prepare an edge record for loading into an instance of BaseGraph.

        Parameters
        ----------
        edge_record: List
            An edge record; position 0 holds the subject node, position 1 the
            edge and position 2 the object node

        Returns
        -------
        Tuple
            A tuple with subject ID, object ID, edge key, and edge data

        """
        source = edge_record[0]
        record = edge_record[1]
        target = edge_record[2]

        # Defaults are written onto the incoming edge itself, before validation.
        if 'provided_by' in self.graph_metadata and 'provided_by' not in record:
            record['provided_by'] = self.graph_metadata['provided_by']
        if 'id' not in record:
            record['id'] = generate_uuid()

        source_id = source['id']
        predicate = record['predicate']
        target_id = target['id']
        key = generate_edge_key(source_id, predicate, target_id)

        # The sanitized result is built from a copy of the validated edge.
        sanitized = sanitize_import(validate_edge(record).copy())
        self.edge_properties.update(sanitized.keys())
        return source_id, target_id, key, sanitized
Beispiel #3
0
    def read_edges(self) -> Generator:
        """
        Read edges as records from the graph.

        Each edge of ``self.graph`` is validated and sanitized; the graph-level
        ``provided_by`` is applied when the edge has none. Only edges that pass
        ``check_edge_filter`` are yielded.

        Returns
        -------
        Generator
            A generator for edges, yielding ``(u, v, k, edge_data)`` tuples

        """
        for u, v, k, data in self.graph.edges(keys=True, data=True):
            edge_data = validate_edge(data)
            # sanitize_import is handed a copy of the validated record.
            edge_data = sanitize_import(edge_data.copy())
            if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
                edge_data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_edge_filter(edge_data):
                # Edge property names belong in edge_properties, not node_properties.
                self.edge_properties.update(edge_data.keys())
                yield u, v, k, edge_data
Beispiel #4
0
    def triple(self, s: URIRef, p: URIRef, o: URIRef) -> Generator:
        """
        Parse a triple.

        The triple is recorded either as a node attribute (via
        ``add_node_attribute``) or as an edge (via ``add_edge``). Once the edge
        cache holds at least ``CACHE_SIZE`` entries, the reified nodes are
        dereified and the cached edges that pass the edge filter are yielded,
        after which the edge cache is cleared.

        Parameters
        ----------
        s: URIRef
            Subject
        p: URIRef
            Predicate
        o: URIRef
            Object

        Yields
        ------
        Optional[Tuple]
            For each flushed edge, a tuple made of the first three items of its
            cache key and the edge data; a single ``None`` is always yielded last

        """
        self.count += 1
        (element_uri, canonical_uri, predicate, property_name) = self.process_predicate(p)
        # First truthy candidate wins; property_name is the final fallback.
        prop_uri = element_uri or predicate or property_name

        s_curie = self.prefix_manager.contract(s)
        if s_curie.startswith(('biolink', 'OBAN')):
            log.warning(f"Skipping {s} {p} {o}")
        elif s_curie in self.reified_nodes:
            # subject is already known to be a reified node
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (
            p in self.reification_predicates
            or property_name in {'subject', 'predicate', 'object', 'relation'}
            or o in self.reification_types
        ):
            # subject is a reified node seen for the first time
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (
            (element_uri and element_uri in self.node_property_predicates)
            or p in self.node_property_predicates
            or predicate in self.node_property_predicates
            or property_name in self.node_property_predicates
            or isinstance(o, rdflib.term.Literal)
        ):
            # treating predicate as a node property
            self.add_node_attribute(s, key=prop_uri, value=o)
        else:
            # treating predicate as an edge
            self.add_edge(s, o, p)

        if len(self.edge_cache) >= self.CACHE_SIZE:
            while self.reified_nodes:
                n = self.reified_nodes.pop()
                data = self.node_cache.pop(n)
                try:
                    self.dereify(n, data)
                except ValueError as e:
                    # Park the node; it is put back below for a later attempt.
                    log.info(e)
                    self._incomplete_nodes[n] = data

            # Restore nodes that could not be dereified in this pass.
            for n, data in self._incomplete_nodes.items():
                self.node_cache[n] = data
                self.reified_nodes.add(n)
            self._incomplete_nodes.clear()

            for k, data in self.edge_cache.items():
                if 'id' not in data and 'association_id' not in data:
                    # No identifier of either kind: use the edge key as the id.
                    data['id'] = generate_edge_key(
                        data['subject'],
                        data['predicate'],
                        data['object'],
                    )
                data = validate_edge(data)
                data = sanitize_import(data)
                if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
                    data['provided_by'] = self.graph_metadata['provided_by']
                if self.check_edge_filter(data):
                    self.edge_properties.update(data.keys())
                    yield k[0], k[1], k[2], data
            self.edge_cache.clear()
        yield None
Beispiel #5
0
    def parse(
        self,
        filename: str,
        format: str = 'nt',
        compression: Optional[str] = None,
        provided_by: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from RDF N-Triples and yields records.

        .. note::
            To ensure proper parsing of N-Triples and a relatively low memory footprint,
            it is recommended that the N-Triples be sorted based on the subject IRIs.

            ```sort -k 1,2 -t ' ' data.nt > data_sorted.nt```

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``nt``)
        compression: Optional[str]
            The compression type (``gz``)
        provided_by: Optional[str]
            The name of the source providing the input file; stored in
            ``graph_metadata`` as a single-item list
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records: whatever the N-Triples parser yields while
            reading, then ``(key, node data)`` for cached nodes and finally
            4-tuples for cached edges

        """
        p = CustomNTriplesParser(self)
        if provided_by:
            self.graph_metadata['provided_by'] = [provided_by]
        # Open through a context manager so the handle is closed once parsing
        # finishes (or the generator is closed early).
        opener = gzip.open if compression == 'gz' else open
        with opener(filename, 'rb') as stream:
            yield from p.parse(stream)
        log.info(f"Done parsing {filename}")

        # Dereify whatever reified nodes are still pending.
        for n in self.reified_nodes:
            data = self.node_cache.pop(n)
            self.dereify(n, data)

        for k, data in self.node_cache.items():
            # Every node carries the 'biolink:NamedThing' category.
            if 'category' in data:
                if 'biolink:NamedThing' not in set(data['category']):
                    data['category'].append('biolink:NamedThing')
            else:
                data['category'] = ["biolink:NamedThing"]
            data = validate_node(data)
            data = sanitize_import(data)
            if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
                data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_node_filter(data):
                self.node_properties.update(data.keys())
                yield k, data
        self.node_cache.clear()

        for k, data in self.edge_cache.items():
            data = validate_edge(data)
            data = sanitize_import(data)
            if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
                data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_edge_filter(data):
                self.edge_properties.update(data.keys())
                yield k[0], k[1], k[2], data
        self.edge_cache.clear()
Beispiel #6
0
    def load_edge(self, edge: Dict) -> Generator:
        """
        Load an edge into an instance of BaseGraph

        The record's ``predicate_id`` is resolved with ``process_predicate``.
        Keys found in ``SSSOM_NODE_PROPERTY_MAPPING`` are routed to the subject
        or object node according to their prefix; all other keys become edge
        properties. The passed-in ``edge`` dictionary is not modified.

        Parameters
        ----------
        edge : Dict
            An edge

        Returns
        -------
        Generator
            A generator for node and edge records: the results of ``load_node``
            for the subject and the object node, followed by a
            ``(subject, object, key, edge data)`` tuple when the edge has both
            a subject and an object

        """
        (element_uri, canonical_uri, predicate,
         property_name) = process_predicate(self.prefix_manager,
                                            edge['predicate_id'],
                                            self.predicate_mapping)
        # First truthy candidate wins; property_name is the final fallback.
        edge_predicate = element_uri or predicate or property_name
        if canonical_uri:
            # NOTE(review): this re-assigns element_uri, which discards the
            # fallback chosen above when element_uri is empty. Possibly
            # canonical_uri was intended; behavior kept as-is, please confirm.
            edge_predicate = element_uri
        data = {
            'subject': edge['subject_id'],
            'predicate': edge_predicate,
            'object': edge['object_id'],
        }
        data = validate_edge(data)
        subject_node = {}
        object_node = {}
        for k, v in edge.items():
            if k == 'predicate_id':
                # Already consumed above; skipped instead of being deleted from
                # the caller's dictionary.
                continue
            if k not in SSSOM_NODE_PROPERTY_MAPPING:
                data[k] = v
                continue
            if k.startswith('subject'):
                node = subject_node
            elif k.startswith('object'):
                node = object_node
            else:
                log.info(f"Ignoring {k} {v}")
                continue
            mapped_k = SSSOM_NODE_PROPERTY_MAPPING[k]
            if mapped_k == 'category' and not PrefixManager.is_curie(v):
                # Categories that are not CURIEs are replaced by a fixed category.
                v = "biolink:OntologyClass"
            node[mapped_k] = v

        objs = [self.load_node(subject_node), self.load_node(object_node)]

        # Graph-level metadata (except the CURIE map) is copied onto the edge.
        for k, v in self.graph_metadata.items():
            if k != 'curie_map':
                data[k] = v

        edge_data = sanitize_import(data.copy())
        if 'subject' in edge_data and 'object' in edge_data:
            if 'id' not in edge_data:
                edge_data['id'] = generate_uuid()
            s = edge_data['subject']
            o = edge_data['object']
            if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
                edge_data['provided_by'] = self.graph_metadata['provided_by']
            key = generate_edge_key(s, edge_data['predicate'], o)
            self.edge_properties.update(edge_data.keys())
            objs.append((s, o, key, edge_data))
        else:
            log.info(
                "Ignoring edge with either a missing 'subject' or 'object': {}"
                .format(edge_data))

        yield from objs
0
    def load_graph(self, rdfgraph: rdflib.Graph, **kwargs: Any) -> Generator:
        """
        Walk through the rdflib.Graph and load all triples into kgx.graph.base_graph.BaseGraph

        Triples are handled in four passes: ``rdfs:subClassOf`` axioms,
        ``owl:equivalentClass`` axioms between non-blank nodes, triples whose
        subject is an ``owl:ObjectProperty``, and finally every remaining
        triple whose predicate was not covered by an earlier pass. The cached
        nodes and edges are yielded at the end.

        Parameters
        ----------
        rdfgraph: rdflib.Graph
            Graph containing nodes and edges
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records: whatever ``triple`` yields during the
            walk, then ``(key, node data)`` for cached nodes and finally
            4-tuples for cached edges

        """
        # Predicates (and object-property subjects) handled by a dedicated
        # pass; the final catch-all pass skips triples whose predicate is here.
        seen = set()
        seen.add(RDFS.subClassOf)
        for s, p, o in rdfgraph.triples((None, RDFS.subClassOf, None)):
            # ignoring blank nodes
            if isinstance(s, rdflib.term.BNode):
                continue
            pred = None
            parent = None
            os_interpretation = None
            if isinstance(o, rdflib.term.BNode):
                # C SubClassOf R some D
                for x in rdfgraph.objects(o, OWL.onProperty):
                    pred = x
                # owl:someValuesFrom
                for x in rdfgraph.objects(o, OWL.someValuesFrom):
                    os_interpretation = self.OWLSTAR.term(
                        'AllSomeInterpretation')
                    parent = x
                # owl:allValuesFrom
                for x in rdfgraph.objects(o, OWL.allValuesFrom):
                    os_interpretation = self.OWLSTAR.term(
                        "AllOnlyInterpretation")
                    parent = x
                if pred is None or parent is None:
                    log.warning(
                        f"{s} {p} {o} has OWL.onProperty {pred} and OWL.someValuesFrom {parent}"
                    )
                    log.warning(
                        "Do not know how to handle BNode: {}".format(o))
                    continue
            else:
                # C rdfs:subClassOf D (where C and D are named classes)
                pred = p
                parent = o
            if os_interpretation:
                # reify edges that have logical interpretation
                eid = generate_uuid()
                self.reified_nodes.add(eid)
                reified = URIRef(eid)
                statements = (
                    ('category', self.BIOLINK.Association),
                    ('subject', s),
                    ('predicate', pred),
                    ('object', parent),
                    ('logical_interpretation', os_interpretation),
                )
                for term, value in statements:
                    yield from self.triple(reified, self.BIOLINK.term(term), value)
            else:
                yield from self.triple(s, pred, parent)

        seen.add(OWL.equivalentClass)
        for s, p, o in rdfgraph.triples((None, OWL.equivalentClass, None)):
            # A owl:equivalentClass B (where A and B are named classes)
            if not isinstance(o, rdflib.term.BNode):
                yield from self.triple(s, p, o)

        for relation in rdfgraph.subjects(RDF.type, OWL.ObjectProperty):
            seen.add(relation)
            for s, p, o in rdfgraph.triples((relation, None, None)):
                if isinstance(o, rdflib.term.BNode):
                    continue
                if p not in self.excluded_predicates:
                    yield from self.triple(s, p, o)

        # Catch-all pass over everything not handled above.
        for s, p, o in rdfgraph.triples((None, None, None)):
            if isinstance(s, rdflib.term.BNode) or isinstance(o, rdflib.term.BNode):
                continue
            if p in seen:
                continue
            if p in self.excluded_predicates:
                continue
            yield from self.triple(s, p, o)

        # Dereify whatever reified nodes are still pending.
        for n in self.reified_nodes:
            data = self.node_cache.pop(n)
            self.dereify(n, data)

        for k, data in self.node_cache.items():
            node_data = validate_node(data)
            node_data = sanitize_import(node_data)
            if 'provided_by' in self.graph_metadata and 'provided_by' not in node_data:
                node_data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_node_filter(node_data):
                yield k, node_data
        self.node_cache.clear()

        for k, data in self.edge_cache.items():
            edge_data = validate_edge(data)
            edge_data = sanitize_import(edge_data)
            if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
                edge_data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_edge_filter(edge_data):
                yield k[0], k[1], k[2], edge_data
        self.edge_cache.clear()
Beispiel #8
0
def test_validate_correct_edge(edge):
    """
    Check that validating a well-formed edge produces a result other than None.
    """
    assert validate_edge(edge) is not None
Beispiel #9
0
def test_validate_incorrect_edge(edge):
    """
    Check that validating a malformed edge raises a KeyError.
    """
    # The test passes only if validate_edge raises KeyError for this edge.
    with pytest.raises(KeyError):
        validate_edge(edge)