Beispiel #1
0
    def load_node(self, node: Dict) -> Tuple[str, Dict]:
        """
        Turn a raw node dictionary into an ``(id, data)`` pair.

        The node is passed through ``validate_node`` and a copy of the result
        through ``sanitize_import``. When the graph metadata carries a
        ``provided_by`` entry and the node does not, the graph-level value is
        copied onto the node. The node's keys are recorded in
        ``self.node_properties``.

        Parameters
        ----------
        node: Dict
            A node

        Returns
        -------
        Optional[Tuple[str, Dict]]
            A tuple that contains node id and node data, or ``None`` when the
            sanitized node has no ``id`` key (the node is logged and ignored)

        """
        node = validate_node(node)
        record = sanitize_import(node.copy())

        # Guard clause: a node without an identifier cannot be loaded.
        if 'id' not in record:
            log.info("Ignoring node with no 'id': {}".format(node))
            return None

        node_id = record['id']
        # Fall back to the graph-level provenance when the node has none.
        if 'provided_by' in self.graph_metadata and 'provided_by' not in record:
            record['provided_by'] = self.graph_metadata['provided_by']
        self.node_properties.update(list(record.keys()))
        return node_id, record
Beispiel #2
0
    def read_node(self, node: Dict) -> Optional[Tuple[str, Dict]]:
        """
        Prepare a single node record.

        The node is validated, a copy is sanitized, graph-level
        ``provided_by`` is applied when the node lacks its own, and the node
        is finally checked against the node filters.

        Parameters
        ----------
        node: Dict
            A node

        Returns
        -------
        Optional[Tuple[str, Dict]]
            A tuple that contains node id and node data; ``None`` when the
            node has no ``id`` or does not pass ``check_node_filter``

        """
        node = validate_node(node)
        record = sanitize_import(node.copy())

        if 'id' not in record:
            log.info(f"Ignoring node with no 'id': {node}")
            return None

        node_id = record['id']
        if 'provided_by' in self.graph_metadata and 'provided_by' not in record:
            record['provided_by'] = self.graph_metadata['provided_by']

        # Keys are recorded before the filter runs, so nodes that are later
        # filtered out still contribute their property names.
        self.node_properties.update(list(record.keys()))
        if not self.check_node_filter(record):
            return None

        self.node_properties.update(record.keys())
        return node_id, record
Beispiel #3
0
    def read_nodes(self) -> Generator:
        """
        Iterate over the nodes of ``self.graph`` and yield them as records.

        Nodes for which ``validate_node`` returns a falsy value, or which do
        not pass ``check_node_filter``, are skipped.

        Returns
        -------
        Generator
            A generator of ``(node id, node data)`` tuples

        """
        for node_id, attributes in self.graph.nodes(data=True):
            # The identifier is written onto the attribute mapping handed out
            # by the graph itself (not onto a copy).
            if "id" not in attributes:
                attributes["id"] = node_id

            validated = self.validate_node(attributes)
            if validated:
                record = sanitize_import(validated.copy())
                self.set_node_provenance(record)

                if self.check_node_filter(record):
                    self.node_properties.update(record.keys())
                    yield node_id, record
0
    def load_node(self, node_data: Dict) -> Optional[Tuple[str, Dict]]:
        """
        Turn a raw node dictionary into an ``(id, data)`` pair.

        Parameters
        ----------
        node_data: Dict
            A node

        Returns
        -------
        Optional[Tuple[str, Dict]]
            A tuple that contains node id and node data; ``None`` when
            validation yields a falsy result or the sanitized node has no
            ``id`` (the latter is reported through ``self.owner.log_error``)

        """
        validated = self.validate_node(node_data)
        if not validated:
            return None

        record = sanitize_import(validated.copy())

        # Guard clause: report and drop nodes that lack an identifier.
        if "id" not in record:
            self.owner.log_error(
                entity=str(record),
                error_type=ErrorType.MISSING_NODE_PROPERTY,
                message="Ignoring node with no 'id'",
                message_level=MessageLevel.WARNING
            )
            return None

        node_id = record["id"]
        self.set_node_provenance(record)
        self.node_properties.update(list(record.keys()))
        return node_id, record
0
    def read_node(self, node: Dict) -> Optional[Tuple[str, Dict]]:
        """
        Prepare a single node record.

        Parameters
        ----------
        node: Dict
            A node

        Returns
        -------
        Optional[Tuple[str, Dict]]
            A tuple that contains node id and node data; ``None`` when the
            node fails validation or does not pass ``check_node_filter``
        """
        node = self.validate_node(node)
        if not node:
            return None

        # A validated node is assumed to carry an "id"; a missing key raises.
        record = sanitize_import(node.copy(), self.list_delimiter)
        node_id = record["id"]

        self.set_node_provenance(record)

        # Keys are recorded before the filter runs, so filtered-out nodes
        # still contribute their property names.
        self.node_properties.update(list(record.keys()))
        if not self.check_node_filter(record):
            return None

        self.node_properties.update(record.keys())
        return node_id, record
Beispiel #6
0
    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Load an edge into an instance of BaseGraph.

        The edge is validated, a copy is sanitized (using
        ``self.list_delimiter``), an ``id`` is generated when absent,
        provenance is applied and the edge is checked against the edge
        filters.

        Parameters
        ----------
        edge: Dict
            An edge

        Returns
        -------
        Optional[Tuple]
            A tuple that contains subject id, object id, edge key, and edge
            data; ``None`` when the edge fails validation or does not pass
            ``check_edge_filter``

        """
        edge = self.validate_edge(edge)
        if not edge:
            return None

        edge_data = sanitize_import(edge.copy(), self.list_delimiter)

        if "id" not in edge_data:
            edge_data["id"] = generate_uuid()
        s = edge_data["subject"]
        o = edge_data["object"]

        self.set_edge_provenance(edge_data)

        key = generate_edge_key(s, edge_data["predicate"], o)
        self.edge_properties.update(list(edge_data.keys()))
        if self.check_edge_filter(edge_data):
            # Edge keys belong in edge_properties; they were previously
            # written into node_properties, polluting the node property set.
            self.edge_properties.update(edge_data.keys())
            return s, o, key, edge_data
        return None
Beispiel #7
0
    def load_edge(self, edge_record: List) -> Tuple:
        """
        Turn an edge record into a ``(subject, object, key, data)`` tuple.

        Provenance and, if missing, a generated ``id`` are written onto the
        edge dictionary contained in ``edge_record`` itself, before
        validation runs.

        Parameters
        ----------
        edge_record: List
            An edge record; index 0 is read as the subject node, index 1 as
            the edge data and index 2 as the object node

        Returns
        -------
        Tuple
            A tuple with subject ID, object ID, edge key, and edge data, or an
            empty tuple when ``validate_edge`` returns a falsy result

        """
        subject_node, edge_data, object_node = (
            edge_record[0],
            edge_record[1],
            edge_record[2],
        )

        self.set_edge_provenance(edge_data)

        if "id" not in edge_data:
            edge_data["id"] = generate_uuid()
        key = generate_edge_key(
            subject_node["id"], edge_data["predicate"], object_node["id"]
        )

        validated = self.validate_edge(edge_data)
        if not validated:
            return ()

        record = sanitize_import(validated.copy())
        self.edge_properties.update(record.keys())
        return subject_node["id"], object_node["id"], key, record
Beispiel #8
0
    def load_node(self, node_data: Dict) -> Optional[Tuple]:
        """
        Turn a raw node dictionary into an ``(id, data)`` pair.

        The node counter and the set of seen node ids are updated, and
        provenance is applied to ``node_data``, before validation; these side
        effects therefore happen even for nodes that are later rejected.

        Parameters
        ----------
        node_data: Dict
            A node; must contain an ``id`` key

        Returns
        -------
        Tuple
            A tuple with node ID and node data, or ``None`` when
            ``validate_node`` returns a falsy result

        """
        self.node_count += 1
        # TODO: remove the seen_nodes
        self.seen_nodes.add(node_data["id"])

        self.set_node_provenance(node_data)

        validated = self.validate_node(node_data)
        if validated:
            record = sanitize_import(validated.copy())
            self.node_properties.update(record.keys())
            return record["id"], record
        return None
Beispiel #9
0
    def load_edge(self, edge_record: List) -> Tuple:
        """
        Turn an edge record into a ``(subject, object, key, data)`` tuple.

        Graph-level ``provided_by`` and, if missing, a generated ``id`` are
        written onto the edge dictionary contained in ``edge_record`` itself.

        Parameters
        ----------
        edge_record: List
            An edge record; index 0 is read as the subject node, index 1 as
            the edge data and index 2 as the object node

        Returns
        -------
        Tuple
            A tuple with subject ID, object ID, edge key, and edge data

        """
        subject_node, edge, object_node = (
            edge_record[0],
            edge_record[1],
            edge_record[2],
        )

        # Fall back to the graph-level provenance when the edge has none.
        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge:
            edge['provided_by'] = self.graph_metadata['provided_by']
        if 'id' not in edge:
            edge['id'] = generate_uuid()

        key = generate_edge_key(subject_node['id'], edge['predicate'], object_node['id'])

        record = sanitize_import(validate_edge(edge).copy())
        self.edge_properties.update(record.keys())
        return subject_node['id'], object_node['id'], key, record
Beispiel #10
0
    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Load an edge into an instance of BaseGraph.

        The edge is validated, a copy is sanitized, an ``id`` is generated
        when absent, graph-level ``provided_by`` is applied when the edge has
        none, and the edge is checked against the edge filters.

        Parameters
        ----------
        edge: Dict
            An edge

        Returns
        -------
        Optional[Tuple]
            A tuple that contains subject id, object id, edge key, and edge
            data; ``None`` when the edge does not pass ``check_edge_filter``

        """
        edge = validate_edge(edge)
        edge_data = sanitize_import(edge.copy())
        if 'id' not in edge_data:
            edge_data['id'] = generate_uuid()
        s = edge_data['subject']
        o = edge_data['object']
        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
            edge_data['provided_by'] = self.graph_metadata['provided_by']
        key = generate_edge_key(s, edge_data['predicate'], o)
        self.edge_properties.update(list(edge_data.keys()))
        if self.check_edge_filter(edge_data):
            # Edge keys belong in edge_properties; they were previously
            # written into node_properties, polluting the node property set.
            self.edge_properties.update(edge_data.keys())
            return s, o, key, edge_data
        return None
Beispiel #11
0
def test_sanitize_import1(query):
    """
    Check that sanitize_import produces the expected key/value pairs.

    ``query[0]`` is the input record and ``query[1]`` maps each expected key
    to its expected value; ``'|'`` is used as the list delimiter.
    """
    sanitized = sanitize_import(query[0], list_delimiter='|')
    for key, expected in query[1].items():
        assert key in sanitized
        assert sanitized[key] == expected
Beispiel #12
0
    def read_edges(self) -> Generator:
        """
        Read edges as records from the graph.

        Each edge is validated, a copy is sanitized, graph-level
        ``provided_by`` is applied when the edge has none, and only edges
        that pass ``check_edge_filter`` are yielded.

        Returns
        -------
        Generator
            A generator of ``(subject, object, key, edge data)`` tuples

        """
        for u, v, k, data in self.graph.edges(keys=True, data=True):
            edge_data = validate_edge(data)
            edge_data = sanitize_import(edge_data.copy())
            if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
                edge_data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_edge_filter(edge_data):
                # Record edge keys as edge properties; they were previously
                # added to node_properties, leaving edge_properties untouched
                # by this method.
                self.edge_properties.update(edge_data.keys())
                yield u, v, k, edge_data
Beispiel #13
0
    def read_nodes(self) -> Generator:
        """
        Iterate over the nodes of ``self.graph`` and yield them as records.

        Only nodes that pass ``check_node_filter`` are yielded.

        Returns
        -------
        Generator
            A generator of ``(node id, node data)`` tuples

        """
        metadata = self.graph_metadata
        for node_id, attributes in self.graph.nodes(data=True):
            # The identifier is written onto the attribute mapping handed out
            # by the graph itself (not onto a copy).
            if 'id' not in attributes:
                attributes['id'] = node_id

            record = sanitize_import(validate_node(attributes).copy())

            # Fall back to the graph-level provenance when the node has none.
            if 'provided_by' in metadata and 'provided_by' not in record:
                record['provided_by'] = metadata['provided_by']

            if not self.check_node_filter(record):
                continue
            self.node_properties.update(record.keys())
            yield node_id, record
Beispiel #14
0
    def read_edges(self) -> Generator:
        """
        Read edges as records from the graph.

        Edges for which ``validate_edge`` returns a falsy value are skipped.
        The remaining edges are sanitized (on a copy), given provenance, and
        yielded only when they pass ``check_edge_filter``.

        Returns
        -------
        Generator
            A generator of ``(subject, object, key, edge data)`` tuples

        """
        for u, v, k, data in self.graph.edges(keys=True, data=True):

            edge_data = self.validate_edge(data)
            if not edge_data:
                continue

            edge_data = sanitize_import(edge_data.copy())

            self.set_edge_provenance(edge_data)

            if self.check_edge_filter(edge_data):
                # Record edge keys as edge properties; they were previously
                # added to node_properties, leaving edge_properties untouched
                # by this method.
                self.edge_properties.update(edge_data.keys())
                yield u, v, k, edge_data
Beispiel #15
0
    def load_node(self, node: Dict) -> Tuple:
        """
        Turn a raw node dictionary into an ``(id, data)`` pair.

        The node counter and the set of seen node ids are updated first.
        Graph-level ``provided_by`` is written onto the passed-in ``node``
        when it has none; validation and sanitization follow.

        Parameters
        ----------
        node: Dict
            A node; must contain an ``id`` key

        Returns
        -------
        Tuple
            A tuple with node ID and node data

        """
        self.node_count += 1
        # TODO: remove the seen_nodes
        self.seen_nodes.add(node['id'])

        # Fall back to the graph-level provenance when the node has none.
        if 'provided_by' in self.graph_metadata and 'provided_by' not in node:
            node['provided_by'] = self.graph_metadata['provided_by']

        record = sanitize_import(validate_node(node).copy())
        self.node_properties.update(record.keys())
        return record['id'], record
Beispiel #16
0
    def load_edge(self, edge: Dict) -> Generator:
        """
        Convert one mapping row into node and edge records.

        The row's ``predicate_id`` is resolved through ``process_predicate``
        and then removed from ``edge`` (the caller's dictionary is mutated).
        Columns listed in ``SSSOM_NODE_PROPERTY_MAPPING`` are routed to the
        subject or object node; every other column becomes an edge property.

        Parameters
        ----------
        edge : Dict
            An edge; must contain ``predicate_id``, ``subject_id`` and
            ``object_id``

        Returns
        -------
        Generator
            A generator that yields the result of ``load_node`` for the
            subject, then for the object, then (when the sanitized edge has
            both ``subject`` and ``object``) the edge 4-tuple

        """
        (element_uri, canonical_uri, predicate,
         property_name) = process_predicate(self.prefix_manager,
                                            edge['predicate_id'],
                                            self.predicate_mapping)
        # First truthy candidate wins: element URI, then predicate, then the
        # bare property name.
        edge_predicate = element_uri or predicate or property_name
        if canonical_uri:
            # NOTE(review): element_uri (not canonical_uri) is assigned here;
            # confirm this is intended.
            edge_predicate = element_uri

        data = {
            'subject': edge['subject_id'],
            'predicate': edge_predicate,
            'object': edge['object_id'],
        }
        del edge['predicate_id']
        data = validate_edge(data)

        subject_node = {}
        object_node = {}
        for column, value in edge.items():
            if column not in SSSOM_NODE_PROPERTY_MAPPING:
                data[column] = value
                continue

            # Pick the node the mapped property belongs to.
            if column.startswith('subject'):
                target = subject_node
            elif column.startswith('object'):
                target = object_node
            else:
                log.info(f"Ignoring {column} {value}")
                continue

            mapped = SSSOM_NODE_PROPERTY_MAPPING[column]
            # Category values that are not CURIEs are replaced by a default.
            if mapped == 'category' and not PrefixManager.is_curie(value):
                value = "biolink:OntologyClass"
            target[mapped] = value

        records = [self.load_node(subject_node), self.load_node(object_node)]

        # Graph-level metadata (except the CURIE map) is copied onto the edge.
        for meta_key, meta_value in self.graph_metadata.items():
            if meta_key != 'curie_map':
                data[meta_key] = meta_value

        edge_data = sanitize_import(data.copy())
        if 'subject' in edge_data and 'object' in edge_data:
            if 'id' not in edge_data:
                edge_data['id'] = generate_uuid()
            subject_id = edge_data['subject']
            object_id = edge_data['object']
            if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
                edge_data['provided_by'] = self.graph_metadata['provided_by']
            key = generate_edge_key(subject_id, edge_data['predicate'], object_id)
            self.edge_properties.update(list(edge_data.keys()))
            records.append((subject_id, object_id, key, edge_data))
        else:
            log.info(
                "Ignoring edge with either a missing 'subject' or 'object': {}"
                .format(edge_data))

        yield from records
Beispiel #17
0
    def parse(
        self,
        filename: str,
        format: str = 'nt',
        compression: Optional[str] = None,
        provided_by: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from RDF N-Triples and yields records.

        Whatever the N-Triples parser yields while reading the file is passed
        through first. Once the file is exhausted, the remaining reified
        nodes are dereified and the node and edge caches are flushed: each
        cached entry is validated, sanitized, given graph-level
        ``provided_by`` when it has none, filtered and yielded.

        .. note::
            To ensure proper parsing of N-Triples and a relatively low memory footprint,
            it is recommended that the N-Triples be sorted based on the subject IRIs.

            ```sort -k 1,2 -t ' ' data.nt > data_sorted.nt```

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``nt``); not read by this method
        compression: Optional[str]
            The compression type (``gz``); any other value opens the file
            uncompressed
        provided_by: Optional[str]
            The name of the source providing the input file
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records

        """
        p = CustomNTriplesParser(self)
        if provided_by:
            self.graph_metadata['provided_by'] = [provided_by]

        # Open the input through a context manager so the file handle is
        # closed once parsing finishes, fails, or the generator is closed
        # early (previously the handle was never closed explicitly).
        opener = gzip.open if compression == 'gz' else open
        with opener(filename, 'rb') as stream:
            yield from p.parse(stream)
        log.info(f"Done parsing {filename}")

        for n in self.reified_nodes:
            data = self.node_cache.pop(n)
            self.dereify(n, data)

        for k in self.node_cache.keys():
            data = self.node_cache[k]
            # Every cached node is given the 'biolink:NamedThing' category.
            if 'category' in data:
                if 'biolink:NamedThing' not in set(data['category']):
                    data['category'].append('biolink:NamedThing')
            else:
                data['category'] = ["biolink:NamedThing"]
            data = validate_node(data)
            data = sanitize_import(data)
            if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
                data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_node_filter(data):
                self.node_properties.update(data.keys())
                yield k, data
        self.node_cache.clear()

        for k in self.edge_cache.keys():
            data = self.edge_cache[k]
            data = validate_edge(data)
            data = sanitize_import(data)
            if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
                data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_edge_filter(data):
                self.edge_properties.update(data.keys())
                # Cache keys are indexed as (k[0], k[1], k[2]) to build the
                # edge record.
                yield k[0], k[1], k[2], data
        self.edge_cache.clear()
Beispiel #18
0
    def load_graph(self, rdfgraph: rdflib.Graph, **kwargs: Any) -> None:
        """
        Walk through the rdflib.Graph and load all triples into kgx.graph.base_graph.BaseGraph

        The graph is traversed in several passes: ``rdfs:subClassOf`` triples
        (including restrictions expressed through blank nodes),
        ``owl:equivalentClass`` triples between non-blank nodes, triples whose
        subject is typed as ``owl:ObjectProperty``, and finally every
        remaining triple. Afterwards reified nodes are dereified and the node
        and edge caches are flushed.

        NOTE: although annotated ``-> None``, this function contains ``yield``
        statements and is therefore a generator; nothing runs until it is
        iterated.

        Parameters
        ----------
        rdfgraph: rdflib.Graph
            Graph containing nodes and edges
        kwargs: Any
            Any additional arguments

        Yields
        ------
        Whatever ``self.triple`` yields for each processed triple, followed by
        ``(key, node data)`` tuples from the node cache and
        ``(k[0], k[1], k[2], edge data)`` tuples from the edge cache.

        """
        # Predicates (and object-property subjects) handled by a dedicated
        # pass; the final catch-all pass skips triples whose predicate is in
        # this set.
        seen = set()
        seen.add(RDFS.subClassOf)
        for s, p, o in rdfgraph.triples((None, RDFS.subClassOf, None)):
            # ignoring blank nodes
            if isinstance(s, rdflib.term.BNode):
                continue
            pred = None
            parent = None
            os_interpretation = None
            if isinstance(o, rdflib.term.BNode):
                # C SubClassOf R some D
                # If several values exist, the last one iterated wins.
                for x in rdfgraph.objects(o, OWL.onProperty):
                    pred = x
                # owl:someValuesFrom
                for x in rdfgraph.objects(o, OWL.someValuesFrom):
                    os_interpretation = self.OWLSTAR.term(
                        "AllSomeInterpretation")
                    parent = x
                # owl:allValuesFrom
                # Runs after someValuesFrom, so it overrides parent and
                # interpretation when both are present.
                for x in rdfgraph.objects(o, OWL.allValuesFrom):
                    os_interpretation = self.OWLSTAR.term(
                        "AllOnlyInterpretation")
                    parent = x
                if pred is None or parent is None:
                    # NOTE(review): the message labels parent as
                    # OWL.someValuesFrom even though it may come from
                    # OWL.allValuesFrom.
                    log.warning(
                        f"{s} {p} {o} has OWL.onProperty {pred} and OWL.someValuesFrom {parent}"
                    )
                    log.warning(
                        "Do not know how to handle BNode: {}".format(o))
                    continue
            else:
                # C rdfs:subClassOf D (where C and D are named classes)
                pred = p
                parent = o
            if os_interpretation:
                # reify edges that have logical interpretation
                eid = generate_uuid()
                self.reified_nodes.add(eid)
                yield from self.triple(URIRef(eid),
                                       self.BIOLINK.term("category"),
                                       self.BIOLINK.Association)
                yield from self.triple(URIRef(eid),
                                       self.BIOLINK.term("subject"), s)
                yield from self.triple(URIRef(eid),
                                       self.BIOLINK.term("predicate"), pred)
                yield from self.triple(URIRef(eid),
                                       self.BIOLINK.term("object"), parent)
                yield from self.triple(
                    URIRef(eid),
                    self.BIOLINK.term("logical_interpretation"),
                    os_interpretation,
                )
            else:
                yield from self.triple(s, pred, parent)

        seen.add(OWL.equivalentClass)
        for s, p, o in rdfgraph.triples((None, OWL.equivalentClass, None)):
            # A owl:equivalentClass B (where A and B are named classes)
            if not isinstance(o, rdflib.term.BNode):
                yield from self.triple(s, p, o)

        # Triples whose subject is declared an owl:ObjectProperty; the
        # property itself is added to ``seen`` so the catch-all pass skips
        # triples that use it as a predicate.
        for relation in rdfgraph.subjects(RDF.type, OWL.ObjectProperty):
            seen.add(relation)
            for s, p, o in rdfgraph.triples((relation, None, None)):
                if not isinstance(o, rdflib.term.BNode):
                    if p not in self.excluded_predicates:
                        yield from self.triple(s, p, o)

        # Catch-all pass over every triple not covered above.
        for s, p, o in rdfgraph.triples((None, None, None)):
            if isinstance(s, rdflib.term.BNode) or isinstance(
                    o, rdflib.term.BNode):
                continue
            if p in seen:
                continue
            if p in self.excluded_predicates:
                continue
            yield from self.triple(s, p, o)

        # Remove each reified node from the node cache and hand it to
        # dereify. ``pop`` raises KeyError if the node is not cached.
        for n in self.reified_nodes:
            data = self.node_cache.pop(n)
            self.dereify(n, data)

        # Flush the node cache: validate, sanitize, add provenance, filter.
        for k, data in self.node_cache.items():

            node_data = self.validate_node(data)
            if not node_data:
                continue

            node_data = sanitize_import(node_data)
            self.set_node_provenance(node_data)
            if self.check_node_filter(node_data):
                self.node_properties.update(node_data.keys())
                yield k, node_data
        self.node_cache.clear()

        # Flush the edge cache the same way; cache keys are indexed as
        # (k[0], k[1], k[2]) to build the edge record.
        for k, data in self.edge_cache.items():

            edge_data = self.validate_edge(data)
            if not edge_data:
                continue

            edge_data = sanitize_import(edge_data)
            self.set_edge_provenance(edge_data)
            if self.check_edge_filter(edge_data):
                self.edge_properties.update(edge_data.keys())
                yield k[0], k[1], k[2], edge_data
        self.edge_cache.clear()
Beispiel #19
0
    def load_edge(self, edge: Dict) -> Generator:
        """
        Convert one mapping row into node and edge records.

        The row's ``predicate_id`` is resolved through ``process_predicate``
        and then removed from ``edge`` (the caller's dictionary is mutated).
        Columns listed in ``SSSOM_NODE_PROPERTY_MAPPING`` are routed to the
        subject or object node; every other column becomes an edge property.
        Nothing is yielded when the edge fails validation or when either
        node fails to load.

        Parameters
        ----------
        edge : Dict
            An edge; must contain ``predicate_id``, ``subject_id`` and
            ``object_id``

        Returns
        -------
        Generator
            A generator that yields the subject node record, the object node
            record and (when the sanitized edge has both ``subject`` and
            ``object``) the edge 4-tuple

        """
        (element_uri, canonical_uri, predicate, property_name) = process_predicate(
            self.prefix_manager, edge["predicate_id"], self.predicate_mapping
        )
        # First truthy candidate wins: element URI, then predicate, then the
        # bare property name.
        edge_predicate = element_uri or predicate or property_name
        if canonical_uri:
            # NOTE(review): element_uri (not canonical_uri) is assigned here;
            # confirm this is intended.
            edge_predicate = element_uri

        data = {
            "subject": edge["subject_id"],
            "predicate": edge_predicate,
            "object": edge["object_id"],
        }
        del edge["predicate_id"]

        data = self.validate_edge(data)
        if not data:
            return  # ?

        subject_props = {}
        object_props = {}
        for column, value in edge.items():
            if column not in SSSOM_NODE_PROPERTY_MAPPING:
                data[column] = value
                continue

            # Pick the node the mapped property belongs to.
            if column.startswith("subject"):
                target = subject_props
            elif column.startswith("object"):
                target = object_props
            else:
                log.info(f"Ignoring {column} {value}")
                continue

            mapped = SSSOM_NODE_PROPERTY_MAPPING[column]
            # Category values that are not CURIEs are replaced by a default.
            if mapped == "category" and not PrefixManager.is_curie(value):
                value = "biolink:OntologyClass"
            target[mapped] = value

        # Both nodes are loaded before either result is inspected.
        subject_node = self.load_node(subject_props)
        object_node = self.load_node(object_props)
        if not subject_node or not object_node:
            return  # ?
        records = [subject_node, object_node]

        # Graph-level metadata (except the CURIE map) is copied onto the edge.
        for meta_key, meta_value in self.graph_metadata.items():
            if meta_key != "curie_map":
                data[meta_key] = meta_value

        edge_data = sanitize_import(data.copy())
        if "subject" in edge_data and "object" in edge_data:
            if "id" not in edge_data:
                edge_data["id"] = generate_uuid()
            subject_id = edge_data["subject"]
            object_id = edge_data["object"]

            self.set_edge_provenance(edge_data)

            key = generate_edge_key(subject_id, edge_data["predicate"], object_id)
            self.edge_properties.update(list(edge_data.keys()))
            records.append((subject_id, object_id, key, edge_data))
        else:
            self.owner.log_error(
                entity=str(edge_data),
                error_type=ErrorType.MISSING_NODE,
                message="Ignoring edge with either a missing 'subject' or 'object'",
                message_level=MessageLevel.WARNING
            )

        yield from records
Beispiel #20
0
    def triple(self, s: URIRef, p: URIRef, o: URIRef) -> None:
        """
        Parse a triple.

        The triple is either skipped, recorded as an attribute of the subject
        node (marking the subject as reified where applicable), or recorded
        as an edge. When the edge cache has reached ``self.CACHE_SIZE``
        entries, reified nodes are dereified and the edge cache is flushed.

        NOTE: although annotated ``-> None``, this function contains ``yield``
        statements and is therefore a generator. It yields one
        ``(k[0], k[1], k[2], edge data)`` tuple per flushed edge that passes
        the edge filter, and always ends by yielding ``None``.

        Parameters
        ----------
        s: URIRef
            Subject
        p: URIRef
            Predicate
        o: URIRef
            Object

        """
        self.count += 1
        (element_uri, canonical_uri, predicate,
         property_name) = self.process_predicate(p)
        if element_uri:
            prop_uri = element_uri
        elif predicate:
            prop_uri = predicate
        else:
            prop_uri = property_name

        s_curie = self.prefix_manager.contract(s)
        if s_curie.startswith("biolink") or s_curie.startswith("OBAN"):
            log.warning(f"Skipping {s} {p} {o}")
        elif s_curie in self.reified_nodes:
            # subject is a reified node
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif p in self.reification_predicates:
            # subject is a reified node
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif property_name in {"subject", "predicate", "object", "relation"}:
            # subject is a reified node
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif o in self.reification_types:
            # subject is a reified node
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif element_uri and element_uri in self.node_property_predicates:
            # treating predicate as a node property
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (p in self.node_property_predicates
              or predicate in self.node_property_predicates
              or property_name in self.node_property_predicates):
            # treating predicate as a node property
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif isinstance(o, rdflib.term.Literal):
            self.add_node_attribute(s, key=prop_uri, value=o)
        else:
            # treating predicate as an edge
            self.add_edge(s, o, p)

        if len(self.edge_cache) >= self.CACHE_SIZE:
            # Try to dereify every reified node; the ones that raise
            # ValueError are parked and restored afterwards so they can be
            # retried on a later flush.
            while self.reified_nodes:
                n = self.reified_nodes.pop()
                data = self.node_cache.pop(n)
                try:
                    self.dereify(n, data)
                except ValueError as e:
                    self.owner.log_error(
                        entity=str(data),
                        error_type=ErrorType.INVALID_EDGE_PROPERTY,
                        message=str(e),
                        message_level=MessageLevel.WARNING)
                    self._incomplete_nodes[n] = data

            for n, incomplete in self._incomplete_nodes.items():
                self.node_cache[n] = incomplete
                self.reified_nodes.add(n)
            self._incomplete_nodes.clear()

            for k, cached in self.edge_cache.items():
                # Edges without any identifier get their edge key as id.
                if "id" not in cached and "association_id" not in cached:
                    cached["id"] = generate_edge_key(
                        cached["subject"],
                        cached["predicate"],
                        cached["object"],
                    )
                data = self.validate_edge(cached)
                if not data:
                    # A falsy validation result (presumably a rejected edge)
                    # must not reach sanitize_import; skip the edge instead.
                    continue
                data = sanitize_import(data)

                self.set_edge_provenance(data)

                if self.check_edge_filter(data):
                    self.edge_properties.update(data.keys())
                    yield k[0], k[1], k[2], data
            self.edge_cache.clear()
        yield None
Beispiel #21
0
    def triple(self, s: URIRef, p: URIRef, o: URIRef) -> None:
        """
        Parse a triple.

        The triple is either skipped, recorded as an attribute of the subject
        node (marking the subject as reified where applicable), or recorded
        as an edge. When the edge cache has reached ``self.CACHE_SIZE``
        entries, reified nodes are dereified and the edge cache is flushed.

        NOTE: although annotated ``-> None``, this function contains ``yield``
        statements and is therefore a generator. It yields one
        ``(k[0], k[1], k[2], edge data)`` tuple per flushed edge that passes
        the edge filter, and always ends by yielding ``None``.

        Parameters
        ----------
        s: URIRef
            Subject
        p: URIRef
            Predicate
        o: URIRef
            Object

        """
        self.count += 1
        (element_uri, canonical_uri, predicate, property_name) = self.process_predicate(p)
        # First truthy candidate wins.
        prop_uri = element_uri or predicate or property_name

        node_predicates = self.node_property_predicates
        s_curie = self.prefix_manager.contract(s)
        if s_curie.startswith(('biolink', 'OBAN')):
            log.warning(f"Skipping {s} {p} {o}")
        elif s_curie in self.reified_nodes:
            # subject is already known to be a reified node
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (
            p in self.reification_predicates
            or property_name in {'subject', 'predicate', 'object', 'relation'}
            or o in self.reification_types
        ):
            # subject turns out to be a reified node: remember it
            self.reified_nodes.add(s_curie)
            self.add_node_attribute(s, key=prop_uri, value=o)
        elif (
            (element_uri and element_uri in node_predicates)
            or p in node_predicates
            or predicate in node_predicates
            or property_name in node_predicates
            or isinstance(o, rdflib.term.Literal)
        ):
            # predicate is a node property, or the object is a literal
            self.add_node_attribute(s, key=prop_uri, value=o)
        else:
            # treating predicate as an edge
            self.add_edge(s, o, p)

        if len(self.edge_cache) >= self.CACHE_SIZE:
            # Try to dereify every reified node; the ones that raise
            # ValueError are parked and restored afterwards.
            while self.reified_nodes:
                node_key = self.reified_nodes.pop()
                node_data = self.node_cache.pop(node_key)
                try:
                    self.dereify(node_key, node_data)
                except ValueError as e:
                    log.info(e)
                    self._incomplete_nodes[node_key] = node_data

            for node_key, node_data in self._incomplete_nodes.items():
                self.node_cache[node_key] = node_data
                self.reified_nodes.add(node_key)
            self._incomplete_nodes.clear()

            metadata = self.graph_metadata
            for k, cached in self.edge_cache.items():
                # Edges without any identifier get their edge key as id.
                if 'id' not in cached and 'association_id' not in cached:
                    cached['id'] = generate_edge_key(
                        cached['subject'],
                        cached['predicate'],
                        cached['object'],
                    )
                record = sanitize_import(validate_edge(cached))
                if 'provided_by' in metadata and 'provided_by' not in record:
                    record['provided_by'] = metadata['provided_by']
                if self.check_edge_filter(record):
                    self.edge_properties.update(record.keys())
                    yield k[0], k[1], k[2], record
            self.edge_cache.clear()
        yield None
Beispiel #22
0
    def parse(
        self,
        filename: str,
        format: str = "nt",
        compression: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from RDF N-Triples and yields records.

        Whatever the N-Triples parser yields while reading the file is passed
        through first. Once the file is exhausted, the remaining reified
        nodes are dereified and the node and edge caches are flushed: each
        cached entry is validated (falsy results are skipped), sanitized,
        given provenance, filtered and yielded.

        .. note::
            To ensure proper parsing of N-Triples and a relatively low memory footprint,
            it is recommended that the N-Triples be sorted based on the subject IRIs.

            ```sort -k 1,2 -t ' ' data.nt > data_sorted.nt```

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``nt``); not read by this method
        compression: Optional[str]
            The compression type (``gz``); any other value opens the file
            uncompressed
        kwargs: Any
            Any additional arguments; passed to ``set_provenance_map``

        Returns
        -------
        Generator
            A generator for records

        """
        p = CustomNTriplesParser(self)

        self.set_provenance_map(kwargs)

        # Open the input through a context manager so the file handle is
        # closed once parsing finishes, fails, or the generator is closed
        # early (previously the handle was never closed explicitly).
        opener = gzip.open if compression == "gz" else open
        with opener(filename, "rb") as stream:
            yield from p.parse(stream)
        log.info(f"Done parsing {filename}")

        for n in self.reified_nodes:
            data = self.node_cache.pop(n)
            self.dereify(n, data)

        for k in self.node_cache.keys():
            node_data = self.node_cache[k]
            # Every cached node is given the NAMED_THING category.
            if "category" in node_data:
                if NAMED_THING not in set(node_data["category"]):
                    node_data["category"].append(NAMED_THING)
            else:
                node_data["category"] = [NAMED_THING]

            node_data = self.validate_node(node_data)
            if not node_data:
                continue

            node_data = sanitize_import(node_data)

            self.set_node_provenance(node_data)

            if self.check_node_filter(node_data):
                self.node_properties.update(node_data.keys())
                yield k, node_data

        self.node_cache.clear()

        for k in self.edge_cache.keys():
            edge_data = self.edge_cache[k]

            edge_data = self.validate_edge(edge_data)
            if not edge_data:
                continue

            edge_data = sanitize_import(edge_data)

            self.set_edge_provenance(edge_data)

            if self.check_edge_filter(edge_data):
                self.edge_properties.update(edge_data.keys())
                # Cache keys are indexed as (k[0], k[1], k[2]) to build the
                # edge record.
                yield k[0], k[1], k[2], edge_data

        self.edge_cache.clear()