def curiefy_result(self, result):
    """
    Convert subject, predicate and object IRIs to their respective CURIEs,
    where applicable.

    Subject and object are only converted when their ``type`` is ``'uri'``;
    a warning is logged when no CURIE could be made for them. The predicate
    is always run through ``make_curie`` and its ``type`` is set to
    ``'curie'`` on success or ``'uri'`` otherwise.

    Parameters
    ----------
    result: dict
        A single result binding holding ``'subject'``, ``'predicate'`` and
        ``'object'`` entries, each a dict with ``'type'`` and ``'value'`` keys

    Returns
    -------
    dict
        The same ``result`` dictionary, updated in place

    """
    # Subject and object follow the same rule. The object branch used to test
    # for type 'curie', which meant object IRIs were never converted.
    for position in ('subject', 'object'):
        term = result[position]
        if term['type'] != 'uri':
            continue
        curie = make_curie(term['value'])
        if curie != term['value']:
            term['value'] = curie
            term['type'] = 'curie'
        else:
            logging.warning("Could not CURIEfy {}".format(term['value']))

    predicate = result['predicate']
    predicate_curie = make_curie(predicate['value'])
    if predicate_curie != predicate['value']:
        predicate['value'] = predicate_curie
        predicate['type'] = 'curie'
    else:
        predicate['type'] = 'uri'
    return result
def add_edge_attribute(self, subject_iri: Union[URIRef, str], object_iri: URIRef, predicate_iri: URIRef, key: str, value: str) -> None:
    """
    Adds an attribute to an edge, while taking into account whether the attribute
    should be multi-valued. Multi-valued properties will not contain duplicates.

    The key may be a rdflib.URIRef or a URI string that maps onto a property name
    as defined in `rdf_utils.property_mapping`. Keys that map to no known
    property are ignored.

    If the edge does not exist yet, it (and its nodes) is created via
    `add_edge` using subject_iri, object_iri and predicate_iri.

    Parameters
    ----------
    subject_iri: [rdflib.URIRef, str]
        The IRI of the subject node of an edge in rdflib.Graph
    object_iri: rdflib.URIRef
        The IRI of the object node of an edge in rdflib.Graph
    predicate_iri: rdflib.URIRef
        The IRI of the predicate representing an edge in rdflib.Graph
    key: str
        The name of the attribute. Can be a rdflib.URIRef or URI string
    value: str
        The value of the attribute

    """
    if key.lower() in is_property_multivalued:
        key = key.lower()
    else:
        if not isinstance(key, URIRef):
            key = URIRef(key)
        key = property_mapping.get(key)

    if key is None:
        # Not a recognised property; nothing to record.
        return

    subject_curie = make_curie(subject_iri)
    object_curie = make_curie(object_iri)
    edge_label = process_iri(predicate_iri)
    if is_curie(edge_label):
        edge_label = curie_lookup(edge_label)
    edge_key = generate_edge_key(subject_curie, edge_label, object_curie)
    attr_dict = self.graph.get_edge_data(subject_curie, object_curie, key=edge_key)

    if attr_dict is None:
        # get_edge_data returns None for a missing edge. Create the edge as
        # documented and look it up again using the identifiers and label
        # that add_edge actually used.
        subject_curie, object_curie, edge_label = self.add_edge(subject_iri, object_iri, predicate_iri)
        edge_key = generate_edge_key(subject_curie, edge_label, object_curie)
        attr_dict = self.graph.get_edge_data(subject_curie, object_curie, key=edge_key)

    if attr_dict is None:
        # Still not found (e.g. the edge is stored under a different key);
        # skip instead of failing inside _add_attribute.
        logging.warning("Could not find or create edge {} -- {} --> {}; discarding attribute {}".format(subject_curie, edge_label, object_curie, key))
        return

    self._add_attribute(attr_dict, key, value)
def add_node(self, iri: URIRef) -> str:
    """
    Register a node in the networkx.MultiDiGraph under its CURIE.

    All derived classes should add nodes through this method so that the
    node identifier is always a CURIE and the node carries its original
    IRI in the 'iri' property. When the graph metadata has a 'provided_by'
    entry, it is copied onto the node as well. A node that is already in
    the graph is left untouched.

    Parameters
    ----------
    iri : rdflib.URIRef
        IRI of a node

    Returns
    -------
    str
        The CURIE identifier of a node

    """
    node_attributes = dict(iri=str(iri))
    metadata = self.graph_metadata
    if 'provided_by' in metadata:
        node_attributes['provided_by'] = metadata['provided_by']

    curie = make_curie(iri)
    if curie in self.graph:
        # Already present: keep the existing node and its attributes.
        return curie

    self.graph.add_node(curie, **node_attributes)
    return curie
def add_edge(self, subject_iri: URIRef, object_iri: URIRef, predicate_iri: URIRef) -> Tuple[str, str, str]:
    """
    This method should be used by all derived classes when adding an edge to the networkx.MultiDiGraph.
    This ensures that the `subject` and `object` identifiers are CURIEs, and that `edge_label` is in the correct form.

    The edge label is derived from the predicate IRI as follows:
    spaces are replaced with underscores, a leading ``self.BIOLINK`` prefix
    is removed, and a label that is still a CURIE is replaced by its looked
    up name (or ``self.DEFAULT_EDGE_LABEL`` when there is no mapping).

    Returns the CURIE identifiers used for the `subject` and `object` in the
    networkx.MultiDiGraph, and the processed `edge_label`.

    Parameters
    ----------
    subject_iri: rdflib.URIRef
        Subject IRI for the subject in a triple
    object_iri: rdflib.URIRef
        Object IRI for the object in a triple
    predicate_iri: rdflib.URIRef
        Predicate IRI for the predicate in a triple

    Returns
    -------
    Tuple[str, str, str]
        A 3-nary tuple (of the form subject, object, predicate) that represents the edge

    """
    s = self.add_node(subject_iri)
    o = self.add_node(object_iri)

    relation = make_curie(predicate_iri)
    edge_label = process_iri(predicate_iri)

    if ' ' in edge_label:
        logging.debug("predicate IRI '{}' yields edge_label '{}' that not in snake_case form; replacing ' ' with '_'".format(predicate_iri, edge_label))
        # Perform the replacement the message announces (it was only logged before).
        edge_label = edge_label.replace(' ', '_')

    if edge_label.startswith(self.BIOLINK):
        logging.debug("predicate IRI '{}' yields edge_label '{}' that starts with '{}'; removing IRI prefix".format(predicate_iri, edge_label, self.BIOLINK))
        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence inside the label.
        edge_label = edge_label[len(self.BIOLINK):]

    if PrefixManager.is_curie(edge_label):
        name = curie_lookup(edge_label)
        if name:
            logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; Using its mapping instead: {}".format(predicate_iri, edge_label, name))
            edge_label = name
        else:
            logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; defaulting back to {}".format(predicate_iri, edge_label, self.DEFAULT_EDGE_LABEL))
            edge_label = self.DEFAULT_EDGE_LABEL

    kwargs = {
        'subject': s,
        'predicate': predicate_iri,
        'object': o,
        'relation': relation,
        'edge_label': edge_label,
    }
    if 'provided_by' in self.graph_metadata:
        kwargs['provided_by'] = self.graph_metadata['provided_by']

    # The edge key is built from subject, label and object, so the same
    # triple is only added once.
    key = generate_edge_key(s, edge_label, o)
    if not self.graph.has_edge(s, o, key=key):
        self.graph.add_edge(s, o, key=key, **kwargs)

    return s, o, edge_label
def _add_attribute(self, attr_dict: Dict, key: str, value: str) -> None: """ Adds an attribute to the attribute dictionary, respecting whether or not that attribute should be multi-valued. Multi-valued attributes will not contain duplicates. Some attributes are singular form of others. In such cases overflowing values will be placed into the correlating multi-valued attribute. For example, 'name' attribute will hold only one value while any additional value will be stored as 'synonym' attribute. Parameters ---------- attr_dict: dict Dictionary representing the attribute set of a node or an edge in a networkx graph key: str The name of the attribute value: str The value of the attribute """ if key is None or key not in is_property_multivalued: logging.warning( "Discarding key {} as it is not a valid property.".format(key)) return value = make_curie(process_iri(value)) if is_property_multivalued[key]: if key not in attr_dict: attr_dict[key] = [value] elif value not in attr_dict[key]: attr_dict[key].append(value) else: if key not in attr_dict: attr_dict[key] = value elif key == 'name': self._add_attribute(attr_dict, 'synonym', value)
def add_edge(self, subject_iri: URIRef, object_iri: URIRef, predicate_iri: URIRef) -> Tuple[str, str, str]:
    """
    This method should be used by all derived classes when adding an edge to the networkx.MultiDiGraph.
    This ensures that the subject and object identifiers are CURIEs, and that edge_label is in the correct form.

    The edge label is derived from the predicate IRI as follows:
    spaces are replaced with underscores, a leading self.BIOLINK prefix is
    removed, and a label that still contains ':' is treated as a CURIE and
    replaced by self.DEFAULT_EDGE_LABEL. The edge label is also used as the
    edge key in the graph.

    Returns the CURIE identifiers used for the subject and object in the
    networkx.MultiDiGraph, and the processed edge_label.

    Parameters
    ----------
    subject_iri: rdflib.URIRef
        Subject IRI for the subject in a triple
    object_iri: rdflib.URIRef
        Object IRI for the object in a triple
    predicate_iri: rdflib.URIRef
        Predicate IRI for the predicate in a triple

    Returns
    -------
    Tuple[str, str, str]
        A 3-nary tuple (of the form subject, object, predicate) that represents the edge

    """
    s = self.add_node(subject_iri)
    o = self.add_node(object_iri)

    relation = make_curie(predicate_iri)
    edge_label = process_iri(predicate_iri)

    if ' ' in edge_label:
        logging.debug(
            "predicate IRI '{}' yields edge_label '{}' that not in snake_case form; replacing ' ' with '_'"
            .format(predicate_iri, edge_label))
        # Perform the replacement the message announces (it was only logged before).
        edge_label = edge_label.replace(' ', '_')

    # TODO: shouldn't this move to the utilities function process_uri()
    if edge_label.startswith(self.BIOLINK):
        logging.debug(
            "predicate IRI '{}' yields edge_label '{}' that starts with '{}'; removing IRI prefix"
            .format(predicate_iri, edge_label, self.BIOLINK))
        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence inside the label.
        edge_label = edge_label[len(self.BIOLINK):]

    # TODO: is there no way to get label of a CURIE?
    # TODO: this should also move to the utilities function
    # Any service? or preload required ontologies by prefix?
    if ':' in edge_label:
        # A single message naming the actual default replaces the earlier
        # duplicate that hard-coded 'related_to'.
        logging.debug(
            "predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; defaulting back to {}"
            .format(predicate_iri, edge_label, self.DEFAULT_EDGE_LABEL))
        edge_label = self.DEFAULT_EDGE_LABEL

    kwargs = {'relation': relation, 'edge_label': edge_label}
    if 'provided_by' in self.graph_metadata:
        kwargs['provided_by'] = self.graph_metadata['provided_by']

    if self.graph.has_edge(s, o, key=edge_label):
        logging.debug("{} -- {} --> {} edge already exists".format(
            s, edge_label, o))
    else:
        self.graph.add_edge(s, o, key=edge_label, **kwargs)

    return s, o, edge_label
"shortened life span", "http://purl.obolibrary.org/obo/CHEBI_23367": "molecular entity", "http://purl.obolibrary.org/obo/CHEBI_23888": "drug", "http://purl.obolibrary.org/obo/CHEBI_51086": "chemical role", "http://purl.obolibrary.org/obo/UPHENO_0001001": "phenotypic feature", "http://purl.obolibrary.org/obo/GO_0008150": "biological_process", "http://purl.obolibrary.org/obo/GO_0005575": "cellular component", "http://purl.obolibrary.org/obo/SO_0000704": "gene", "http://purl.obolibrary.org/obo/SO_0000110": "sequence feature", "http://purl.obolibrary.org/obo/GENO_0000536": "genotype", } mapping = defaultdict(set) for key, value in iri_mapping.items(): mapping[make_curie(key)].add(value) for key, value in toolkit.generator.mappings.items(): mapping[key].update(value) def walk(node, next_node_generator): to_visit = {node: 0} # Dict[URIRef, Integer] visited = {} # Dict[URIRef, Integer] while to_visit != {}: m, score = to_visit.popitem() visited[m] = score for t in next_node_generator(m): if isinstance(t, tuple) and len(t) > 1: n, s = t
def load_edges(self, association='bl:ChemicalToGeneAssociation', limit=None):
    """
    Fetch the triples of an association type from the SPARQL endpoint at
    ``self.url`` and add them to the graph.

    The number of matching triples is obtained first with
    ``self.count_query``. The triples are then fetched with
    ``self.edge_query`` in pages of 1000. For every page the subject and
    object nodes are loaded through ``load_nodes`` and each binding is added
    with ``add_edge``. Only the ``subject``, ``predicate`` and ``object``
    values of a binding are used. ``set_categories`` is called once all
    pages have been processed.

    Parameters
    ----------
    association: str
        The association type that is rendered into the count and edge queries
    limit: int
        Optional cap on the number of triples. It is checked after each page,
        so loading stops after the first page whose end offset exceeds
        ``limit``. ``None`` loads everything.

    """
    sparql = SPARQLWrapper(self.url)
    query = render(self.count_query, {'association': association})
    logging.debug(query)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    count = int(results['results']['bindings'][0]['triples']['value'])
    logging.info("Expected triples for query: {}".format(count))

    step = 1000
    start = 0
    # ``end`` is the exclusive end offset of the page being fetched.
    for end in range(step, count + step, step):
        query = render(self.edge_query, {
            'association': association,
            'offset': start,
            'limit': step
        })
        sparql.setQuery(query)
        results = sparql.query().convert()
        bindings = results['results']['bindings']

        # Load every node referenced in this page before adding its edges.
        node_list = set()
        for r in bindings:
            node_list.add("<{}>".format(r['subject']['value']))
            node_list.add("<{}>".format(r['object']['value']))
        start = end
        self.load_nodes(node_list)

        logging.info("Fetching edges...")
        # The edge-property collection that used to follow add_edge sat
        # behind an unconditional ``continue`` and never ran; it has been
        # removed together with the loop over its always-empty result.
        for r in bindings:
            s = r['subject']['value']
            p = r['predicate']['value']
            o = r['object']['value']
            self.add_edge(s, o, p)

        if limit is not None and end > limit:
            break

    self.set_categories()