def validate_node_properties(self, G):
    """
    Validate the properties and identifier of every node in ``G``.

    For each node property whose key is a slot of the biolink class
    'named thing', the biolink element for that key is looked up and its
    ``multivalued`` flag is compared with the actual value:

    * a multivalued slot holding a non-list value is logged as
      'invalid property type';
    * a single-valued slot holding a list is logged as
      'invalid property type'.

    Independently of its properties, each node identifier is checked once
    against the pattern ``prefix:local`` (no spaces, exactly one colon);
    identifiers that do not match, or that are not strings at all, are
    logged as 'invalid property value'.

    Problems are reported through ``self.log_node_error``; nothing is
    returned.

    :param G: graph whose ``nodes(data=True)`` yields ``(id, attr_dict)`` pairs
    """
    named_thing = bmt.get_class('named thing')
    curie_pattern = re.compile(r'^[^ :]+:[^ :]+$')

    with click.progressbar(G.nodes(data=True)) as bar:
        for n, data in bar:
            for key, value in data.items():
                if key not in named_thing.slots:
                    continue

                # Look the element up once per property instead of once per check.
                element = bmt.get_element(key)
                is_list = isinstance(value, list)

                if element.multivalued and not is_list:
                    self.log_node_error(
                        n,
                        'invalid property type',
                        message='{} type should be {} but its {}'.format(key, list, type(value)),
                    )
                if not element.multivalued and is_list:
                    self.log_node_error(
                        n,
                        'invalid property type',
                        message='{} type should be {} but its {}'.format(key, str, type(value)),
                    )

            # A non-string id cannot be a CURIE; report it instead of letting
            # the regular expression raise TypeError and abort validation.
            if not isinstance(n, str) or not curie_pattern.match(n):
                self.log_node_error(n, 'invalid property value', message='id is not a curie')
def find_superclass(node, graph: MultiDiGraph) -> Optional[str]:
    """
    Attempts to find a superclass for the given node in the given graph.
    Chooses superclasses that are in the biolink model whenever able.

    Candidates are produced by ``walk(node, super_class_generator)`` (defined
    elsewhere), consumed here as ``(candidate, score)`` pairs. The local
    generator offers, for a node ``n``:

    * targets of outgoing 'same_as' edges with score 0,
    * targets of outgoing 'subclass_of' edges with score 1,
    * sources of incoming 'same_as' edges with score 0.

    :param node: identifier of the node to start from
    :param graph: graph holding the 'same_as' / 'subclass_of' edges
    :return: the biolink element name of the first candidate whose ``name``
        attribute resolves through ``bmt.get_element``; otherwise the ``name``
        attribute of the named candidate with the highest score greater
        than 0; otherwise ``None``
    """
    def super_class_generator(n) -> Tuple[str, int]:
        for _, m, data in graph.out_edges(n, data=True):
            edge_label = data.get('edge_label')
            if edge_label == 'same_as':
                yield m, 0
            elif edge_label == 'subclass_of':
                yield m, 1

        for m, _, data in graph.in_edges(n, data=True):
            if data.get('edge_label') == 'same_as':
                yield m, 0

    best_node, best_score = None, 0

    for n, score in walk(node, super_class_generator):
        name = graph.node[n].get('name')
        if name is None:
            # Unnamed candidates can neither match the model nor serve as a
            # fallback, since the fallback returns the candidate's name.
            continue

        element = bmt.get_element(name)
        if element is not None and element.name is not None:
            return element.name

        # Remember the walked candidate itself. The start node is not a
        # candidate and, being a plain identifier, has no `.name` attribute.
        if score > best_score:
            best_node, best_score = n, score

    if best_node is not None:
        return graph.node[best_node].get('name')
    return None
def save_attribute(self, rdfgraph:rdflib.Graph, obj_iri:URIRef, *, key:str, value:Union[List[str], str]) -> None:
    """
    Saves a node or edge attributes from the biolink model in the rdfgraph.
    Intended to be used within `ObanRdfTransformer.save`.

    Nothing is written unless ``bmt.get_element(key)`` returns an element
    whose ``is_a`` is 'association slot' or 'node property'. The predicate is
    ``mapping[key]`` when the key is mapped, otherwise a BIOLINK IRI built from
    the element name with spaces replaced by underscores. A scalar ``value``
    is treated as a one-element list, and one triple is added per value.

    :param rdfgraph: graph that receives the triples
    :param obj_iri: subject of every triple added
    :param key: biolink slot name of the attribute
    :param value: one value, or a list/tuple/set of values
    """
    slot = bmt.get_element(key)
    if slot is None:
        return
    if not (slot.is_a == 'association slot' or slot.is_a == 'node property'):
        return

    if key in mapping:
        predicate = mapping[key]
    else:
        predicate = URIRef('{}{}'.format(BIOLINK, slot.name.replace(' ', '_')))

    if isinstance(value, (list, tuple, set)):
        values = value
    else:
        values = [value]

    for item in values:
        if slot.range == 'iri type':
            # Title-case every word and join them into a BIOLINK class IRI.
            item = URIRef('{}{}'.format(BIOLINK, ''.join(item.title().split(' '))))
        # NOTE(review): IRI-typed values are still wrapped in a Literal here,
        # so they are stored as literals rather than IRI references - confirm
        # this is intended before changing it.
        rdfgraph.add((obj_iri, predicate, rdflib.term.Literal(item)))
def get_term(curie: str, biolink_model_only=False) -> str:
    """
    Return a readable term for ``curie``.

    Resolution order:

    1. 'ncbitaxon:9606' (case-insensitive) is always 'human'.
    2. If ``get_ontology(curie)`` returns ``None`` the curie itself is returned.
    3. The first label among the curie's reflexive ancestors that
       ``bmt.get_element`` recognises is returned.
    4. Otherwise, with ``biolink_model_only`` set, the curie is returned.
    5. Otherwise the shortest string label among the ancestors is returned
       (the earliest one on ties), or the curie when no ancestor has a
       string label.

    :param curie: identifier to translate
    :param biolink_model_only: when true, never return a label that is not a
        biolink model element
    :return: a term name, or ``curie`` when nothing better is available
    """
    if curie.lower() == 'ncbitaxon:9606':
        return 'human'

    ontology = get_ontology(curie)
    if ontology is None:
        return curie

    labels = [ontology.label(c) for c in ontology.ancestors(curie, reflexive=True)]

    for label in labels:
        if label is not None and bmt.get_element(label) is not None:
            return label

    if biolink_model_only:
        return curie

    # Only string labels are usable; without any, fall back to the curie
    # instead of raising IndexError or returning None.
    named = [label for label in labels if isinstance(label, str)]
    if not named:
        return curie
    return min(named, key=len)
def clique_merge(graph: nx.Graph, report=False) -> nx.Graph:
    """
    Builds up cliques using the `same_as` attribute of each node. Uses those
    cliques to build up a mapping for relabelling nodes. Chooses labels so as
    to preserve the original nodes, rather than taking xrefs that don't appear
    as nodes in the graph.

    This method will also expand the `same_as` attribute of the nodes to
    include the discovered clique.

    Steps:

    1. Build an undirected helper graph with an edge for every `same_as`
       node property entry and every edge labelled 'same_as'.
    2. Drop helper edges whose two endpoints carry categories that are
       unrelated in the biolink model (see `_categories_conflict`).
    3. For every connected component, sort its members (plain sort, then by
       `build_sort_key` of the id prefixes of the members' categories) and
       map every member to the first one.
    4. Relabel the graph with that mapping, remove 'same_as' edges, and set
       `iri`, `id` and `same_as` on the resulting nodes.

    Progress and summary lines are printed to stdout.

    :param graph: graph to merge; its node attribute dicts are modified
        (`same_as` is overwritten for clique members)
    :param report: accepted for compatibility; not used by this function
    :return: the graph returned by `relabel_nodes`, after clean-up
    """
    original_size = len(graph)
    print('original graph has {} nodes'.format(original_size))

    clique_graph = nx.Graph()

    with click.progressbar(
            graph.nodes(data=True),
            label='building cliques from same_as node property') as bar:
        for n, attr_dict in bar:
            if 'same_as' in attr_dict:
                for m in attr_dict['same_as']:
                    clique_graph.add_edge(n, m)

    with click.progressbar(graph.edges(data=True),
                           label='building cliques from same_as edges') as bar:
        for u, v, attr_dict in bar:
            if attr_dict.get('edge_label') == 'same_as':
                clique_graph.add_edge(u, v)

    invalid_edges = []
    with click.progressbar(clique_graph.edges(),
                           label='Breaking invalid cliques') as bar:
        for u, v in bar:
            # Clique members taken from `same_as` properties need not be nodes
            # of the input graph; such edges cannot be checked and are kept.
            if not (graph.has_node(u) and graph.has_node(v)):
                continue

            # `category` is passed through listify, as it is when prefixes
            # are collected below, so a scalar category is not iterated
            # character by character.
            u_categories = listify(graph.node[u].get('category', []))
            v_categories = listify(graph.node[v].get('category', []))

            if _categories_conflict(u_categories, v_categories):
                invalid_edges.append((u, v))

    print('breaking {} many edges'.format(len(invalid_edges)))
    clique_graph.remove_edges_from(invalid_edges)

    mapping = {}

    connected_components = list(nx.connected_components(clique_graph))

    print('Discovered {} cliques'.format(len(connected_components)))

    with click.progressbar(connected_components,
                           label='building mapping') as bar:
        for nodes in bar:
            nodes = list(nodes)
            categories = set()
            for n in nodes:
                if not graph.has_node(n):
                    continue

                attr_dict = graph.node[n]

                # Every member references the same list object, which is
                # sorted in place further down.
                attr_dict['same_as'] = nodes

                if 'category' in attr_dict:
                    categories.update(listify(attr_dict['category']))

                if 'categories' in attr_dict:
                    categories.update(listify(attr_dict['categories']))

            list_of_prefixes = []
            for category in categories:
                # Best effort: categories the model cannot resolve simply
                # contribute no prefixes.
                try:
                    list_of_prefixes.append(bmt.get_element(category).id_prefixes)
                except Exception:
                    continue

            # Plain sort first so the keyed (stable) sort breaks ties
            # deterministically.
            nodes.sort()
            nodes.sort(key=build_sort_key(list_of_prefixes))

            leader = nodes[0]
            for n in nodes:
                if n != leader:
                    mapping[n] = leader

    g = relabel_nodes(graph, mapping)

    # Edges without an `edge_label` are kept rather than raising KeyError.
    same_as_edges = [
        (u, v, key)
        for u, v, key, data in g.edges(keys=True, data=True)
        if data.get('edge_label') == 'same_as'
    ]
    g.remove_edges_from(same_as_edges)

    for n, data in g.nodes(data=True):
        data['iri'] = expand_uri(n)
        if 'id' in data:
            data['id'] = n
        same_as = data.get('same_as')
        if same_as is not None and n in same_as:
            # A node should not list itself as its own equivalent.
            same_as.remove(n)
            if not same_as:
                del data['same_as']

    final_size = len(g)
    print('Resulting graph has {} nodes'.format(final_size))
    print('Eliminated {} nodes'.format(original_size - final_size))

    return g


def _categories_conflict(u_categories, v_categories) -> bool:
    """
    Return True when some pair of categories, one from each side, is unrelated
    in the biolink model: neither is among the other's `bmt.ancestors`.
    """
    v_categories = list(v_categories)
    if not v_categories:
        return False

    for a in u_categories:
        # Invariant for the inner loop, so compute it once per `a`.
        a_ancestors = bmt.ancestors(a)
        for b in v_categories:
            if a not in bmt.ancestors(b) and b not in a_ancestors:
                return True
    return False