Esempio n. 1
0
 def validate_node_properties(self, G):
     """
     Validate the properties and identifier of every node in ``G``.

     For each node property that is a slot of the biolink class
     'named thing', the value must be a ``list`` exactly when the slot is
     multivalued. Each node identifier must additionally look like a CURIE
     (``prefix:local``, with no spaces and a single colon).

     Problems are not raised; they are reported through
     ``self.log_node_error``.

     :param G: graph whose ``nodes(data=True)`` yields ``(id, attr_dict)``
         pairs.
     """
     named_thing = bmt.get_class('named thing')
     # Compiled once instead of re-resolving the pattern for every node.
     curie_pattern = re.compile(r'^[^ :]+:[^ :]+$')

     with click.progressbar(G.nodes(data=True)) as bar:
         for n, data in bar:
             for key, value in data.items():
                 if key not in named_thing.slots:
                     continue

                 # Single lookup per property; skip slots the model cannot
                 # resolve rather than failing on ``None.multivalued``.
                 element = bmt.get_element(key)
                 if element is None:
                     continue

                 is_list = isinstance(value, list)
                 if element.multivalued and not is_list:
                     self.log_node_error(n, 'invalid property type', message='{} type should be {} but its {}'.format(key, list, type(value)))
                 elif not element.multivalued and is_list:
                     self.log_node_error(n, 'invalid property type', message='{} type should be {} but its {}'.format(key, str, type(value)))

             # A non-string identifier can never be a CURIE; report it
             # instead of letting the regex raise TypeError.
             if not (isinstance(n, str) and curie_pattern.match(n)):
                 self.log_node_error(n, 'invalid property value', message='id is not a curie')
Esempio n. 2
0
def find_superclass(node, graph: MultiDiGraph) -> Optional[str]:
    """
    Attempts to find a superclass for the given node in the given graph. Chooses
    superclasses that are in the biolink model whenever able.

    The graph is explored with ``walk`` starting at ``node``, following
    outgoing ``same_as`` / ``subclass_of`` edges and incoming ``same_as``
    edges. The first visited node whose ``name`` resolves to a biolink model
    element wins and that element's name is returned. Otherwise the name of
    the visited, named node with the highest positive score is returned.

    :param node: key of the starting node in ``graph``.
    :param graph: graph holding ``edge_label`` on edges and ``name`` on nodes.
    :return: a superclass name, or ``None`` when no candidate was found.

    NOTE(review): ``graph.node`` is kept for consistency with the rest of the
    file; it was removed in networkx 2.4 in favour of ``graph.nodes``.
    """
    def super_class_generator(n) -> Tuple[str, int]:
        # Yields (neighbour, step score) pairs: 0 for an equivalent node
        # reached over `same_as`, 1 for a parent reached over `subclass_of`.
        for _, m, data in graph.out_edges(n, data=True):
            edge_label = data.get('edge_label')
            if edge_label == 'same_as':
                yield m, 0
            elif edge_label == 'subclass_of':
                yield m, 1

        # `same_as` is symmetric, so incoming edges count as well.
        for m, _, data in graph.in_edges(n, data=True):
            if data.get('edge_label') == 'same_as':
                yield m, 0

    best_node, best_score = None, 0

    for n, score in walk(node, super_class_generator):
        name = graph.node[n].get('name')
        if name is None:
            # Without a name this node can neither be matched against the
            # model nor serve as a fallback answer.
            continue

        c = bmt.get_element(name)
        if c is not None and c.name is not None:
            return c.name

        # Remember the walked node `n` (not the start node) as the fallback.
        if score > best_score:
            best_node, best_score = n, score

    if best_node is not None:
        return graph.node[best_node].get('name')
    return None
Esempio n. 3
0
 def save_attribute(self, rdfgraph:rdflib.Graph, obj_iri:URIRef, *, key:str, value:Union[List[str], str]) -> None:
     """
     Write one biolink node or edge attribute into ``rdfgraph``.
     Intended to be used within `ObanRdfTransformer.save`.

     Nothing is written unless ``key`` names a biolink element whose
     ``is_a`` is 'association slot' or 'node property'. The predicate is
     taken from ``mapping`` when ``key`` is listed there, otherwise it is
     built from ``BIOLINK`` and the element name with spaces replaced by
     underscores. A scalar ``value`` is treated as a one-item collection and
     one triple is added per item.
     """
     element = bmt.get_element(key)
     if element is None:
         return
     if not (element.is_a == 'association slot' or element.is_a == 'node property'):
         return

     predicate = (
         mapping[key]
         if key in mapping
         else URIRef('{}{}'.format(BIOLINK, element.name.replace(' ', '_')))
     )

     items = value if isinstance(value, (list, tuple, set)) else [value]

     for item in items:
         obj = item
         if element.range == 'iri type':
             # Title-case each word and join them to form the biolink IRI.
             obj = URIRef('{}{}'.format(BIOLINK, ''.join(item.title().split(' '))))
         # NOTE(review): the URIRef built for 'iri type' values is wrapped in
         # a Literal before being added — confirm this is intended.
         rdfgraph.add((obj_iri, predicate, rdflib.term.Literal(obj)))
Esempio n. 4
0
def get_term(curie: str, biolink_model_only=False) -> str:
    """
    Return a human-readable term for ``curie``.

    Resolution order:

    1. ``NCBITaxon:9606`` (case-insensitive) is always 'human'.
    2. If ``get_ontology`` finds no ontology for the curie, the curie itself
       is returned.
    3. The first label among the curie's reflexive ancestors that names a
       biolink model element is returned.
    4. Otherwise, with ``biolink_model_only`` the curie is returned; without
       it, the shortest ancestor label is returned, falling back to the
       curie when no ancestor has a label at all.

    :param curie: identifier to look up.
    :param biolink_model_only: when true, only biolink model terms are
        acceptable answers.
    :return: the chosen term, or ``curie`` when nothing better is available.
    """
    if curie.lower() == 'ncbitaxon:9606':
        return 'human'

    ontology = get_ontology(curie)
    if ontology is None:
        return curie

    labels = [
        ontology.label(c) for c in ontology.ancestors(curie, reflexive=True)
    ]

    for label in labels:
        if label is not None and bmt.get_element(label) is not None:
            return label

    if biolink_model_only:
        return curie

    # `min` keeps the first of several equally short labels, matching a
    # stable sort. `default` covers the case where no ancestor carries a
    # label, so the function never returns None or raises IndexError.
    return min(
        (label for label in labels if isinstance(label, str)),
        key=len,
        default=curie,
    )
Esempio n. 5
0
def clique_merge(graph: nx.Graph, report=False) -> nx.Graph:
    """
    Builds up cliques using the `same_as` attribute of each node. Uses those
    cliques to build up a mapping for relabelling nodes. Chooses labels so as
    to preserve the original nodes, rather than taking xrefs that don't appear
    as nodes in the graph.

    This method will also expand the `same_as` attribute of the nodes to
    include the discovered clique.

    Steps:

    1. Build an undirected helper graph with an edge for every `same_as`
       node property entry and every `same_as` edge.
    2. Drop helper edges between nodes that carry a pair of categories where
       neither is an ancestor of the other.
    3. For every connected component, choose a leader via ``build_sort_key``
       and map all other members to it.
    4. Relabel the graph, remove `same_as` edges, and refresh each node's
       `iri`, `id` and `same_as` attributes.

    :param graph: input graph; the `same_as` attribute of its nodes is
        overwritten in place while the mapping is built.
    :param report: not used by this function.
    :return: the graph returned by ``relabel_nodes``, after clean-up.

    NOTE(review): edges are read with ``keys=True``, so the relabelled graph
    is presumably a multigraph — confirm against callers.
    """
    original_size = len(graph)
    print('original graph has {} nodes'.format(original_size))

    cliqueGraph = nx.Graph()

    with click.progressbar(
            graph.nodes(data=True),
            label='building cliques from same_as node property') as bar:
        for n, attr_dict in bar:
            if 'same_as' in attr_dict:
                for m in attr_dict['same_as']:
                    cliqueGraph.add_edge(n, m)

    with click.progressbar(graph.edges(data=True),
                           label='building cliques from same_as edges') as bar:
        for u, v, attr_dict in bar:
            if attr_dict.get('edge_label') == 'same_as':
                cliqueGraph.add_edge(u, v)

    # The same few categories recur on many nodes, so look each one up once.
    ancestors_cache = {}

    def ancestors_of(category):
        if category not in ancestors_cache:
            ancestors_cache[category] = bmt.ancestors(category)
        return ancestors_cache[category]

    def incompatible(a, b):
        # Two categories clash when neither one is an ancestor of the other.
        return a not in ancestors_of(b) and b not in ancestors_of(a)

    edges = []
    with click.progressbar(cliqueGraph.edges(),
                           label='Breaking invalid cliques') as bar:
        for u, v in bar:
            try:
                u_attrs = graph.node[u]
                v_attrs = graph.node[v]
            except KeyError:
                # One endpoint is only an xref and not a node of `graph`,
                # so there are no categories to compare.
                continue

            # `listify` is applied for the same reason as in the mapping
            # step below: a scalar category must not be iterated
            # character by character.
            u_categories = listify(u_attrs.get('category', []))
            v_categories = listify(v_attrs.get('category', []))

            if any(incompatible(a, b)
                   for a in u_categories
                   for b in v_categories):
                edges.append((u, v))

    print('breaking {} many edges'.format(len(edges)))
    cliqueGraph.remove_edges_from(edges)

    mapping = {}

    connected_components = list(nx.connected_components(cliqueGraph))

    print('Discovered {} cliques'.format(len(connected_components)))

    with click.progressbar(connected_components,
                           label='building mapping') as bar:
        for nodes in bar:
            nodes = list(nodes)
            categories = set()
            for n in nodes:
                if not graph.has_node(n):
                    continue

                attr_dict = graph.node[n]

                # Every member receives this same list object, which is
                # sorted in place further down.
                attr_dict['same_as'] = nodes

                if 'category' in attr_dict:
                    categories.update(listify(attr_dict['category']))

                if 'categories' in attr_dict:
                    categories.update(listify(attr_dict['categories']))

            list_of_prefixes = []
            for category in categories:
                try:
                    list_of_prefixes.append(
                        bmt.get_element(category).id_prefixes)
                except AttributeError:
                    # Category unknown to the model (lookup gave None) or an
                    # element without id prefixes: it contributes nothing.
                    continue

            # Plain sort first so ties under the prefix-based key resolve
            # in a stable, predictable order.
            nodes.sort()
            nodes.sort(key=build_sort_key(list_of_prefixes))

            leader = nodes[0]
            for n in nodes:
                if n != leader:
                    mapping[n] = leader

    g = relabel_nodes(graph, mapping)

    # `.get` mirrors the guarded lookup used when the cliques were built:
    # edges without an `edge_label` are simply not `same_as` edges.
    same_as_edges = [
        (u, v, key)
        for u, v, key, data in g.edges(keys=True, data=True)
        if data.get('edge_label') == 'same_as'
    ]
    g.remove_edges_from(same_as_edges)

    for n, data in g.nodes(data=True):
        data['iri'] = expand_uri(n)
        if 'id' in data and data['id'] != n:
            data['id'] = n
        if 'same_as' in data and n in data['same_as']:
            # A node should not list itself as its own equivalent.
            data['same_as'].remove(n)
            if not data['same_as']:
                del data['same_as']

    final_size = len(g)
    print('Resulting graph has {} nodes'.format(final_size))
    print('Eliminated {} nodes'.format(original_size - final_size))

    return g