def get_clique_category(
    clique_graph: nx.MultiDiGraph, clique: List
) -> Tuple[str, List]:
    """
    Determine the category that represents a whole clique.

    The ``category`` values of every node in the clique are unioned,
    the union is ordered by ``sort_categories``, and the first entry of
    that ordering is taken as the clique category.

    Parameters
    ----------
    clique_graph: nx.MultiDiGraph
        Clique graph
    clique: List
        A list of nodes in clique

    Returns
    -------
    Tuple[str, list]
        The clique category and the ancestors of that category

    """
    # Gather the category collection stored on each clique member.
    per_node_categories = []
    for member in clique:
        per_node_categories.append(clique_graph.nodes()[member]["category"])

    combined = OrderedSet.union(*per_node_categories)
    ranked = sort_categories(combined)
    log.debug(f"outcome of union (sorted): {ranked}")

    # The head of the sorted list is used as the clique's category.
    leading_category = ranked[0]
    return leading_category, get_biolink_ancestors(leading_category)
def get_the_most_specific_category(self, categories: list) -> Tuple[str, list]:
    """
    Pick the most specific category out of ``categories``.

    Ancestors are fetched for every category that resolves to a Biolink
    Model element; the category whose ancestor list is longest wins.
    Categories that do not resolve to an element are skipped.

    Parameters
    ----------
    categories: list
        A list of categories

    Returns
    -------
    tuple[str, list]
        The most specific category (as given in ``categories``) and the
        list of ancestors of that category. ``(None, [])`` when no
        category produced a non-empty ancestor list.

    """
    # TODO: could be integrated into update_categories method
    best_category = None
    best_ancestors = []
    for category in categories:
        logging.debug("category: {}".format(category))
        element = get_biolink_element(category)
        if not element:
            # not a class (or alias of a class) in BioLink Model
            continue
        ancestors = get_biolink_ancestors(element['name'])
        logging.debug("ancestors: {}".format(ancestors))
        # a strictly longer ancestor list means a more specific category
        if len(ancestors) > len(best_ancestors):
            best_category, best_ancestors = category, ancestors
    return best_category, best_ancestors
def test_get_biolink_ancestors():
    """
    Check that ancestors are returned for 'phenotypic feature'.
    """
    lineage = get_biolink_ancestors('phenotypic feature')
    assert lineage is not None
    expected_size = 5
    assert len(lineage) == expected_size
def test_check_all_categories2():
    """
    Exercise check_all_categories against a pinned Biolink release.
    """
    # explicitly pin the release of Biolink to a
    # specific one with known category ancestry
    get_toolkit(biolink_release="2.2.11")

    # Full ancestry of Gene.
    # mixins are not valid biolink categories, but they are ancestors.
    valid, invalid_biolink, invalid = check_all_categories(
        get_biolink_ancestors("biolink:Gene")
    )
    assert len(valid) == 4
    assert len(invalid_biolink) == 0
    assert len(invalid) == 8

    # Explicit category lists: (input, expected sizes of the three results).
    cases = [
        (["biolink:NamedThing", "biolink:GeneOrGeneProduct", "biolink:Gene"], (2, 0, 1)),
        (["biolink:NamedThing", "biolink:GeneOrGeneProduct", "Node"], (1, 0, 2)),
    ]
    for categories, (n_valid, n_invalid_biolink, n_invalid) in cases:
        valid, invalid_biolink, invalid = check_all_categories(categories)
        assert len(valid) == n_valid
        assert len(invalid_biolink) == n_invalid_biolink
        assert len(invalid) == n_invalid
def test_get_biolink_ancestors():
    """
    Check that ancestors are returned for "phenotypic feature".
    """
    lineage = get_biolink_ancestors("phenotypic feature")
    assert lineage is not None
    # changed to 6 from 5 when biolink model updated to 2.2.1 and mixins are included in ancestry
    expected_size = 6
    assert len(lineage) == expected_size
def test_check_categories():
    """
    Exercise check_categories with matching and conflicting closures.
    """
    # A category checked against its own ancestor closure is valid.
    gene_closure = get_biolink_ancestors("biolink:Gene")
    valid, invalid_biolink, _ = check_categories(["biolink:Gene"], gene_closure, None)
    assert "biolink:Gene" in valid
    assert len(invalid_biolink) == 0

    # An ancestor of Disease is valid against the Disease closure.
    disease_closure = get_biolink_ancestors("biolink:Disease")
    valid, invalid_biolink, _ = check_categories(
        ["biolink:BiologicalEntity"], disease_closure, None
    )
    assert "biolink:BiologicalEntity" in valid
    assert len(invalid_biolink) == 0

    # Disease is reported as an invalid biolink category against the Gene closure.
    gene_closure = get_biolink_ancestors("biolink:Gene")
    valid, invalid_biolink, _ = check_categories(["biolink:Disease"], gene_closure, None)
    assert len(valid) == 0
    assert len(invalid_biolink) == 1
    assert "biolink:Disease" in invalid_biolink
def test_check_categories():
    """
    Exercise check_categories with matching and conflicting closures.
    """
    # A category checked against its own ancestor closure is valid.
    gene_closure = get_biolink_ancestors('biolink:Gene')
    valid, invalid_biolink, _ = check_categories(['biolink:Gene'], gene_closure, None)
    assert 'biolink:Gene' in valid
    assert len(invalid_biolink) == 0

    # GenomicEntity is expected to be valid against the Gene closure.
    gene_closure = get_biolink_ancestors('biolink:Gene')
    valid, invalid_biolink, _ = check_categories(['biolink:GenomicEntity'], gene_closure, None)
    assert 'biolink:GenomicEntity' in valid
    assert len(invalid_biolink) == 0

    # Disease is reported as an invalid biolink category against the Gene closure.
    gene_closure = get_biolink_ancestors('biolink:Gene')
    valid, invalid_biolink, _ = check_categories(['biolink:Disease'], gene_closure, None)
    assert len(valid) == 0
    assert len(invalid_biolink) == 1
    assert 'biolink:Disease' in invalid_biolink
def check_all_categories(categories) -> Tuple[List, List, List]:
    """
    Classify every entry of ``categories``.

    The categories are first ordered with ``sort_categories`` and then
    checked one at a time with ``check_categories``. A category is
    checked against the ancestors of the most recently remembered valid
    category when there is one, otherwise against its own ancestors.

    Parameters
    ----------
    categories: List
        A list of categories

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories,
        and invalid categories

    Note: the sort_categories method will re-arrange the passed in category list
    according to the distance of each list member from the top of their hierarchy.
    Each category's hierarchy is made up of its 'is_a' and mixin ancestors.

    """
    anchor: List = []
    valid_found: List = []
    invalid_biolink_found: List = []
    invalid_found: List = []

    for category in sort_categories(categories):
        # Compare against the remembered valid category if any, else itself.
        reference = anchor[0] if anchor else category
        vbc, ibc, ic = check_categories(
            [category], get_biolink_ancestors(reference), None
        )
        if vbc:
            valid_found.extend(vbc)
        if ic:
            invalid_found.extend(ic)
        if ibc:
            invalid_biolink_found.extend(ibc)
        else:
            # No conflict reported: remember this round's valid categories
            # (possibly empty) as the reference for the next category.
            anchor = vbc

    return valid_found, invalid_biolink_found, invalid_found
def check_all_categories(categories) -> Tuple[List, List, List]:
    """
    Classify every entry of ``categories``.

    The categories are first ordered with ``sort_categories`` and then
    checked one at a time with ``check_categories``. A category is
    checked against the ancestors of the most recently remembered valid
    category when there is one, otherwise against its own ancestors.

    Parameters
    ----------
    categories: List
        A list of categories

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories,
        and invalid categories

    """
    anchor: List = []
    valid_found: List = []
    invalid_biolink_found: List = []
    invalid_found: List = []

    for category in sort_categories(categories):
        # Compare against the remembered valid category if any, else itself.
        reference = anchor[0] if anchor else category
        vbc, ibc, ic = check_categories([category], get_biolink_ancestors(reference), None)
        if vbc:
            valid_found.extend(vbc)
        if ic:
            invalid_found.extend(ic)
        if ibc:
            invalid_biolink_found.extend(ibc)
        else:
            # No conflict reported: remember this round's valid categories
            # (possibly empty) as the reference for the next category.
            anchor = vbc

    return valid_found, invalid_biolink_found, invalid_found
def sort_categories(categories: Union[List, Set, OrderedSet]) -> List:
    """
    Order categories from most specific to most generic.

    Specificity is measured by the number of ancestors reported by
    ``get_biolink_ancestors``; categories with the same count keep their
    incoming relative order.

    Parameters
    ----------
    categories: Union[List, Set, OrderedSet]
        A list of categories

    Returns
    -------
    List
        A sorted list of categories where sorted means that the first element in the list
        returned has the most number of parents in the class hierarchy.

    """

    def ancestor_count(category):
        # More ancestors means deeper in the hierarchy, i.e. more specific.
        return len(get_biolink_ancestors(category))

    return sorted(categories, key=ancestor_count, reverse=True)
def sort_categories(categories: Union[List, Set, OrderedSet]) -> List:
    """
    Order categories from most specific to most generic.

    Specificity is measured by the number of ancestors reported by
    ``get_biolink_ancestors``; categories with the same count keep their
    incoming relative order.

    Parameters
    ----------
    categories: Union[List, Set, OrderedSet]
        A list of categories

    Returns
    -------
    List
        A sorted list of categories

    """

    def ancestor_count(category):
        # More ancestors means deeper in the hierarchy, i.e. more specific.
        return len(get_biolink_ancestors(category))

    return sorted(categories, key=ancestor_count, reverse=True)
def test_check_all_categories2():
    """
    Exercise check_all_categories on a full ancestry and on explicit lists.
    """
    # Full ancestry of Gene: everything is expected to be valid.
    valid, invalid_biolink, invalid = check_all_categories(
        get_biolink_ancestors('biolink:Gene')
    )
    assert len(valid) == 6
    assert len(invalid_biolink) == 0
    assert len(invalid) == 0

    # Explicit category lists: (input, expected sizes of the three results).
    cases = [
        (['biolink:NamedThing', 'biolink:GeneOrGeneProduct', 'biolink:Gene'], (2, 1, 0)),
        (['biolink:NamedThing', 'biolink:GeneOrGeneProduct', 'Node'], (1, 1, 1)),
    ]
    for categories, (n_valid, n_invalid_biolink, n_invalid) in cases:
        valid, invalid_biolink, invalid = check_all_categories(categories)
        assert len(valid) == n_valid
        assert len(invalid_biolink) == n_invalid_biolink
        assert len(invalid) == n_invalid
def update_node_categories(
    target_graph: BaseGraph,
    clique_graph: nx.MultiDiGraph,
    clique: List,
    category_mapping: Optional[Dict[str, str]],
    strict: bool = True,
) -> List:
    """
    Validate and expand the categories of every node in a clique.

    For each node the categories are read from the clique graph (or
    derived via ``get_category_from_equivalence`` when the node has no
    ``category`` property), classified with ``check_all_categories``,
    and expanded with the ancestors of the valid categories. The
    results are written back as node attributes on both graphs.

    For example, If a node has ``biolink:Gene`` as its category,
    then this method adds all of its ancestors.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    clique_graph: networkx.Graph
        The clique graph
    clique: List
        A list of nodes from a clique
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
        (accepted but not read by this function body)
    strict: bool
        Whether or not to merge nodes in a clique that have conflicting node categories

    Returns
    -------
    List
        The clique

    """
    clique_graph_updates = {}
    target_graph_updates = {}

    for node in clique:
        # Prefer the category stored on the node; otherwise derive one.
        node_data = clique_graph.nodes()[node]
        categories = (
            node_data["category"]
            if "category" in node_data
            else get_category_from_equivalence(
                target_graph, clique_graph, node, node_data
            )
        )

        # differentiate between valid and invalid categories
        outcome = check_all_categories(categories)
        valid_biolink_categories, invalid_biolink_categories, invalid_categories = outcome
        log.debug(
            f"valid biolink categories: {valid_biolink_categories} invalid biolink categories: {invalid_biolink_categories} invalid_categories: {invalid_categories}"
        )

        # extend categories to have the longest list of ancestors
        extended_categories: List = []
        for valid_category in valid_biolink_categories:
            ancestors = get_biolink_ancestors(valid_category)
            if len(ancestors) > len(extended_categories):
                extended_categories.extend(ancestors)
        log.debug(f"Extended categories: {extended_categories}")

        # Flags that are recorded identically on both graphs.
        shared_flags: Dict = {}
        if invalid_biolink_categories:
            if strict:
                shared_flags["_excluded_from_clique"] = True
            shared_flags["invalid_biolink_category"] = invalid_biolink_categories
        if invalid_categories:
            shared_flags["_invalid_category"] = invalid_categories

        # Only the clique graph receives the expanded category list.
        clique_graph_update: Dict = {"category": list(extended_categories)}
        clique_graph_update.update(shared_flags)
        target_graph_update: Dict = dict(shared_flags)

        clique_graph_updates[node] = clique_graph_update
        target_graph_updates[node] = target_graph_update

    nx.set_node_attributes(clique_graph, clique_graph_updates)
    target_graph.set_node_attributes(target_graph, target_graph_updates)
    return clique
def test_get_biolink_ancestors():
    """
    Check that ancestors are returned for 'phenotypic feature'.
    """
    # TODO: Parameterize
    lineage = get_biolink_ancestors('phenotypic feature')
    assert lineage is not None
    expected_size = 4
    assert len(lineage) == expected_size
def update_categories(self, clique: list):
    """
    For a given clique, get category for each node in clique and validate against
    BioLink Model, mapping to BioLink Model category where needed.

    Ex.: If a node has `gene` as its category, then this method adds all of its ancestors.

    For every node the method writes a ``category`` attribute (the
    ancestor list of the most specific recognised category) and, when
    any were found, an ``_invalid_category`` attribute. Both the clique
    graph and the target graph are updated.

    Parameters
    ----------
    clique: list
        A list of nodes from a clique

    """
    updated_node_categories = {}
    for node in clique:
        data = self.clique_graph.nodes[node]
        # Diagnostic output goes through logging rather than stdout.
        logging.debug("Node data: {}".format(data))
        if 'category' in data:
            categories = data['category']
        else:
            # get category from equivalence
            categories = self.get_category_from_equivalence(node, data)

        # First pass: find the longest ancestor closure among the
        # categories that BioLink Model recognises.
        extended_categories = set()
        invalid_categories = []
        for category in categories:
            logging.debug("Looking at category: {}".format(category))
            element = get_biolink_element(category)
            if element:
                # category exists in BioLink Model as a class or as an alias to a class
                mapped_category = element['name']
                ancestors = get_biolink_ancestors(mapped_category)
                if len(ancestors) > len(extended_categories):
                    # the category with the longest list of ancestors will be the most specific category
                    logging.debug("Ancestors for {} is larger than previous one".format(mapped_category))
                    extended_categories = ancestors
            else:
                logging.warning("[1] category '{}' not in BioLink Model".format(category))
                invalid_categories.append(category)
        logging.debug("Invalid categories: {}".format(invalid_categories))

        # Second pass: flag recognised categories that fall outside the
        # chosen closure, even after applying the custom MAPPING.
        for x in categories:
            element = get_biolink_element(x)
            if element is None:
                logging.warning("[2] category '{}' is not in BioLink Model".format(x))
                continue
            mapped_category = element['name']
            if mapped_category not in extended_categories:
                logging.warning("category '{}' not in ancestor closure: {}".format(mapped_category, extended_categories))
                mapped = MAPPING.get(x, x)
                if mapped not in extended_categories:
                    logging.warning("category '{}' is not even in any custom defined mapping. ".format(mapped_category))
                    invalid_categories.append(x)

        update_dict = {'category': extended_categories}
        if invalid_categories:
            update_dict['_invalid_category'] = invalid_categories
        updated_node_categories[node] = update_dict

    logging.debug("Updating nodes in clique with: {}".format(updated_node_categories))
    nx.set_node_attributes(self.clique_graph, updated_node_categories)
    nx.set_node_attributes(self.target_graph, updated_node_categories)
def process_predicate(self, p: Optional[Union[URIRef, str]]) -> Tuple:
    """
    Process a predicate where the method checks if there is a mapping in Biolink Model.

    Results are stored in ``self.cache`` keyed by ``p``; a predicate that
    is already in the cache is answered from it without any further
    lookups.

    Parameters
    ----------
    p: Optional[Union[URIRef, str]]
        The predicate

    Returns
    -------
    Tuple
        A tuple that contains the Biolink CURIE (if available), the Biolink slot_uri CURIE (if available),
        the CURIE form of p, the reference of p

    """
    if p in self.cache:
        # already processed this predicate before; pull from cache
        element_uri = self.cache[p]['element_uri']
        canonical_uri = self.cache[p]['canonical_uri']
        predicate = self.cache[p]['predicate']
        property_name = self.cache[p]['property_name']
    else:
        # haven't seen this property before; map to element
        # Step 1: derive a CURIE form of p when p is an IRI.
        if self.prefix_manager.is_iri(p):
            predicate = self.prefix_manager.contract(str(p))
        else:
            predicate = None
        # Step 2: work out the property name (the reference part).
        if self.prefix_manager.is_curie(p):
            property_name = self.prefix_manager.get_reference(p)
            predicate = p
        else:
            if predicate and self.prefix_manager.is_curie(predicate):
                property_name = self.prefix_manager.get_reference(predicate)
            else:
                # p is neither a CURIE nor contractible to one: use it
                # verbatim and give it an empty-prefix CURIE form.
                property_name = p
                predicate = f":{p}"
        # Step 3: look p up as a Biolink Model element.
        element = self.get_biolink_element(p)
        canonical_uri = None
        if element:
            if isinstance(element, SlotDefinition):
                # predicate corresponds to a biolink slot
                if element.definition_uri:
                    element_uri = self.prefix_manager.contract(element.definition_uri)
                else:
                    element_uri = f"biolink:{sentencecase_to_snakecase(element.name)}"
                if element.slot_uri:
                    canonical_uri = element.slot_uri
            elif isinstance(element, ClassDefinition):
                # this will happen only when the IRI is actually
                # a reference to a class
                element_uri = self.prefix_manager.contract(element.class_uri)
            else:
                element_uri = f"biolink:{sentencecase_to_camelcase(element.name)}"
            # Elements with biolink:Attribute among their ancestors always
            # get a snake_case URI, overriding the value chosen above.
            if 'biolink:Attribute' in get_biolink_ancestors(element.name):
                element_uri = f"biolink:{sentencecase_to_snakecase(element.name)}"
            if not predicate:
                predicate = element_uri
        else:
            # no mapping to biolink model;
            # look at predicate mappings
            element_uri = None
            if p in self.predicate_mapping:
                property_name = self.predicate_mapping[p]
                predicate = f":{property_name}"
        # Remember the outcome so later calls with the same p skip the lookups.
        self.cache[p] = {
            'element_uri': element_uri,
            'canonical_uri': canonical_uri,
            'predicate': predicate,
            'property_name': property_name,
        }
    return element_uri, canonical_uri, predicate, property_name
def process_predicate(
    prefix_manager: PrefixManager,
    p: Union[URIRef, str],
    predicate_mapping: Optional[Dict] = None,
) -> Tuple:
    """
    Process a predicate where the method checks if there is a mapping in Biolink Model.

    Parameters
    ----------
    prefix_manager: PrefixManager
        An instance of prefix manager
    p: Union[URIRef, str]
        The predicate
    predicate_mapping: Optional[Dict]
        Predicate mappings; consulted only when ``p`` does not resolve
        to a Biolink Model element

    Returns
    -------
    Tuple[str, str, str, str]
        A tuple that contains the Biolink CURIE (if available), the Biolink slot_uri CURIE (if available),
        the CURIE form of p, the reference of p

    """
    # Step 1: derive a CURIE form of p when p is an IRI.
    if prefix_manager.is_iri(p):
        predicate = prefix_manager.contract(str(p))
    else:
        predicate = None
    # Step 2: work out the property name (the reference part).
    if prefix_manager.is_curie(p):
        property_name = prefix_manager.get_reference(p)
        predicate = p
    else:
        if predicate and prefix_manager.is_curie(predicate):
            property_name = prefix_manager.get_reference(predicate)
        else:
            # p is neither a CURIE nor contractible to one: use it
            # verbatim and give it an empty-prefix CURIE form.
            property_name = p
            predicate = f":{p}"
    # Step 3: look p up as a Biolink Model element.
    element = get_biolink_element(prefix_manager, p)
    canonical_uri = None
    if element:
        if isinstance(element, SlotDefinition):
            # predicate corresponds to a biolink slot
            if element.definition_uri:
                element_uri = prefix_manager.contract(element.definition_uri)
            else:
                element_uri = f"biolink:{sentencecase_to_snakecase(element.name)}"
            if element.slot_uri:
                canonical_uri = element.slot_uri
        elif isinstance(element, ClassDefinition):
            # this will happen only when the IRI is actually
            # a reference to a class
            element_uri = prefix_manager.contract(element.class_uri)
        else:
            element_uri = f"biolink:{sentencecase_to_camelcase(element.name)}"
        # Elements with biolink:Attribute among their ancestors always
        # get a snake_case URI, overriding the value chosen above.
        if "biolink:Attribute" in get_biolink_ancestors(element.name):
            element_uri = f"biolink:{sentencecase_to_snakecase(element.name)}"
        if not predicate:
            predicate = element_uri
    else:
        # no mapping to biolink model;
        # look at predicate mappings
        element_uri = None
        if predicate_mapping:
            if p in predicate_mapping:
                property_name = predicate_mapping[p]
                predicate = f":{property_name}"
    # NOTE(review): result caching is disabled in this version; the
    # commented-out statement below is kept from the original.
    # cache[p] = {'element_uri': element_uri, 'canonical_uri': canonical_uri,
    # 'predicate': predicate, 'property_name': property_name}
    return element_uri, canonical_uri, predicate, property_name