Beispiel #1
0
def get_clique_category(
    clique_graph: nx.MultiDiGraph, clique: List
) -> Tuple[str, List]:
    """
    Determine the category that represents a whole clique.

    The ``category`` value of every node in the clique is collected, the
    values are unioned, and the union is ordered with ``sort_categories``.
    The first entry of that ordering becomes the clique category.

    Parameters
    ----------
    clique_graph: nx.MultiDiGraph
        Clique graph
    clique: List
        A list of nodes in clique

    Returns
    -------
    Tuple[str, list]
        A tuple of the clique category and the ancestors of that category

    """
    per_node_categories = []
    for member in clique:
        per_node_categories.append(clique_graph.nodes()[member]["category"])

    combined = OrderedSet.union(*per_node_categories)
    ranked = sort_categories(combined)
    log.debug(f"outcome of union (sorted): {ranked}")

    # the head of the sorted union is taken as the clique's category
    top_category = ranked[0]
    return top_category, get_biolink_ancestors(top_category)
Beispiel #2
0
    def get_the_most_specific_category(self, categories: list) -> Tuple[str, list]:
        """
        Pick the most specific category out of ``categories``.

        Each category is looked up in the BioLink Model; categories that cannot be
        resolved are skipped. For the remaining ones the ancestors are fetched, and the
        category with the strictly longest ancestor list wins (on a tie the earlier
        category is kept).

        Parameters
        ----------
        categories: list
            A list of categories

        Returns
        -------
        tuple[str, list]
            The most specific category and its ancestors; ``(None, [])`` when no
            category could be resolved

        """
        # TODO: could be integrated into update_categories method
        best_category = None
        best_ancestors = []
        for candidate in categories:
            logging.debug("category: {}".format(candidate))
            element = get_biolink_element(candidate)
            if not element:
                # not a BioLink Model class (or alias of one); nothing to compare
                continue
            ancestors = get_biolink_ancestors(element['name'])
            logging.debug("ancestors: {}".format(ancestors))
            if len(best_ancestors) < len(ancestors):
                # a longer ancestor list means a more specific category
                best_category = candidate
                best_ancestors = ancestors
        return best_category, best_ancestors
Beispiel #3
0
def test_get_biolink_ancestors():
    """
    Check the ancestor list returned for 'phenotypic feature'.
    """
    phenotypic_feature_ancestors = get_biolink_ancestors('phenotypic feature')
    assert phenotypic_feature_ancestors is not None
    # this version of the test expects exactly five ancestors
    assert len(phenotypic_feature_ancestors) == 5
Beispiel #4
0
def test_check_all_categories2():
    """
    Exercise check_all_categories against a pinned Biolink release.
    """
    # explicitly pin the release of Biolink to a
    # specific one with known category ancestry
    get_toolkit(biolink_release="2.2.11")

    gene_ancestry = get_biolink_ancestors("biolink:Gene")
    valid, invalid_biolink, invalid = check_all_categories(gene_ancestry)
    assert len(valid) == 4
    assert len(invalid_biolink) == 0
    # mixins are not valid biolink categories, but they are ancestors.
    assert len(invalid) == 8

    valid, invalid_biolink, invalid = check_all_categories(
        ["biolink:NamedThing", "biolink:GeneOrGeneProduct", "biolink:Gene"]
    )
    assert len(valid) == 2
    assert len(invalid_biolink) == 0
    assert len(invalid) == 1

    valid, invalid_biolink, invalid = check_all_categories(
        ["biolink:NamedThing", "biolink:GeneOrGeneProduct", "Node"]
    )
    assert len(valid) == 1
    assert len(invalid_biolink) == 0
    assert len(invalid) == 2
Beispiel #5
0
def test_get_biolink_ancestors():
    """
    Check the ancestor list returned for "phenotypic feature".
    """
    phenotypic_feature_ancestors = get_biolink_ancestors("phenotypic feature")
    assert phenotypic_feature_ancestors is not None
    # changed to 6 from 5 when biolink model updated to 2.2.1 and mixins are included in ancestry
    expected_count = 6
    assert len(phenotypic_feature_ancestors) == expected_count
Beispiel #6
0
def test_check_categories():
    """
    Exercise check_categories with matching and non-matching ancestor closures.
    """
    # a category checked against its own ancestors is reported as valid
    valid, invalid_biolink, _ = check_categories(
        ["biolink:Gene"], get_biolink_ancestors("biolink:Gene"), None
    )
    assert "biolink:Gene" in valid
    assert len(invalid_biolink) == 0

    # a category checked against the ancestors of biolink:Disease is reported as valid
    valid, invalid_biolink, _ = check_categories(
        ["biolink:BiologicalEntity"], get_biolink_ancestors("biolink:Disease"), None
    )
    assert "biolink:BiologicalEntity" in valid
    assert len(invalid_biolink) == 0

    # a category outside the closure is reported as an invalid biolink category
    valid, invalid_biolink, _ = check_categories(
        ["biolink:Disease"], get_biolink_ancestors("biolink:Gene"), None
    )
    assert len(valid) == 0
    assert len(invalid_biolink) == 1 and "biolink:Disease" in invalid_biolink
Beispiel #7
0
def test_check_categories():
    """
    Exercise check_categories against the ancestor closure of biolink:Gene.
    """
    # the category itself is part of its own closure
    gene_ancestors = get_biolink_ancestors('biolink:Gene')
    valid, invalid_biolink, _ = check_categories(['biolink:Gene'], gene_ancestors, None)
    assert 'biolink:Gene' in valid
    assert len(invalid_biolink) == 0

    # biolink:GenomicEntity is expected to be accepted against the Gene closure
    gene_ancestors = get_biolink_ancestors('biolink:Gene')
    valid, invalid_biolink, _ = check_categories(['biolink:GenomicEntity'], gene_ancestors, None)
    assert 'biolink:GenomicEntity' in valid
    assert len(invalid_biolink) == 0

    # biolink:Disease is expected to be rejected against the Gene closure
    gene_ancestors = get_biolink_ancestors('biolink:Gene')
    valid, invalid_biolink, _ = check_categories(['biolink:Disease'], gene_ancestors, None)
    assert len(valid) == 0
    assert len(invalid_biolink) == 1 and 'biolink:Disease' in invalid_biolink
Beispiel #8
0
def check_all_categories(categories) -> Tuple[List, List, List]:
    """
    Classify every entry of ``categories`` as valid or invalid.

    The categories are first ordered with ``sort_categories`` and then checked one
    by one with ``check_categories``. A category is checked against the ancestors
    of the first valid category from the most recent iteration that reported no
    invalid biolink category; while no such category exists it is checked against
    its own ancestors.

    Parameters
    ----------
    categories: List
        A list of categories

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories, and invalid categories

    Note: the sort_categories method will re-arrange the passed in category list according to the distance
    of each list member from the top of their hierarchy.  Each category's hierarchy is made up of its
    'is_a' and mixin ancestors.

    """
    valid_biolink_categories: List = []
    invalid_biolink_categories: List = []
    invalid_categories: List = []
    previous: List = []

    for c in sort_categories(categories):
        # compare against the last accepted category, or against itself when there is none
        reference = previous[0] if previous else c
        vbc, ibc, ic = check_categories([c], get_biolink_ancestors(reference), None)

        if vbc:
            valid_biolink_categories.extend(vbc)
        if ic:
            invalid_categories.extend(ic)
        if ibc:
            invalid_biolink_categories.extend(ibc)
        else:
            # no invalid biolink category this round: its valid result becomes the new reference
            previous = vbc

    return valid_biolink_categories, invalid_biolink_categories, invalid_categories
Beispiel #9
0
def check_all_categories(categories) -> Tuple[List, List, List]:
    """
    Sort ``categories`` and validate each one with ``check_categories``.

    Each category is validated against the ancestors of an anchor category: the
    first valid category from the latest iteration that produced no invalid
    biolink category, or the category itself while no anchor is available.

    Parameters
    ----------
    categories: List
        A list of categories

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories, and invalid categories

    """
    accepted: List = []
    rejected_biolink: List = []
    rejected: List = []
    previous: List = []

    for category in sort_categories(categories):
        anchor = previous[0] if previous else category
        valid, invalid_biolink, invalid = check_categories(
            [category], get_biolink_ancestors(anchor), None
        )
        if valid:
            accepted.extend(valid)
        if invalid:
            rejected.extend(invalid)
        if invalid_biolink:
            rejected_biolink.extend(invalid_biolink)
        else:
            # only a round without invalid biolink categories moves the anchor
            previous = valid

    return accepted, rejected_biolink, rejected
Beispiel #10
0
def sort_categories(categories: Union[List, Set, OrderedSet]) -> List:
    """
    Order categories by the size of their ancestor list, largest first.

    Parameters
    ----------
    categories: Union[List, Set, OrderedSet]
        A list of categories

    Returns
    -------
    List
        The categories sorted so that the first element is the one with the most
        ancestors. Categories with equally many ancestors keep their input order.

    """

    def ancestor_count(category):
        # weight of a category = number of ancestors reported for it
        return len(get_biolink_ancestors(category))

    # sorted() is stable, also with reverse=True, so ties stay in iteration order
    return sorted(categories, key=ancestor_count, reverse=True)
Beispiel #11
0
def sort_categories(categories: Union[List, Set, OrderedSet]) -> List:
    """
    Arrange categories from the one with the most ancestors to the one with the fewest.

    Parameters
    ----------
    categories: Union[List, Set, OrderedSet]
        A list of categories

    Returns
    -------
    List
        A sorted list of categories; entries with the same number of ancestors
        stay in their original relative order

    """
    # pair each category with its ancestor count, then sort on the count only
    weighted = [
        (len(get_biolink_ancestors(category)), category) for category in categories
    ]
    weighted.sort(key=lambda pair: pair[0], reverse=True)
    return [category for _, category in weighted]
Beispiel #12
0
def test_check_all_categories2():
    """
    Exercise check_all_categories with ancestor lists and hand-written category lists.
    """
    gene_ancestry = get_biolink_ancestors('biolink:Gene')
    valid, invalid_biolink, invalid = check_all_categories(gene_ancestry)
    assert len(valid) == 6
    assert len(invalid_biolink) == 0
    assert len(invalid) == 0

    valid, invalid_biolink, invalid = check_all_categories(
        ['biolink:NamedThing', 'biolink:GeneOrGeneProduct', 'biolink:Gene']
    )
    assert len(valid) == 2
    assert len(invalid_biolink) == 1
    assert len(invalid) == 0

    valid, invalid_biolink, invalid = check_all_categories(
        ['biolink:NamedThing', 'biolink:GeneOrGeneProduct', 'Node']
    )
    assert len(valid) == 1
    assert len(invalid_biolink) == 1
    assert len(invalid) == 1
Beispiel #13
0
def update_node_categories(
    target_graph: BaseGraph,
    clique_graph: nx.MultiDiGraph,
    clique: List,
    category_mapping: Optional[Dict[str, str]],
    strict: bool = True,
) -> List:
    """
    Validate the categories of every node in a clique and record the outcome on both graphs.

    For each node the categories are split into valid biolink, invalid biolink and
    invalid categories via ``check_all_categories``. The node's ``category`` in the
    clique graph is replaced by the ancestors of its valid categories, and any invalid
    categories are stored as node properties on the clique graph and the target graph.

    For example, If a node has ``biolink:Gene`` as its category, then this method adds all of its ancestors.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    clique_graph: networkx.Graph
        The clique graph
    clique: List
        A list of nodes from a clique
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
        (not read by this function body)
    strict: bool
        When true, a node with invalid biolink categories is additionally flagged
        with ``_excluded_from_clique``

    Returns
    -------
    List
        The clique, unchanged

    """
    clique_graph_updates = {}
    target_graph_updates = {}

    for node in clique:
        node_data = clique_graph.nodes()[node]
        # prefer the node's own categories; otherwise derive them from its equivalences
        categories = (
            node_data["category"]
            if "category" in node_data
            else get_category_from_equivalence(
                target_graph, clique_graph, node, node_data
            )
        )

        # differentiate between valid and invalid categories
        checked = check_all_categories(categories)
        valid_biolink_categories, invalid_biolink_categories, invalid_categories = checked
        log.debug(
            f"valid biolink categories: {valid_biolink_categories} invalid biolink categories: {invalid_biolink_categories} invalid_categories: {invalid_categories}"
        )

        # grow the category list with any ancestor list longer than what has been collected so far
        extended_categories: List = []
        for valid_category in valid_biolink_categories:
            ancestors = get_biolink_ancestors(valid_category)
            if len(extended_categories) < len(ancestors):
                extended_categories.extend(ancestors)
        log.debug(f"Extended categories: {extended_categories}")

        clique_node_update: Dict = {"category": list(extended_categories)}
        target_node_update: Dict = {}
        both_updates = (clique_node_update, target_node_update)

        if invalid_biolink_categories:
            if strict:
                for update in both_updates:
                    update["_excluded_from_clique"] = True
            for update in both_updates:
                update["invalid_biolink_category"] = invalid_biolink_categories

        if invalid_categories:
            for update in both_updates:
                update["_invalid_category"] = invalid_categories

        clique_graph_updates[node] = clique_node_update
        target_graph_updates[node] = target_node_update

    nx.set_node_attributes(clique_graph, clique_graph_updates)
    target_graph.set_node_attributes(target_graph, target_graph_updates)
    return clique
Beispiel #14
0
def test_get_biolink_ancestors():
    """
    Check the ancestor list returned for 'phenotypic feature'.
    """
    # TODO: Parameterize
    phenotypic_feature_ancestors = get_biolink_ancestors('phenotypic feature')
    assert phenotypic_feature_ancestors is not None
    # this version of the test expects exactly four ancestors
    assert len(phenotypic_feature_ancestors) == 4
Beispiel #15
0
    def update_categories(self, clique: list):
        """
        For a given clique, get category for each node in clique and validate against BioLink Model,
        mapping to BioLink Model category where needed.

        Ex.: If a node has `gene` as its category, then this method adds all of its ancestors.

        For every node the ``category`` attribute is replaced by the longest ancestor
        list found among its categories. Categories that are unknown to the BioLink
        Model, or that fall outside that ancestor list even after applying ``MAPPING``,
        are stored under ``_invalid_category``. The updates are written to both
        ``self.clique_graph`` and ``self.target_graph``.

        Parameters
        ----------
        clique: list
            A list of nodes from a clique

        """
        updated_node_categories = {}
        for node in clique:
            data = self.clique_graph.nodes[node]
            # report node data through logging, like the rest of this method, instead of stdout
            logging.debug("Node data: {}".format(data))
            if 'category' in data:
                categories = data['category']
            else:
                # get category from equivalence
                categories = self.get_category_from_equivalence(node, data)

            extended_categories = set()
            invalid_categories = []
            for category in categories:
                logging.debug("Looking at category: {}".format(category))
                element = get_biolink_element(category)
                if element:
                    # category exists in BioLink Model as a class or as an alias to a class
                    mapped_category = element['name']
                    ancestors = get_biolink_ancestors(mapped_category)
                    if len(ancestors) > len(extended_categories):
                        # the category with the longest list of ancestors will be the most specific category
                        logging.debug("Ancestors for {} is larger than previous one".format(mapped_category))
                        extended_categories = ancestors
                else:
                    logging.warning("[1] category '{}' not in BioLink Model".format(category))
                    invalid_categories.append(category)
            logging.debug("Invalid categories: {}".format(invalid_categories))

            # second pass: flag known categories that are outside the chosen ancestor closure
            for x in categories:
                element = get_biolink_element(x)
                if element is None:
                    logging.warning("[2] category '{}' is not in BioLink Model".format(x))
                    continue
                mapped_category = element['name']
                if mapped_category not in extended_categories:
                    logging.warning("category '{}' not in ancestor closure: {}".format(mapped_category, extended_categories))
                    # fall back to the custom mapping; unmapped categories map to themselves
                    mapped = MAPPING.get(x, x)
                    if mapped not in extended_categories:
                        logging.warning("category '{}' is not even in any custom defined mapping. ".format(mapped_category))
                        invalid_categories.append(x)

            update_dict = {'category': extended_categories}
            if invalid_categories:
                update_dict['_invalid_category'] = invalid_categories
            updated_node_categories[node] = update_dict
        logging.debug("Updating nodes in clique with: {}".format(updated_node_categories))
        nx.set_node_attributes(self.clique_graph, updated_node_categories)
        nx.set_node_attributes(self.target_graph, updated_node_categories)
Beispiel #16
0
    def process_predicate(self, p: Optional[Union[URIRef, str]]) -> Tuple:
        """
        Resolve a predicate against the Biolink Model, caching the result per predicate.

        Parameters
        ----------
        p: Optional[Union[URIRef, str]]
            The predicate

        Returns
        -------
        Tuple
            A tuple that contains the Biolink CURIE (if available), the Biolink slot_uri CURIE (if available),
            the CURIE form of p, the reference of p

        """
        if p in self.cache:
            # already processed this predicate before; pull from cache
            cached = self.cache[p]
            return (
                cached['element_uri'],
                cached['canonical_uri'],
                cached['predicate'],
                cached['property_name'],
            )

        # haven't seen this property before; map to element
        predicate = (
            self.prefix_manager.contract(str(p))
            if self.prefix_manager.is_iri(p)
            else None
        )
        if self.prefix_manager.is_curie(p):
            property_name = self.prefix_manager.get_reference(p)
            predicate = p
        elif predicate and self.prefix_manager.is_curie(predicate):
            property_name = self.prefix_manager.get_reference(predicate)
        else:
            # neither p nor its contracted form is a CURIE; use p with an empty prefix
            property_name = p
            predicate = f":{p}"

        element = self.get_biolink_element(p)
        canonical_uri = None
        if element:
            if isinstance(element, SlotDefinition):
                # predicate corresponds to a biolink slot
                if element.definition_uri:
                    element_uri = self.prefix_manager.contract(element.definition_uri)
                else:
                    element_uri = f"biolink:{sentencecase_to_snakecase(element.name)}"
                if element.slot_uri:
                    canonical_uri = element.slot_uri
            elif isinstance(element, ClassDefinition):
                # this will happen only when the IRI is actually
                # a reference to a class
                element_uri = self.prefix_manager.contract(element.class_uri)
            else:
                element_uri = f"biolink:{sentencecase_to_camelcase(element.name)}"
            # elements with biolink:Attribute among their ancestors always get the snake_case form
            if 'biolink:Attribute' in get_biolink_ancestors(element.name):
                element_uri = f"biolink:{sentencecase_to_snakecase(element.name)}"
            if not predicate:
                predicate = element_uri
        else:
            # no mapping to biolink model;
            # look at predicate mappings
            element_uri = None
            if p in self.predicate_mapping:
                property_name = self.predicate_mapping[p]
                predicate = f":{property_name}"

        self.cache[p] = {
            'element_uri': element_uri,
            'canonical_uri': canonical_uri,
            'predicate': predicate,
            'property_name': property_name,
        }
        return element_uri, canonical_uri, predicate, property_name
Beispiel #17
0
def process_predicate(
    prefix_manager: PrefixManager,
    p: Union[URIRef, str],
    predicate_mapping: Optional[Dict] = None,
) -> Tuple:
    """
    Resolve a predicate against the Biolink Model, falling back to ``predicate_mapping``.

    Parameters
    ----------
    prefix_manager: PrefixManager
        An instance of prefix manager
    p: Union[URIRef, str]
        The predicate
    predicate_mapping: Optional[Dict]
        Predicate mappings, consulted only when ``p`` has no Biolink Model element

    Returns
    -------
    Tuple[str, str, str, str]
        A tuple that contains the Biolink CURIE (if available), the Biolink slot_uri CURIE (if available),
        the CURIE form of p, the reference of p

    """
    predicate = prefix_manager.contract(str(p)) if prefix_manager.is_iri(p) else None

    if prefix_manager.is_curie(p):
        property_name = prefix_manager.get_reference(p)
        predicate = p
    elif predicate and prefix_manager.is_curie(predicate):
        property_name = prefix_manager.get_reference(predicate)
    else:
        # neither p nor its contracted form is a CURIE; use p with an empty prefix
        property_name = p
        predicate = f":{p}"

    biolink_element = get_biolink_element(prefix_manager, p)
    canonical_uri = None
    if biolink_element:
        if isinstance(biolink_element, SlotDefinition):
            # predicate corresponds to a biolink slot
            if biolink_element.definition_uri:
                element_uri = prefix_manager.contract(biolink_element.definition_uri)
            else:
                element_uri = f"biolink:{sentencecase_to_snakecase(biolink_element.name)}"
            if biolink_element.slot_uri:
                canonical_uri = biolink_element.slot_uri
        elif isinstance(biolink_element, ClassDefinition):
            # this will happen only when the IRI is actually
            # a reference to a class
            element_uri = prefix_manager.contract(biolink_element.class_uri)
        else:
            element_uri = f"biolink:{sentencecase_to_camelcase(biolink_element.name)}"
        # elements with biolink:Attribute among their ancestors always get the snake_case form
        if "biolink:Attribute" in get_biolink_ancestors(biolink_element.name):
            element_uri = f"biolink:{sentencecase_to_snakecase(biolink_element.name)}"
        if not predicate:
            predicate = element_uri
    else:
        # no mapping to biolink model;
        # look at predicate mappings
        element_uri = None
        if predicate_mapping and p in predicate_mapping:
            property_name = predicate_mapping[p]
            predicate = f":{property_name}"

    return element_uri, canonical_uri, predicate, property_name