Beispiel #1
0
    def get_the_most_specific_category(self, categories: list) -> Tuple[str, list]:
        """
        Pick the most specific category out of a list of categories.

        Every category is looked up through ``self.toolkit``. For each one that
        resolves to an element, the ancestors of that element's name are fetched.
        The category with the strictly longest ancestor list wins; on a tie the
        one seen first in ``categories`` is kept.

        Parameters
        ----------
        categories: list
            A list of categories

        Returns
        -------
        tuple[str, list]
            The winning category (exactly as it appears in ``categories``) and its
            list of ancestors; ``(None, [])`` when no category yields a non-empty
            ancestor list

        """
        # TODO: could be integrated into update_categories method
        best_category, best_ancestors = None, []
        for candidate in categories:
            logging.debug("category: {}".format(candidate))
            # The sentence-case form is only logged; the lookup below uses the raw value.
            logging.debug("formatted_category: {}".format(snakecase_to_sentencecase(candidate)))
            element = self.toolkit.get_element(candidate)
            if not element:
                continue
            # candidate exists in BioLink Model as a class or as an alias to a class
            lineage = self.toolkit.ancestors(element['name'])
            logging.debug("ancestors: {}".format(lineage))
            if len(best_ancestors) < len(lineage):
                # a longer ancestor list means a more specific category
                best_category, best_ancestors = candidate, lineage
        return best_category, best_ancestors
Beispiel #2
0
    def validate_edge_predicate(
            self,
            subject: str,
            object: str,
            data: dict,
            toolkit: Optional[Toolkit] = None
    ):
        """
        Validate the ``predicate`` field of a given edge.

        At most one problem is reported per call, through ``self.log_error``,
        keyed by ``"<subject>-><object>"`` with error type
        ``ErrorType.INVALID_EDGE_PREDICATE`` and level ``MessageLevel.ERROR``.
        The checks run in this order:

        1. ``data`` has a ``predicate`` entry;
        2. the value is a ``str``;
        3. the value (reduced to its reference part when it is a CURIE) matches
           the snake_case pattern;
        4. the sentence-case form of the value resolves to an element in the toolkit;
        5. the value is not merely an alias of the resolved element.

        Parameters
        ----------
        subject: str
            Subject identifier
        object: str
            Object identifier
        data: dict
            Edge properties
        toolkit: Optional[Toolkit]
            Optional externally provided toolkit (default: use Validator class defined toolkit)

        """
        if not toolkit:
            toolkit = Validator.get_toolkit()

        error_type = ErrorType.INVALID_EDGE_PREDICATE
        edge_predicate = data.get("predicate")

        # Work out which (if any) problem applies, then report it once below.
        message = None
        if edge_predicate is None:
            message = "Edge does not have a 'predicate' property"
        elif not isinstance(edge_predicate, str):
            # The property is stored under the key 'predicate', so name it that way.
            message = "Edge property 'predicate' is expected to be of type 'string'"
        else:
            if PrefixManager.is_curie(edge_predicate):
                edge_predicate = PrefixManager.get_reference(edge_predicate)
            # NOTE(review): nested quantifiers in this pattern may backtrack heavily on
            # long non-matching input; pattern kept unchanged to preserve what it accepts.
            if re.match(r"^([a-z_][^A-Z\s]+_?[a-z_][^A-Z\s]+)+$", edge_predicate):
                p = toolkit.get_element(snakecase_to_sentencecase(edge_predicate))
                if p is None:
                    message = f"Edge predicate '{edge_predicate}' is not in Biolink Model"
                elif edge_predicate != p.name and edge_predicate in p.aliases:
                    message = f"Edge predicate '{edge_predicate}' is actually an alias for {p.name}; " + \
                              f"Should replace {edge_predicate} with {p.name}"
            else:
                message = f"Edge predicate '{edge_predicate}' is not in snake_case form"

        if message is not None:
            self.log_error(f"{subject}->{object}", error_type, message, MessageLevel.ERROR)
Beispiel #3
0
    def validate_edge_predicate(subject: str, object: str, data: dict) -> list:
        """
        Validate the ``predicate`` field of a given edge.

        The checks run in order and stop at the first failure, so the returned
        list holds at most one ``ValidationError``:

        1. ``data`` has a ``predicate`` entry;
        2. the value is a ``str``;
        3. the value (reduced to its reference part when it is a CURIE) matches
           the snake_case pattern;
        4. the sentence-case form of the value resolves to an element in the toolkit;
        5. the value is not merely an alias of the resolved element.

        Parameters
        ----------
        subject: str
            Subject identifier
        object: str
            Object identifier
        data: dict
            Edge properties

        Returns
        -------
        list
            A list of errors for a given edge (empty when the predicate is valid)

        """
        toolkit = get_toolkit()
        error_type = ErrorType.INVALID_EDGE_PREDICATE
        edge_predicate = data.get('predicate')

        # Work out which (if any) problem applies, then build the error once below.
        message = None
        if edge_predicate is None:
            message = "Edge does not have a 'predicate' property"
        elif not isinstance(edge_predicate, str):
            # The property is stored under the key 'predicate', so name it that way.
            message = "Edge property 'predicate' expected to be of type 'string'"
        else:
            if PrefixManager.is_curie(edge_predicate):
                edge_predicate = PrefixManager.get_reference(edge_predicate)
            # NOTE(review): nested quantifiers in this pattern may backtrack heavily on
            # long non-matching input; pattern kept unchanged to preserve what it accepts.
            if re.match(r"^([a-z_][^A-Z\s]+_?[a-z_][^A-Z\s]+)+$", edge_predicate):
                p = toolkit.get_element(snakecase_to_sentencecase(edge_predicate))
                if p is None:
                    message = f"Edge label '{edge_predicate}' not in Biolink Model"
                elif edge_predicate != p.name and edge_predicate in p.aliases:
                    message = f"Edge label '{edge_predicate}' is actually an alias for {p.name}; Should replace {edge_predicate} with {p.name}"
            else:
                message = f"Edge label '{edge_predicate}' is not in snake_case form"

        if message is None:
            return []
        return [
            ValidationError(f"{subject}-{object}", error_type, message, MessageLevel.ERROR)
        ]
Beispiel #4
0
def test_snakecase_to_sentencecase():
    """
    Check that ``snakecase_to_sentencecase`` turns 'named_thing' into 'named thing'.
    """
    assert snakecase_to_sentencecase('named_thing') == 'named thing'
Beispiel #5
0
def test_snakecase_to_sentencecase():
    """
    Test that a snake_case string is converted to sentence case:
    'named_thing' is expected to become 'named thing'.
    """
    expected = 'named thing'
    converted = snakecase_to_sentencecase('named_thing')
    assert converted == expected
Beispiel #6
0
    def update_categories(self, clique: list):
        """
        For a given clique, get category for each node in clique and validate against BioLink Model,
        mapping to BioLink Model category where needed.

        Ex.: If a node has `gene` as its category, then this method adds all of its ancestors.

        For every node the categories are read from the node's ``category``
        attribute in ``self.clique_graph`` or, when that is absent, from
        ``self.get_category_from_equivalence``. The longest ancestor list found
        among the resolvable categories becomes the node's new ``category``
        (snake-cased and lower-cased). Categories that cannot be resolved, or
        that fall outside that ancestor list and have no ``MAPPING`` entry
        inside it, are recorded under ``_invalid_category``. The resulting
        attributes are written to both ``self.clique_graph`` and
        ``self.target_graph``.

        Parameters
        ----------
        clique: list
            A list of nodes from a clique

        """
        updated_node_categories = {}
        for node in clique:
            data = self.clique_graph.nodes[node]
            # Diagnostic output goes through logging rather than stdout.
            logging.debug("Node data: {}".format(data))
            if 'category' in data:
                categories = data['category']
            else:
                # get category from equivalence
                categories = self.get_category_from_equivalence(node, data)

            extended_categories = set()
            invalid_categories = []
            for category in categories:
                # TODO: this sentence case conversion needs to be handled properly
                category = snakecase_to_sentencecase(category).lower()
                logging.debug("Looking at category: {}".format(category))
                element = self.toolkit.get_element(category)
                if element:
                    # category exists in BioLink Model as a class or as an alias to a class
                    mapped_category = element['name']
                    ancestors = self.toolkit.ancestors(mapped_category)
                    if len(ancestors) > len(extended_categories):
                        # the category with the longest list of ancestors will be the most specific category
                        logging.debug("Ancestors for {} is larger than previous one".format(mapped_category))
                        extended_categories = ancestors
                else:
                    logging.warning("[1] category '{}' not in BioLink Model".format(category))
                    invalid_categories.append(category)
            logging.debug("Invalid categories: {}".format(invalid_categories))
            extended_categories = [stringcase.snakecase(x).lower() for x in extended_categories]

            # Second pass: flag original categories that fall outside the chosen ancestor closure.
            # Note this lookup uses the category as given, not the sentence-case form used above.
            for x in categories:
                element = self.toolkit.get_element(x)
                if element is None:
                    logging.warning("[2] category '{}' is not in BioLink Model".format(x))
                    continue
                mapped_category = element['name']
                mapped_snakecase = stringcase.snakecase(mapped_category).lower()
                if mapped_snakecase not in extended_categories:
                    logging.warning("category '{}' not in ancestor closure: {}".format(mapped_snakecase, extended_categories))
                    # fall back to the custom mapping before declaring the category invalid
                    mapped = MAPPING[x] if x in MAPPING else x
                    if mapped not in extended_categories:
                        logging.warning("category '{}' is not even in any custom defined mapping. ".format(mapped_category))
                        invalid_categories.append(x)

            update_dict = {'category': extended_categories}
            if invalid_categories:
                update_dict['_invalid_category'] = invalid_categories
            updated_node_categories[node] = update_dict
        logging.debug("Updating nodes in clique with: {}".format(updated_node_categories))
        nx.set_node_attributes(self.clique_graph, updated_node_categories)
        nx.set_node_attributes(self.target_graph, updated_node_categories)