def euclidean_distance(
            self,
            profile_a: Iterable[str],
            profile_b: Iterable[str],
            predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
        """
        Groupwise euclidean distance

        Each profile is turned into a vector of information content (IC)
        values.  The vector has one position per phenotype in the union of
        the two profile closures (as returned by
        owl_utils.get_profile_closure); a position holds the phenotype's
        IC from self.ic_map when the phenotype is in that profile's
        closure, and 0 otherwise.  The result is the norm of the
        difference of the two vectors.

        This is roughly analogous to, but not the inverse of, simGIC
        """
        # Phenotypes prefixed with "-" are negated and take no part here
        positives_a = {term for term in profile_a if not term.startswith("-")}
        positives_b = {term for term in profile_b if not term.startswith("-")}

        closure_a = owl_utils.get_profile_closure(
            positives_a, self.graph, self.root, predicate)
        closure_b = owl_utils.get_profile_closure(
            positives_b, self.graph, self.root, predicate)

        # Both vectors are built by iterating this same, unmodified set,
        # so their positions line up
        shared_axes = closure_a.union(closure_b)

        def ic_vector(closure):
            # IC for members of the closure, 0 for everything else
            return np.array([
                self.ic_map[term] if term in closure else 0
                for term in shared_axes
            ])

        difference = ic_vector(closure_a) - ic_vector(closure_b)
        return np.linalg.norm(difference)
Beispiel #2
0
    def groupwise_jaccard(
            self,
            profiles: Iterable[Iterable[str]],
            predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
        """
        Groupwise jaccard similarity over any number of profiles

        The number of classes found in the closure of every profile
        divided by the number of classes found in the closure of at
        least one profile.

        Assumes no negative phenotypes: profiles are handed to
        owl_utils.get_profile_closure unfiltered.

        Returns 0.0 when no profiles are given or when every closure is
        empty (there is nothing to compare).
        """
        union = set()
        intersection = None  # stays None until the first profile is seen

        for profile in profiles:
            # Compute the closure once per profile: it is needed for both
            # the union and the intersection, and a profile may be a
            # one-shot iterable that cannot be traversed twice.
            closure = set(owl_utils.get_profile_closure(
                profile, self.graph, self.root, predicate))
            union |= closure
            if intersection is None:
                intersection = closure
            else:
                intersection = intersection & closure

        if not union:
            # Also covers "no profiles at all", where intersection is None
            return 0.0

        return len(intersection) / len(union)
Beispiel #3
0
    def groupwise_sim_gic(
            self,
            profiles: Iterable[Iterable[str]],
            predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
        """
        Groupwise simGIC over any number of profiles

        Summed information content (from self.ic_map) of the classes
        found in the closure of every profile, divided by the summed
        information content of the classes found in the closure of at
        least one profile.

        Assumes no negative phenotypes: profiles are handed to
        owl_utils.get_profile_closure unfiltered.

        Returns 0.0 when no profiles are given, when every closure is
        empty, or when the summed information content of the union is 0.
        """
        union = set()
        intersection = None  # stays None until the first profile is seen

        for profile in profiles:
            # Compute the closure once per profile: it is needed for both
            # the union and the intersection, and a profile may be a
            # one-shot iterable that cannot be traversed twice.
            closure = set(owl_utils.get_profile_closure(
                profile, self.graph, self.root, predicate))
            union |= closure
            if intersection is None:
                intersection = closure
            else:
                intersection = intersection & closure

        if not union:
            # Also covers "no profiles at all", where intersection is None
            return 0.0

        # sum() copes with an empty intersection, unlike reduce() without
        # an initial value
        numerator = sum(self.ic_map[pheno] for pheno in intersection)
        denominator = sum(self.ic_map[pheno] for pheno in union)

        if denominator == 0:
            # Every class in the union carries zero information content
            return 0.0

        return numerator / denominator
Beispiel #4
0
    def sim_gic(self,
                profile_a: Iterable[str],
                profile_b: Iterable[str],
                predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
        """
        Groupwise resnik similarity (simGIC):
        Summed information content of common ancestors divided by summed
        information content of all ancestors in profile a and profile b
        https://bmcbioinformatics.biomedcentral.com/track/pdf/10.1186/1471-2105-9-S5-S4

        Phenotypes prefixed with '-' (negated) are ignored.

        Returns 0.0 when the summed information content of the union of
        the two closures is 0 (including when both closures are empty).
        """
        # Filter out negative phenotypes
        profile_a = {pheno for pheno in profile_a if not pheno.startswith("-")}
        profile_b = {pheno for pheno in profile_b if not pheno.startswith("-")}

        a_closure = owl_utils.get_profile_closure(profile_a, self.graph,
                                                  self.root, predicate)
        b_closure = owl_utils.get_profile_closure(profile_b, self.graph,
                                                  self.root, predicate)

        # sum() yields 0 for an empty intersection/union, where reduce()
        # without an initial value would raise TypeError
        numerator = sum(
            self.ic_map[pheno] for pheno in a_closure.intersection(b_closure))
        denominator = sum(
            self.ic_map[pheno] for pheno in a_closure.union(b_closure))

        if denominator == 0:
            # Nothing informative in either closure; avoid dividing by zero
            return 0.0

        return numerator / denominator
Beispiel #5
0
    def jaccard_sim(self,
                    profile_a: Iterable[str],
                    profile_b: Iterable[str],
                    predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
        """
        Groupwise jaccard similarity

        Negative phenotypes must be prefixed with a '-'; they are dropped
        before the closure of each profile is computed.  The two closures
        are then compared with metric.jaccard.
        """
        # Keep only the positive (non "-" prefixed) phenotypes
        positives_a = {term for term in profile_a if not term.startswith("-")}
        positives_b = {term for term in profile_b if not term.startswith("-")}

        # Closure of profile a first, then profile b
        closure_a, closure_b = (
            owl_utils.get_profile_closure(positives, self.graph, self.root,
                                          predicate)
            for positives in (positives_a, positives_b)
        )

        return metric.jaccard(closure_a, closure_b)
Beispiel #6
0
    def cosine_sim(self,
                   profile_a: Iterable[str],
                   profile_b: Iterable[str],
                   ic_weighted: Optional[bool] = False,
                   negative_weight: Optional[Num] = 1,
                   predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
        """
        Cosine similarity
        Profiles are treated as vectors of numbers between 0-1:
        1: Phenotype present
        0: Absent (no information)
        1 * negative weight: Negated phenotypes

        if ic_weighted is true the attributes become vectors
        of information content scores (looked up in self.ic_map)

        Negated phenotypes are those prefixed with '-'.  Closures are
        computed with owl_utils.get_profile_closure, passing negative=True
        for the negated phenotypes; per the original documentation,
        inferred phenotypes are parent classes for positive phenotypes and
        child classes for negative phenotypes.  Typically we do not want to
        weight negative phenotypes as high as positive phenotypes.  A weight
        between .01-.1 may be desirable

        Returns 0.0 when either profile has a zero-length vector (for
        example an empty profile), since the cosine is undefined there.
        """
        def score(term):
            # NOTE(review): negated terms reach this function with their
            # "-" prefix re-attached, so with ic_weighted=True they are
            # looked up in ic_map under the prefixed key -- confirm that
            # ic_map actually contains such keys.
            return self.ic_map[term] if ic_weighted else 1

        def closures(profile):
            # Materialise first: the profile is scanned twice (positive
            # and negated terms) and may be a one-shot iterable.
            terms = list(profile)
            positives = {item for item in terms if not item.startswith('-')}
            negatives = {item[1:] for item in terms if item.startswith('-')}

            pos_closure = owl_utils.get_profile_closure(
                positives, self.graph, self.root, predicate)
            # Re-attach the "-" so negated classes stay distinct from
            # positive ones
            neg_closure = {
                "-{}".format(item)
                for item in owl_utils.get_profile_closure(
                    negatives, self.graph, self.root, predicate,
                    negative=True)
            }
            return pos_closure, neg_closure

        def squared_weight(pos_terms, neg_terms):
            # Sum of squared vector components over the given classes
            positive_part = sum(
                math.pow(score(item), 2) for item in pos_terms)
            negative_part = sum(
                math.pow(score(item) * negative_weight, 2)
                for item in neg_terms)
            return positive_part + negative_part

        pos_a_closure, neg_a_closure = closures(profile_a)
        pos_b_closure, neg_b_closure = closures(profile_b)

        # Dot product: only classes present in both profiles contribute
        numerator = squared_weight(
            pos_a_closure.intersection(pos_b_closure),
            neg_a_closure.intersection(neg_b_closure))

        a_norm = math.sqrt(squared_weight(pos_a_closure, neg_a_closure))
        b_norm = math.sqrt(squared_weight(pos_b_closure, neg_b_closure))
        denominator = a_norm * b_norm

        if denominator == 0:
            # At least one profile is a zero vector; avoid dividing by zero
            return 0.0

        return numerator / denominator