def test_similarity(hpo_graph, hpo_by_proband, probands, n_sims, score_type="resnik"):
    """ find if groups of probands per gene share HPO terms more than by chance.

    We simulate a distribution of similarity scores by randomly sampling
    groups of probands. The groups are the same size as the group being
    tested, and are drawn from the probands that are not in that group.

    I tried matching the number of sampled HPO terms to the numbers in the
    probands for the gene. For that, I gave each term the chance of being
    sampled as the rate at which it was observed in all the probands. However,
    these sampled terms gave aberrant QQ plots, with excessive numbers of
    extremely significant P values. I suspect this is due to underlying
    relationships between HPO terms.

    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands: list of proband IDs.
        n_sims: number of simulations to run.
        score_type: type of similarity metric to use
            ["resnik", "lin", "simGIC"]

    Returns:
        The probability that the HPO terms used in the probands match as well
        as they do. Returns None if fewer than two of the probands have HPO
        terms recorded, or if there are not enough other probands to sample a
        group of the same size from.
    """

    # Keep the proband IDs separate from their HPO terms. The sampling pool
    # has to be filtered on proband ID; filtering against the lists of HPO
    # terms would never exclude anybody.
    proband_ids = [x for x in probands if x in hpo_by_proband]
    group_ids = set(proband_ids)
    other_probands = [x for x in hpo_by_proband if x not in group_ids]
    probands = [hpo_by_proband[x] for x in proband_ids]

    # We can't test similarity from a single proband. We don't call this
    # function for genes with a single proband, however, sometimes only one of
    # the probands has HPO terms recorded. We cannot estimate the phenotypic
    # similarity between probands in this case, so return None instead.
    if len(probands) < 2:
        return None

    # random.sample() cannot draw more probands than the pool contains, so we
    # cannot build a null distribution in that case either.
    if len(other_probands) < len(probands):
        return None

    observed = get_proband_similarity(hpo_graph, probands, score_type)

    # get a distribution of scores for randomly sampled groups of probands
    distribution = []
    for _ in range(n_sims):
        sampled = random.sample(other_probands, len(probands))
        simulated = [hpo_by_proband[n] for n in sampled]
        predicted = get_proband_similarity(hpo_graph, simulated, score_type)
        distribution.append(predicted)

    distribution.sort()

    # figure out where in the distribution the observed value occurs. Every
    # simulated score from this position onwards is at least as large as the
    # observed score.
    pos = bisect.bisect_left(distribution, observed)
    n_as_extreme = len(distribution) - pos
    sim_prob = n_as_extreme / (1 + len(distribution))

    # never report a probability of zero; the smallest probability we can
    # resolve depends on the number of simulations
    if sim_prob == 0:
        sim_prob = 1 / (1 + len(distribution))

    return sim_prob
def test_similarity(hpo_graph, hpo_by_proband, probands, n_sims, score_type="resnik"):
    """ find if groups of probands per gene share HPO terms more than by chance.

    We simulate a distribution of similarity scores by randomly sampling
    groups of probands. The groups are the same size as the group being
    tested, and are drawn from the probands that are not in that group.

    I tried matching the number of sampled HPO terms to the numbers in the
    probands for the gene. For that, I gave each term the chance of being
    sampled as the rate at which it was observed in all the probands. However,
    these sampled terms gave aberrant QQ plots, with excessive numbers of
    extremely significant P values. I suspect this is due to underlying
    relationships between HPO terms.

    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands: list of proband IDs.
        n_sims: number of simulations to run.
        score_type: type of similarity metric to use
            ["resnik", "lin", "simGIC"]

    Returns:
        The probability that the HPO terms used in the probands match as well
        as they do. Returns None if fewer than two of the probands have HPO
        terms recorded, or if there are not enough other probands to sample a
        group of the same size from.
    """

    # Keep the proband IDs separate from their HPO terms. The sampling pool
    # has to be filtered on proband ID; filtering against the lists of HPO
    # terms would never exclude anybody.
    proband_ids = [x for x in probands if x in hpo_by_proband]
    group_ids = set(proband_ids)
    other_probands = [x for x in hpo_by_proband if x not in group_ids]
    probands = [hpo_by_proband[x] for x in proband_ids]

    # We can't test similarity from a single proband. We don't call this
    # function for genes with a single proband, however, sometimes only one of
    # the probands has HPO terms recorded. We cannot estimate the phenotypic
    # similarity between probands in this case, so return None instead.
    if len(probands) < 2:
        return None

    # random.sample() cannot draw more probands than the pool contains, so we
    # cannot build a null distribution in that case either.
    if len(other_probands) < len(probands):
        return None

    observed = get_proband_similarity(hpo_graph, probands, score_type)

    # get a distribution of scores for randomly sampled groups of probands
    distribution = []
    for _ in range(n_sims):
        sampled = random.sample(other_probands, len(probands))
        simulated = [hpo_by_proband[n] for n in sampled]
        predicted = get_proband_similarity(hpo_graph, simulated, score_type)
        distribution.append(predicted)

    distribution.sort()

    # figure out where in the distribution the observed value occurs. Every
    # simulated score from this position onwards is at least as large as the
    # observed score.
    pos = bisect.bisect_left(distribution, observed)
    n_as_extreme = len(distribution) - pos
    sim_prob = n_as_extreme / (1 + len(distribution))

    # never report a probability of zero; the smallest probability we can
    # resolve depends on the number of simulations
    if sim_prob == 0:
        sim_prob = 1 / (1 + len(distribution))

    return sim_prob
def test_get_proband_similarity(self):
    """ check that get_proband_similarity works correctly

    The resnik scores are compared with assertAlmostEqual, since the expected
    values are decimal literals with fewer than the 17 significant digits
    needed to identify a double exactly, so exact equality with a computed
    float is not guaranteed.
    """

    # check the default probands
    probands = list(self.hpo_terms.values())

    self.assertAlmostEqual(
        get_proband_similarity(self.hpo_graph, probands, "resnik"),
        0.916290731874155)
    self.assertEqual(
        get_proband_similarity(self.hpo_graph, probands, "simGIC"),
        1.0)

    # add another proband, who has a rare term that matches a term in
    # another proband
    probands.append(["HP:0000924", "HP:0000118"])

    self.assertAlmostEqual(
        get_proband_similarity(self.hpo_graph, probands, "resnik"),
        2.525728644308255)
    self.assertEqual(
        get_proband_similarity(self.hpo_graph, probands, "simGIC"),
        3.0)
def test_get_proband_similarity(self):
    """ check that get_proband_similarity works correctly

    The resnik scores are compared with assertAlmostEqual. The expected
    values are floating-point expressions (logarithms and their sum), and the
    last bits of such a result depend on how it was computed, so exact
    equality with the function's output is not guaranteed.
    """

    # check the default probands
    probands = list(self.hpo_terms.values())

    self.assertAlmostEqual(
        get_proband_similarity(self.hpo_graph, probands, "resnik"),
        -math.log(2/3.0))
    self.assertEqual(
        get_proband_similarity(self.hpo_graph, probands, "simGIC"),
        1.0)

    # add another proband, who has a rare term that matches a term in
    # another proband
    probands.append(["HP:0000924", "HP:0000118"])

    self.assertAlmostEqual(
        get_proband_similarity(self.hpo_graph, probands, "resnik"),
        -math.log(2/3.0) + -math.log(1/3.0))
    self.assertEqual(
        get_proband_similarity(self.hpo_graph, probands, "simGIC"),
        3.0)