コード例 #1
0
def test_similarity(hpo_graph,
                    hpo_by_proband,
                    probands,
                    n_sims,
                    score_type="resnik"):
    """ find if groups of probands per gene share HPO terms more than by chance.
    
    We simulate a distribution of similarity scores by randomly sampling groups
    of probands. I tried matching the number of sampled HPO terms to the numbers
    in the probands for the gene. For that, I gave each term the chance of being
    sampled as the rate at which it was observed in all the probands. However,
    these sampled terms gave aberrant QQ plots, with excessive numbers of
    extremely significant P values. I suspect this is due to underlying
    relationships between HPO terms.
    
    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands: list of proband IDs.
        n_sims: number of simulations to run.
        score_type: type of similarity metric to use ["resnik", "lin", "simGIC"]
    
    Returns:
        None if fewer than two of the given probands have HPO terms recorded.
        Otherwise the proportion of simulated scores that are at least as high
        as the observed score, computed as count / (n_sims + 1) and never lower
        than 1 / (n_sims + 1).
    
    Raises:
        ValueError: raised by random.sample if fewer probands remain outside
            the tested group than the size of the group itself.
    """

    # Keep the proband IDs separately from their HPO terms. Previously the IDs
    # were overwritten by the term lists before building the sampling pool, so
    # the "x not in probands" check compared IDs against term lists, never
    # matched, and the tested probands leaked into the simulated groups.
    group_ids = [x for x in probands if x in hpo_by_proband]
    group_terms = [hpo_by_proband[x] for x in group_ids]

    # We can't test similarity from a single proband. We don't call this
    # function for genes with a single proband, however, sometimes only one of
    # the probands has HPO terms recorded. We cannot estimate the phenotypic
    # similarity between probands in this case, so return None instead.
    if len(group_terms) < 2:
        return None

    # the sampling pool is every proband with HPO terms who is not in the group
    excluded = set(group_ids)
    other_probands = [x for x in hpo_by_proband if x not in excluded]

    observed = get_proband_similarity(hpo_graph, group_terms, score_type)

    # get a distribution of scores for randomly sampled groups of probands,
    # each the same size as the observed group
    distribution = []
    for _ in range(n_sims):
        sampled = random.sample(other_probands, len(group_terms))
        simulated = [hpo_by_proband[n] for n in sampled]
        predicted = get_proband_similarity(hpo_graph, simulated, score_type)
        distribution.append(predicted)

    distribution.sort()

    # figure out where in the distribution the observed value occurs.
    # bisect_left gives the index of the first simulated score >= observed, so
    # everything from that index onwards scored at least as high as observed.
    pos = bisect.bisect_left(distribution, observed)
    n_as_extreme = len(distribution) - pos
    sim_prob = n_as_extreme / (1 + len(distribution))

    # never report a probability of zero from a finite number of simulations
    if sim_prob == 0:
        sim_prob = 1 / (1 + len(distribution))

    return sim_prob
コード例 #2
0
def test_similarity(hpo_graph, hpo_by_proband, probands, n_sims, score_type="resnik"):
    """ find if groups of probands per gene share HPO terms more than by chance.
    
    We simulate a distribution of similarity scores by randomly sampling groups
    of probands. I tried matching the number of sampled HPO terms to the numbers
    in the probands for the gene. For that, I gave each term the chance of being
    sampled as the rate at which it was observed in all the probands. However,
    these sampled terms gave aberrant QQ plots, with excessive numbers of
    extremely significant P values. I suspect this is due to underlying
    relationships between HPO terms.
    
    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands: list of proband IDs.
        n_sims: number of simulations to run.
        score_type: type of similarity metric to use ["resnik", "lin", "simGIC"]
    
    Returns:
        None if fewer than two of the given probands have HPO terms recorded.
        Otherwise the proportion of simulated scores that are at least as high
        as the observed score, computed as count / (n_sims + 1) and never lower
        than 1 / (n_sims + 1).
    
    Raises:
        ValueError: raised by random.sample if fewer probands remain outside
            the tested group than the size of the group itself.
    """
    
    # Restrict to probands that have HPO terms, and hold on to their IDs. The
    # IDs used to be replaced by the term lists before the pool of other
    # probands was built, which made the exclusion check a no-op and let the
    # tested probands be drawn into the simulated groups.
    tested_ids = [x for x in probands if x in hpo_by_proband]
    tested_terms = [hpo_by_proband[x] for x in tested_ids]
    
    # We can't test similarity from a single proband. We don't call this
    # function for genes with a single proband, however, sometimes only one of
    # the probands has HPO terms recorded. We cannot estimate the phenotypic
    # similarity between probands in this case, so return None instead.
    if len(tested_terms) < 2:
        return None
    
    # everyone with HPO terms who is not part of the tested group
    tested_id_set = set(tested_ids)
    other_probands = [x for x in hpo_by_proband if x not in tested_id_set]
    
    observed = get_proband_similarity(hpo_graph, tested_terms, score_type)
    
    # get a distribution of scores for randomly sampled groups of probands,
    # each the same size as the tested group
    group_size = len(tested_terms)
    distribution = []
    for _ in range(n_sims):
        sampled = random.sample(other_probands, group_size)
        simulated = [hpo_by_proband[n] for n in sampled]
        distribution.append(get_proband_similarity(hpo_graph, simulated, score_type))
    
    distribution.sort()
    
    # figure out where in the distribution the observed value occurs. The
    # index from bisect_left is the first simulated score >= observed, so the
    # tail from there to the end counts the simulations at least as extreme.
    n_total = len(distribution)
    pos = bisect.bisect_left(distribution, observed)
    sim_prob = (n_total - pos) / (1 + n_total)
    
    # never report a probability of zero from a finite number of simulations
    if sim_prob == 0:
        sim_prob = 1 / (1 + n_total)
    
    return sim_prob
コード例 #3
0
 def test_get_proband_similarity(self):
     """ check that get_proband_similarity works correctly
     
     The expected scores are specific to the hpo_graph and hpo_terms held on
     the test case, which are set up outside this method.
     """
     
     # The scores are floats, so compare with assertAlmostEqual rather than
     # exact equality: a last-digit rounding difference should not fail the
     # test.
     
     # check the default probands
     probands = list(self.hpo_terms.values())
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "resnik"), 0.916290731874155)
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "simGIC"), 1.0)
     
     # add another proband, who has a rare term that matches a term in
     # another proband
     probands.append(["HP:0000924", "HP:0000118"])
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "resnik"), 2.525728644308255)
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "simGIC"), 3.0)
コード例 #4
0
 def test_get_proband_similarity(self):
     """ check that get_proband_similarity works correctly
     
     The expected scores are specific to the hpo_graph and hpo_terms held on
     the test case, which are set up outside this method.
     """
     
     # The expected resnik scores are built from math.log, and the value under
     # test is a float too, so compare with assertAlmostEqual rather than exact
     # equality: a last-digit rounding difference should not fail the test.
     
     # check the default probands
     probands = list(self.hpo_terms.values())
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "resnik"), -math.log(2/3.0))
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "simGIC"), 1.0)
     
     # add another proband, who has a rare term that matches a term in
     # another proband
     probands.append(["HP:0000924", "HP:0000118"])
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "resnik"),
         -math.log(2/3.0) + -math.log(1/3.0))
     self.assertAlmostEqual(get_proband_similarity(self.hpo_graph, probands, "simGIC"), 3.0)