Example #1
0
    def test_test_similarity(self):
        """ check the P-value estimates returned by test_similarity
        """

        n_sims = 1000
        score_type = "resnik"

        # First case: a pair of probands with relatively rare terms. The same
        # person is picked twice, which is fine for the purposes of the
        # example. The reference population only holds three individuals, so
        # the chance of drawing two individuals with HPO similarity as rare as
        # this pair is 1 in three, and the estimate should land near 0.33.
        rare_pair = ["person_03", "person_03"]
        rare_p = test_similarity(self.hpo_graph, self.hpo_terms, rare_pair,
            n_sims=n_sims, score_type=score_type)
        rare_deviation = abs(rare_p - 0.33)
        self.assertLess(rare_deviation, 0.04)

        # Second case: a pair of probands who do not share terms, so the
        # chance that two random probands share their terms to the same
        # extent is effectively 1. The maximum P-value is currently
        # n/(n + 1), where n is the number of iterations; n + 1 is used
        # because the observed similarity should be in the simulated
        # distribution under the null hypothesis.
        unrelated_pair = ["person_01", "person_03"]
        unrelated_p = test_similarity(self.hpo_graph, self.hpo_terms,
            unrelated_pair, n_sims=n_sims, score_type=score_type)
        unrelated_deviation = abs(unrelated_p - 0.999)
        self.assertLess(unrelated_deviation, 0.03)
Example #2
0
def analyse_genes(hpo_graph, hpo_by_proband, probands_by_gene, output_path, iterations, score_type):
    """ tests genes to see if their probands share HPO terms more than by chance.

    Writes a tab-separated table with the header "hgnc" and
    "hpo_similarity_p_value", followed by one line per gene (in sorted gene
    order). Genes with fewer than two probands, and genes for which
    test_similarity returns None, are left out of the table.

    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands_by_gene: dictionary of genes, to the probands who have variants
            in those genes.
        output_path: path to file to write the results to, or sys.stdout object.
            A path is opened for writing and closed before returning. An
            already-open stream is flushed but left open, since it belongs
            to the caller.
        iterations: number of iterations to run.
        score_type: scoring method, passed through unchanged to
            test_similarity.
    """

    # Sometimes output_path is actually sys.stdout, other times it is a path.
    # open() raises TypeError for a stream object, which tells the two apart.
    try:
        output = open(output_path, "w")
    except TypeError:
        output = output_path
        opened_here = False
    else:
        opened_here = True

    try:
        output.write("hgnc\thpo_similarity_p_value\n")

        for gene in sorted(probands_by_gene):
            probands = probands_by_gene[gene]

            # similarity can only be assessed between two or more probands
            if len(probands) < 2:
                continue

            p_value = test_similarity(hpo_graph, hpo_by_proband, probands, iterations, score_type)
            if p_value is None:
                continue

            output.write("{0}\t{1}\n".format(gene, p_value))
    finally:
        # Only close what we opened: closing a caller-supplied stream such as
        # sys.stdout would break every later write to it. The finally clause
        # also makes sure the file is released if test_similarity raises.
        if opened_here:
            output.close()
        else:
            output.flush()
 def test_test_similarity(self):
     """ check that test_similarity gives the expected P-value estimates
     """

     # Each case is (probands, expected P-value, allowed deviation).
     cases = [
         # Two probands with relatively rare terms. The same person is
         # selected twice, but the example still works. The reference
         # population only has three individuals, so the chance of selecting
         # two individuals with HPO similarity as rare as these two is 1 in
         # three, and the estimate should be relatively close to 0.33.
         (["person_03", "person_03"], 0.33, 0.04),
         # Two individuals who do not share terms, so the chance that two
         # random probands share their terms to the same extent is
         # effectively 1. The maximum P-value is currently n/(n + 1), where
         # n is the number of iterations. n + 1 is used since the observed
         # similarity should be in the simulated distribution under the
         # null hypothesis.
         (["person_01", "person_03"], 0.999, 0.03),
     ]

     for probands, expected, tolerance in cases:
         p = test_similarity(self.hpo_graph, self.hpo_terms, probands, n_sims=1000, score_type="resnik")
         self.assertLess(abs(p - expected), tolerance)
Example #4
0
def analyse_genes(hpo_graph, hpo_by_proband, probands_by_gene, output_path,
                  iterations, score_type):
    """ tests genes to see if their probands share HPO terms more than by chance.

    Writes a tab-separated table with the header "hgnc" and
    "hpo_similarity_p_value", followed by one line per gene (in sorted gene
    order). Genes with fewer than two probands, and genes for which
    test_similarity returns None, are left out of the table.

    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands_by_gene: dictionary of genes, to the probands who have variants
            in those genes.
        output_path: path to file to write the results to, or sys.stdout object.
            A path is opened for writing and closed before returning. An
            already-open stream is flushed but left open, since it belongs
            to the caller.
        iterations: number of iterations to run.
        score_type: scoring method, passed through unchanged to
            test_similarity.
    """

    # Runs before any output is opened, so a failure here leaves no
    # partially written file behind.
    check_terms_in_graph(hpo_graph, hpo_by_proband)

    # Sometimes output_path is actually sys.stdout, other times it is a path.
    # open() raises TypeError for a stream object, which tells the two apart.
    try:
        output = open(output_path, "w")
    except TypeError:
        output = output_path
        opened_here = False
    else:
        opened_here = True

    try:
        output.write("hgnc\thpo_similarity_p_value\n")

        for gene in sorted(probands_by_gene):
            probands = probands_by_gene[gene]

            # similarity can only be assessed between two or more probands
            if len(probands) < 2:
                continue

            p_value = test_similarity(hpo_graph, hpo_by_proband, probands,
                                      iterations, score_type)
            if p_value is None:
                continue

            output.write("{0}\t{1}\n".format(gene, p_value))
    finally:
        # Only close what we opened: closing a caller-supplied stream such as
        # sys.stdout would break every later write to it. The finally clause
        # also makes sure the file is released if test_similarity raises.
        if opened_here:
            output.close()
        else:
            output.flush()