コード例 #1
0
ファイル: hypergeometric.py プロジェクト: idekerlab/nexo2
 def __init__(self):
     """Create the instance and attach a newly built ``TermMapping`` to it."""
     mapper = TermMapping()
     self._mapper = mapper
コード例 #2
0
 def __init__(self):
     """Build a ``TermMapping`` and keep it on the instance as ``_mapper``."""
     term_mapping = TermMapping()
     self._mapper = term_mapping
コード例 #3
0
ファイル: hypergeometric.py プロジェクト: idekerlab/nexo2
class HypergeometricTest:
    """Hypergeometric over-representation test of a gene list against terms.

    The gene/term annotation data is obtained from a ``TermMapping`` object
    (defined elsewhere in the project) through its ``get_gene_count``,
    ``get_term_mapping`` and ``get_gene_mapping`` methods.
    """

    _mapper = None

    def __init__(self):
        self._mapper = TermMapping()

    def perform_test(self, ontology_type, genes_of_interest, cutoff=None):
        """Test every term hit by ``genes_of_interest`` for enrichment.

        Args:
            ontology_type: Key handed unchanged to the mapper to select the
                annotation set.
            genes_of_interest: Iterable of gene identifiers. Identifiers that
                are not keys of the mapper's gene mapping are dropped and
                duplicates are collapsed.
            cutoff: Upper bound (exclusive) on the Bonferroni-corrected
                p-value of a reported term. ``None`` selects the module-level
                ``p_val_threshold``.

        Returns:
            A dict with ``'results'`` (list of per-term dicts holding ``'id'``,
            ``'p-value'`` (Bonferroni-corrected), ``'background'``, ``'genes'``
            and ``'q-value'``) and ``'total_genes'`` (the mapper's gene count).
            Only terms with corrected p-value below ``cutoff`` and at least
            ``gene_threshold`` (module-level) sampled genes are reported.
        """
        if cutoff is None:
            cutoff = p_val_threshold

        total_num_genes = self._mapper.get_gene_count(ontology_type)
        term2genes = self._mapper.get_term_mapping(ontology_type)
        gene2terms = self._mapper.get_gene_mapping(ontology_type)

        filtered = self.__filter_input(genes_of_interest, gene2terms)
        sample_map = self.__calculateTermFrequency(filtered, gene2terms)

        # Sample size: number of distinct, annotated input genes.
        n = len(filtered)

        # Number of tests performed: used for the Bonferroni correction.
        num_tests = len(sample_map)
        if num_tests == 0:
            # Nothing to test; avoid handing an empty array to the q-value
            # estimator.
            return {
                'results': [],
                'total_genes': total_num_genes
            }

        results = []
        pvals = np.zeros(num_tests)

        for idx, (term, sampled) in enumerate(sample_map.items()):
            k = len(sampled)
            m = len(term2genes[term])

            p = self._enrichment_pvalue(k, total_num_genes, m, n)
            pvals[idx] = p

            results.append({
                'id': term,
                # A corrected probability is capped at 1.
                'p-value': min(p * num_tests, 1.0),
                'background': m,
                'genes': list(sampled),
            })

        # The q-value estimator is fed the raw (uncorrected) p-values, moved
        # strictly inside (0, 1).
        qvals = qvalue.estimate(self._clip_pvalues(pvals))

        filtered_results = []
        for res, qv in zip(results, qvals):
            res['q-value'] = qv
            if res['p-value'] < cutoff and len(res['genes']) >= gene_threshold:
                filtered_results.append(res)

        return {
            'results': filtered_results,
            'total_genes': total_num_genes
        }

    @staticmethod
    def _enrichment_pvalue(k, total, m, n):
        """Return P(X >= k) for X ~ Hypergeom(total, m, n).

        ``total`` is the population size, ``m`` the number of population
        genes annotated with the term and ``n`` the sample size. The survival
        function at ``k - 1`` gives the upper tail including ``k`` itself;
        the point mass ``pmf(k)`` alone is not a p-value.
        """
        return stats.hypergeom.sf(k - 1, total, m, n)

    @staticmethod
    def _clip_pvalues(pvals):
        """Return a float copy of ``pvals`` with border values moved inside (0, 1).

        NaN and values >= 1 become the largest double below 1; values <= 0
        become 1e-22. The decimal literal 0.9999999999999999999999 cannot be
        used for the upper bound because it is parsed as exactly 1.0.
        """
        clipped = np.array(pvals, dtype=float)
        upper = np.nextafter(1.0, 0.0)
        clipped[np.isnan(clipped)] = upper
        clipped[clipped >= 1.0] = upper
        clipped[clipped <= 0.0] = 1e-22
        return clipped

    # remove junks & duplicates
    def __filter_input(self, input_list, gene2terms):
        """Return the distinct genes of ``input_list`` that are keys of ``gene2terms``."""
        return list({gene for gene in input_list if gene in gene2terms})

    def __calculateTermFrequency(self, genes_of_interest, gene2terms):
        """Map each term to the set of given genes annotated with it.

        Genes that are not keys of ``gene2terms`` are skipped.
        """
        sample_term_map = {}

        for gene in genes_of_interest:
            if gene not in gene2terms:
                continue
            for term in gene2terms[gene]:
                sample_term_map.setdefault(term, set()).add(gene)

        return sample_term_map
コード例 #4
0
class HypergeometricTest:
    """Enrichment analysis of a gene list using the hypergeometric distribution.

    Annotation data comes from a ``TermMapping`` instance (a project class
    not shown here), queried via ``get_gene_count``, ``get_term_mapping``
    and ``get_gene_mapping``.
    """

    _mapper = None

    def __init__(self):
        self._mapper = TermMapping()

    def perform_test(self, ontology_type, genes_of_interest, cutoff=None):
        """Run one enrichment test per term annotated to the input genes.

        Args:
            ontology_type: Selector passed through to the mapper's methods.
            genes_of_interest: Iterable of gene identifiers; entries missing
                from the mapper's gene mapping and repeated entries are
                discarded before testing.
            cutoff: Terms are reported only when their Bonferroni-corrected
                p-value is strictly below this value. When ``None``, the
                module-level ``p_val_threshold`` is used.

        Returns:
            ``{'results': [...], 'total_genes': <gene count>}`` where each
            result dict has the keys ``'id'``, ``'p-value'`` (corrected),
            ``'background'``, ``'genes'`` and ``'q-value'``. A term must also
            have at least ``gene_threshold`` (module-level) sampled genes to
            be reported.
        """
        if cutoff is None:
            cutoff = p_val_threshold

        mapper = self._mapper
        total_num_genes = mapper.get_gene_count(ontology_type)
        term2genes = mapper.get_term_mapping(ontology_type)
        gene2terms = mapper.get_gene_mapping(ontology_type)

        filtered = self.__filter_input(genes_of_interest, gene2terms)
        sample_map = self.__calculateTermFrequency(filtered, gene2terms)

        sample_size = len(filtered)

        # One test per term; the count drives the Bonferroni correction.
        num_tests = len(sample_map)
        if not num_tests:
            # No annotated input genes: skip the q-value estimator, which
            # would otherwise receive a zero-length array.
            return {'results': [], 'total_genes': total_num_genes}

        results = []
        raw_pvals = []
        for term, sampled in sample_map.items():
            background = len(term2genes[term])
            p = self._enrichment_pvalue(
                len(sampled), total_num_genes, background, sample_size)
            raw_pvals.append(p)

            result = {}
            result['id'] = term
            # Cap the corrected value so it remains a valid probability.
            result['p-value'] = min(p * num_tests, 1.0)
            result['background'] = background
            result['genes'] = list(sampled)
            results.append(result)

        # q-values are estimated from the uncorrected p-values after border
        # values have been nudged strictly inside (0, 1).
        qvals = qvalue.estimate(self._clip_pvalues(raw_pvals))

        filtered_results = []
        for idx, res in enumerate(results):
            res['q-value'] = qvals[idx]
            enough_genes = len(res['genes']) >= gene_threshold
            if res['p-value'] < cutoff and enough_genes:
                filtered_results.append(res)

        return {'results': filtered_results, 'total_genes': total_num_genes}

    @staticmethod
    def _enrichment_pvalue(k, total, m, n):
        """Upper-tail probability P(X >= k) of the hypergeometric distribution.

        ``total`` = population size, ``m`` = population genes carrying the
        term, ``n`` = sample size, ``k`` = sampled genes carrying the term.
        Uses ``sf(k - 1)`` because the survival function is P(X > x); the
        single-point ``pmf(k)`` would understate the tail.
        """
        return stats.hypergeom.sf(k - 1, total, m, n)

    @staticmethod
    def _clip_pvalues(pvals):
        """Copy ``pvals`` into a float array with 0, 1 and NaN entries replaced.

        Entries that are NaN or >= 1 are set to the closest double below 1
        (the literal 0.9999999999999999999999 rounds to exactly 1.0 and so
        cannot serve as that bound); entries <= 0 are set to 1e-22. Other
        values are left untouched.
        """
        arr = np.array(pvals, dtype=float)
        below_one = np.nextafter(1.0, 0.0)
        arr[np.isnan(arr)] = below_one
        arr[arr >= 1.0] = below_one
        arr[arr <= 0.0] = 1e-22
        return arr

    # remove junks & duplicates
    def __filter_input(self, input_list, gene2terms):
        """Keep each gene of ``input_list`` once, if it is a key of ``gene2terms``."""
        known = {gene for gene in input_list if gene in gene2terms}
        return list(known)

    def __calculateTermFrequency(self, genes_of_interest, gene2terms):
        """Group the given genes by term.

        Returns a dict from term to the set of genes in ``genes_of_interest``
        whose ``gene2terms`` entry contains that term; genes absent from
        ``gene2terms`` contribute nothing.
        """
        sample_term_map = {}

        for gene in genes_of_interest:
            if gene not in gene2terms:
                continue
            for term in gene2terms[gene]:
                if term not in sample_term_map:
                    sample_term_map[term] = set()
                sample_term_map[term].add(gene)

        return sample_term_map