def prepare_run(self):
    """
    Count terms for the study and population sets, then start the study.

    The study set is always counted with ``ratio.count_terms_v2``. How the
    population is obtained depends on the configuration:

    * ``self.abcorr`` false: the full background set from the UI is counted
      with ``ratio.count_terms_v2``. Only in this mode, a background column
      named ``'Genome'`` switches the study set to the genome-specific
      sample accessor.
    * ``self.abcorr`` true and ``self.randomSample`` true: a random sample
      of the background is counted with ``ratio.count_terms`` and ``pop_n``
      is the size of that sample.
    * ``self.abcorr`` true and ``self.randomSample`` false: population
      counts come from ``ratio.count_terms_abundance_corrected`` and
      ``pop_n`` is set equal to ``study_n``. ``self.pop_an_set`` is not
      assigned in this mode.

    Results are stored on ``self`` (``study_an_frset``, ``term_study``,
    ``go2ans_study_dict``, ``term_pop``, ``go2ans_pop_dict`` and, where
    applicable, ``pop_an_set``) before ``self.run_study_v2`` is called.

    :return: None
    """
    # study_n should be the same in genome vs. observed vs. abundance_corrected
    ui = self.ui
    abundance_corrected = self.abcorr

    # --- study set -------------------------------------------------------
    # The background column is only consulted when no abundance correction
    # is requested (short-circuit keeps the attribute access conditional).
    if not abundance_corrected and ui.col_background_an == 'Genome':
        self.study_an_frset = ui.get_sample_an_frset_genome()
    else:
        self.study_an_frset = ui.get_sample_an_frset()
    self.term_study, self.go2ans_study_dict, study_n = ratio.count_terms_v2(
        self.study_an_frset, self.assoc_dict, self.obo_dag)

    # --- population set --------------------------------------------------
    if not abundance_corrected:
        self.pop_an_set = ui.get_background_an_all_set()
        self.term_pop, self.go2ans_pop_dict, pop_n = ratio.count_terms_v2(
            self.pop_an_set, self.assoc_dict, self.obo_dag)
    elif self.randomSample:
        self.pop_an_set = ui.get_background_an_set_random_sample()
        pop_n = len(self.pop_an_set)
        self.term_pop, self.go2ans_pop_dict = ratio.count_terms(
            self.pop_an_set, self.assoc_dict, self.obo_dag)
    else:
        # Abundance-corrected background: population size mirrors the study.
        pop_n = study_n
        self.term_pop, self.go2ans_pop_dict = ratio.count_terms_abundance_corrected(
            ui, self.assoc_dict, self.obo_dag)

    self.run_study_v2(self.term_study, self.term_pop, study_n, pop_n)
def calc_qval_dbl(study_n, pop_n, pop, assoc, term_pop, obo_dag, T=500):
    """
    Build an empirical distribution of minimum p-values by resampling.

    For each of ``T`` iterations a random study of ``study_n`` items is drawn
    from ``pop``, its terms are counted with ``count_terms``, and every term
    is tested against the population counts with a one-sided
    (``alternative='greater'``) Fisher exact test. The smallest p-value of
    the iteration is recorded. Progress is reported on stderr every 10th
    iteration.

    :param study_n: Integer (number of ANs from sample frequency); also the
        number of items drawn from ``pop`` per iteration
    :param pop_n: Integer (number of ANs from background frequency = sample
        freq.); used as the population total in the contingency table
    :param pop: population to sample from; a sequence, or a set/frozenset
        (converted to a tuple once, see below)
    :param assoc: passed through unchanged to ``count_terms``
    :param term_pop: mapping term -> count of that term in the population;
        must contain every term ``count_terms`` reports for a sample
    :param obo_dag: passed through unchanged to ``count_terms``
    :param T: number of resampling iterations
    :return: list of length ``T`` holding the smallest p-value found in each
        iteration (the integer ``1`` when no term had a p-value below 1)
    """
    # random.sample() stopped accepting sets in Python 3.11 (TypeError).
    # Older versions converted a set to a tuple on every call; doing that
    # conversion once here keeps the behaviour and works on all versions.
    if isinstance(pop, (set, frozenset)):
        population = tuple(pop)
    else:
        population = pop

    distribution = []
    for i in range(T):
        new_study = random.sample(population, study_n)
        # count_terms returns a tuple; its first element is the term -> count
        # mapping for the sampled study.
        new_term_study = count_terms(new_study, assoc, obo_dag)[0]

        smallest_p = 1
        for term, study_count in new_term_study.items():
            pop_count = term_pop[term]
            # 2x2 contingency table: [study hits, study misses],
            #                        [pop hits,   pop misses]
            table = [
                [study_count, study_n - study_count],
                [pop_count, pop_n - pop_count],
            ]
            # Index 1 of the fisher_exact result is the p-value
            # (assumes ``stats`` is scipy.stats -- confirm against imports).
            p_value = stats.fisher_exact(table, alternative='greater')[1]
            if p_value < smallest_p:
                smallest_p = p_value

        distribution.append(smallest_p)
        if i % 10 == 0:
            print("Sample {0} / {1}: p-value {2}".format(i, T, smallest_p),
                  file=sys.stderr)
    return distribution