def write_to_data(data, dp, mode):
    # calculate and populate the p values
    # calculate pat g for related and unrelated cases
    related_pat = set([p for h, v in dp.iteritems() for p in v['data']])
    related_pat_g = len(related_pat)
    unrelated_pat_g = len(
        [p for p in related_pat if p in patients_hpo['unrelated']])
    for h, v in dp.iteritems():
        related_pat_a = v['related_pat_a']
        unrelated_pat_a = v['unrelated_pat_a']
        related_pat_h = v['related_pat_h']
        unrelated_pat_h = v['unrelated_pat_h']
        related_pat_gh_set = set([p for p in v['data']])
        related_pat_gh = len(related_pat_gh_set)
        unrelated_pat_gh = len(
            [p for p in related_pat_gh_set if p in patients_hpo['unrelated']])
        related_p_val = fisher.pvalue(
            related_pat_a - related_pat_h - related_pat_g + related_pat_gh,
            related_pat_h - related_pat_gh,
            related_pat_g - related_pat_gh,
            related_pat_gh)
        unrelated_p_val = fisher.pvalue(
            unrelated_pat_a - unrelated_pat_h - unrelated_pat_g + unrelated_pat_gh,
            unrelated_pat_h - unrelated_pat_gh,
            unrelated_pat_g - unrelated_pat_gh,
            unrelated_pat_gh)
        data[h]['related_' + mode + '_p_val'] = related_p_val.right_tail
        data[h]['unrelated_' + mode + '_p_val'] = unrelated_p_val.right_tail
        data[h]['related_' + mode + '_pat_g'] = related_pat_g
        data[h]['unrelated_' + mode + '_pat_g'] = unrelated_pat_g
        data[h]['related_' + mode + '_pat_gh'] = related_pat_gh
        data[h]['unrelated_' + mode + '_pat_gh'] = unrelated_pat_gh
def table_maker(subset, ind1, ind2, row_labels, col_labels, title):
    """
    `subset` provides a subsetted boolean of items to consider.  If no subset,
    you can use all with `np.ones_like(ind1) == 1`

    `ind1` is used to subset rows, e.g., log2fc > 0.  This is used for rows,
    so row_labels might be ['upregulated', 'others']

    `ind2` is used to subset cols.  For example, col_labels would be
    ['bound', 'unbound']
    """
    table = [
        sum(subset & ind1 & ind2),
        sum(subset & ind1 & ~ind2),
        sum(subset & ~ind1 & ind2),
        sum(subset & ~ind1 & ~ind2)
    ]
    print
    print title
    print '-' * len(title)
    print
    print_2x2_table(table, row_labels=row_labels, col_labels=col_labels)
    print
    print_row_perc_table(table, row_labels=row_labels, col_labels=col_labels)
    print
    print_col_perc_table(table, row_labels=row_labels, col_labels=col_labels)
    print fisher.pvalue(*table)
def test(comps, genome, motif, rcounts, nums):
    # rcounts
    print "%s.%s.%s: fisher test on real and perm data" % (comps, genome, motif)
    results = {}
    for rt in ["r1", "r2", "r3"]:
        for event_class in ["s", "e"]:
            val_class = rcounts.get("%s.%s" % (rt, event_class), 0)
            val_control = rcounts.get("%s.%s" % (rt, "c"), 0)
            num_class = nums.get("%s.%s" % (rt, event_class), 0)
            num_control = nums.get("%s.%s" % (rt, "c"), 0)
            val = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail
            results["%s.%s" % (rt, event_class)] = val
            # information gain (g at the end)
            i1 = (num_class/float(num_class+num_control))*math.log(num_class/float(num_class+num_control), 2)
            i2 = (num_control/float(num_class+num_control))*math.log(num_control/float(num_class+num_control), 2)
            i = -(i1+i2)
            c1_num = max(1, float(val_class + val_control))  # dont allow it to be 0
            if (val_class/c1_num) > 0 and val_control/c1_num > 0:
                c1_num = -((val_class/c1_num)*math.log(val_class/c1_num, 2) + (val_control/c1_num)*math.log(val_control/c1_num, 2))
                c1_num = (val_class+val_control)/float(num_class+num_control) * c1_num
            else:
                c1_num = 0
            c2_num = max(1, float((num_class-val_class) + (num_control-val_control)))  # dont allow it to be 0
            if (num_class-val_class)/c2_num > 0 and (num_control-val_control)/c2_num > 0:
                c2_num = -(((num_class-val_class)/c2_num)*math.log((num_class-val_class)/c2_num, 2) + ((num_control-val_control)/c2_num)*math.log((num_control-val_control)/c2_num, 2))
                c2_num = (num_class+num_control-(val_class+val_control))/float(num_class+num_control) * c2_num
            else:
                c2_num = 0
            g = i - (c1_num + c2_num)
            #print rt, event_class, val_class, num_class
            #print rt, "c", val_control, num_control
            #print
            results["%s.%s.g" % (rt, event_class)] = g
            for p in range(0, rnamotifs2.config.perms):
                val_class = rcounts.get("%s.%s.p%s" % (rt, event_class, p), 0)
                val_control = rcounts.get("%s.%s.p%s" % (rt, "c", p), 0)
                num_class = rnamotifs2.perm.ec_dist[p].get(event_class, 0)
                num_control = rnamotifs2.perm.ec_dist[p].get("c", 0)
                val = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail
                results["%s.%s.p%s" % (rt, event_class, p)] = val
    test_results = {}
    for rt in ["r1", "r2", "r3"]:
        for event_class in ["s", "e"]:
            pval = results["%s.%s" % (rt, event_class)]
            pemp = [results["%s.%s.p%s" % (rt, event_class, p)] for p in range(0, rnamotifs2.config.perms)]
            g = results["%s.%s.g" % (rt, event_class)]
            test_results["%s.%s" % (rt, event_class)] = [pval, pemp, g]
    return test_results
def calculate_fisher(row_vals, col_vals, test_type):
    """Calculate Fisher's exact test on a prepared contingency table"""
    row_val_1, row_val_2 = row_vals
    col_val_1, col_val_2 = col_vals
    if test_type == 1:
        return fisher.pvalue(row_val_1, row_val_2, col_val_1, col_val_2).two_tail
    elif test_type == 2:
        return fisher.pvalue(row_val_1, row_val_2, col_val_1, col_val_2).left_tail
    elif test_type == 3:
        return fisher.pvalue(row_val_1, row_val_2, col_val_1, col_val_2).right_tail
    else:
        raise TypeError
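A minimal usage sketch for `calculate_fisher` above (not part of the original sources), assuming `import fisher` is in scope; the counts are invented and only illustrate how `test_type` selects the tail of the same 2x2 table.

# hypothetical example counts: first row = (8, 2), second row = (1, 5)
two_sided = calculate_fisher((8, 2), (1, 5), test_type=1)  # .two_tail
less = calculate_fisher((8, 2), (1, 5), test_type=2)       # .left_tail
greater = calculate_fisher((8, 2), (1, 5), test_type=3)    # .right_tail
print(two_sided, less, greater)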
def es_apa(data_s, data_e, data_c):
    num_s, num_e, num_c = rnamotifs2.data.dist["s"], rnamotifs2.data.dist["e"], rnamotifs2.data.dist["c"]
    logs = []
    loge = []
    temp_e = []
    temp_s = []
    for val_s, val_e, val_c in zip(data_s, data_e, data_c):
        # there are 4 areas
        f1 = -2 * math.log(pvalue(val_s, val_c, num_s-val_s, num_c-val_c).right_tail)
        f2 = -2 * math.log(pvalue(val_e, val_c, num_e-val_e, num_c-val_c).right_tail)
        temp_s.append(f1)
        temp_e.append(f2)
    logs.append(temp_s)
    loge.append(temp_e)
    return logs, loge
def modifyStrelkaRow(record, fixIndels=True):
    """Add info for strelka processing to vcf record

    :param record: a pyVCF record object
    """
    if(record.is_snp or record.ALT[0] is None):
        ref = record.REF
        alt = record.ALT[0]
        record.INFO['NORMREF'] = getattr(record.samples[0].data, ref+'U')[0]
        record.INFO['TUMREF'] = getattr(record.samples[1].data, ref+'U')[0]
        # strelka sometimes reports a non-passing variant as no "ALT" allele (no change)
        if(alt is None):
            record.INFO['NORMALT'] = 0
            record.INFO['TUMALT'] = 0
            record.INFO['TUMVAF'] = 0
            record.INFO['TUMVARFRACTION'] = 0
        else:
            alt = str(alt)
            record.INFO['NORMALT'] = getattr(record.samples[0].data, alt+'U')[0]
            record.INFO['TUMALT'] = getattr(record.samples[1].data, alt+'U')[0]
            try:
                record.INFO['TUMVAF'] = float(record.INFO['TUMALT'])/(record.INFO['TUMALT']+record.INFO['TUMREF'])
            except ZeroDivisionError:
                record.INFO['TUMVAF'] = 0
            try:
                record.INFO['TUMVARFRACTION'] = float(record.INFO['TUMALT'])/(record.INFO['TUMALT']+record.INFO['NORMALT'])
            except ZeroDivisionError:
                record.INFO['TUMVARFRACTION'] = 0
        record.INFO['LOG_FISHER'] = -math.log10(fisher.pvalue(record.INFO['TUMREF'], record.INFO['TUMALT'], record.INFO['NORMREF'], record.INFO['NORMALT']).two_tail)
        return(record)
    else:
        record.INFO['NORMREF'] = getattr(record.samples[0].data, 'TAR')[0]
        record.INFO['NORMALT'] = getattr(record.samples[0].data, 'TIR')[0]
        record.INFO['TUMREF'] = getattr(record.samples[1].data, 'TAR')[0]
        record.INFO['TUMALT'] = getattr(record.samples[1].data, 'TIR')[0]
        try:
            record.INFO['TUMVAF'] = float(record.INFO['TUMALT'])/(record.INFO['TUMALT']+record.INFO['TUMREF'])
        except ZeroDivisionError:
            record.INFO['TUMVAF'] = 0
        try:
            record.INFO['TUMVARFRACTION'] = float(record.INFO['TUMALT'])/(record.INFO['TUMALT']+record.INFO['NORMALT'])
        except ZeroDivisionError:
            record.INFO['TUMVARFRACTION'] = 0
        if(fixIndels):
            record.REF = record.REF.replace('.', '')
            for i in range(len(record.ALT)):
                if(not isinstance(record.ALT[i], vcf.model._Substitution)):
                    return(None)
        record.INFO['LOG_FISHER'] = -math.log10(fisher.pvalue(record.INFO['TUMREF'], record.INFO['TUMALT'], record.INFO['NORMREF'], record.INFO['NORMALT']).two_tail)
        return(record)
def cap(size1, size2):
    """
    This function finds the population value at which we begin to obtain a
    significant value. This may need to be modified depending on the training
    size.
    """
    lowest_sig = 0.05
    for n in range(1, 10):
        if size1 < size2:
            sig = pvalue(size2, size1 - n, 0, n).two_tail
            return n
        else:
            sig = pvalue(size1, size2 - n, 0, n).two_tail
        if sig <= 0.05:
            return n
def single_maker_allelic_association(self, phenotype_list=[], genotype_list=[]):
    """
    Computes a single marker allelic association (Fisher's exact / chi-square)
    for lists of phenotypes and genotypes of equal length
    """
    # Make sure phenotype and genotype lists are same size
    if len(phenotype_list) != len(genotype_list):
        return None
    case_alleles = []
    control_alleles = []
    for i in range(len(phenotype_list)):
        loc_alleles = genotype_list[i].split(",")
        if phenotype_list[i] == 1:
            for a in loc_alleles:
                case_alleles.append(a)
        else:
            for a in loc_alleles:
                control_alleles.append(a)
    # Getting set of alleles and their counts
    allele = list(set(chain(case_alleles, control_alleles)))
    case_counts = Counter(case_alleles)
    control_counts = Counter(control_alleles)
    # Implementing slow scipy chi-square if we have more than two alleles
    if len(allele) > 2:
        table = np.zeros(shape=(2, len(allele)))
        for i in range(len(allele)):
            table[0, i] = case_counts[allele[i]]
            table[1, i] = control_counts[allele[i]]
        chi2, p, dof, ex = chi2_contingency(table)
        return p
    # Running fast Fisher's algorithm otherwise
    if len(case_counts) == 2 and len(control_counts) == 2:
        p = pvalue(case_counts[allele[0]], control_counts[allele[0]],
                   case_counts[allele[1]], control_counts[allele[1]]).two_tail
    elif len(case_counts) == 2 and len(control_counts) == 1:
        p = pvalue(case_counts[allele[0]], control_counts[allele[0]],
                   case_counts[allele[1]], 0).two_tail
    elif len(case_counts) == 1 and len(control_counts) == 2:
        p = pvalue(case_counts[allele[0]], control_counts[allele[0]],
                   0, control_counts[allele[1]]).two_tail
    else:
        p = 1
    return p
def _find_sequence_p_values_with_fisher(self, sequence_presence_matrix, is_first_class):
    sequence_p_values = []
    for sequence_vector in sequence_presence_matrix:
        if sequence_vector.sum() > 1:
            first_class_present = np.sum(sequence_vector[np.logical_and(
                sequence_vector, is_first_class)])
            second_class_present = np.sum(sequence_vector[np.logical_and(
                sequence_vector, np.logical_not(is_first_class))])
            first_class_absent = np.sum(
                np.logical_and(is_first_class, sequence_vector == 0))
            second_class_absent = np.sum(
                np.logical_and(np.logical_not(is_first_class), sequence_vector == 0))
            sequence_p_values.append(
                fisher.pvalue(first_class_present, second_class_present,
                              first_class_absent, second_class_absent).right_tail)
        else:
            sequence_p_values.append(SequenceFilterHelper.INVALID_P_VALUE)
    return sequence_p_values
def getSub(ref, fread, dics):
    # fread = {A, C, G, T}
    nref = fread[ref.upper()]
    sub = [(ref.upper() + i, nref, fread[i]) for i in fread
           if i != ref.upper() and fread[i] != 0]
    allsub = ' '.join([x[0] for x in sub])  # a list like [('AT', 50, 10), ('AG', 50, 2)]
    res = []  # [(int(dics[i[0]]*(i[1]+i[2])), ((i[1]+i[2])-exp1), pvalue(i[1], i[2], int(dics[i[0]]*(i[1]+i[2])), ((i[1]+i[2])-exp1))) for i in sub]
    for i in sub:
        obs1 = i[1]
        obs2 = i[2]
        exp1 = int(dics[i[0]] * (i[1] + i[2]))
        exp2 = ((i[1] + i[2]) - exp1)
        if not exfisher:
            pval = FishersExactTest([[exp1, exp2], [obs1, obs2]])
        else:
            pval = pvalue(obs1, obs2, exp1, exp2)
        pval = getTail(pval)
        res.append((i[0], obs1, obs2, exp1, exp2, str(pval)))
    if len(res) == 1:
        return res[0][5]  # , allsub, fread
    elif len(res) > 1:
        rr = [float(x[-1]) for x in res]
        idx = rr.index(min(rr))
        return res[idx][5]  # , allsub, fread
    else:
        return '1.0'  # , 0, 0
def ethinic_filter_gNOMAD(self):
    """
    Pass a file with the gnomAD dataset filter for the genes of interest.
    For example, a dataset might intersect with 800,000 SNPs (determined by
    wc -l of the file). We will test to see if any of those are candidates
    for removal. The p-value threshold will be 0.05 / 28 (28 being the number
    of ethnic combinations). If 1 of 28 combinations is indeed a significant
    hit, then we toss that SNP from the dataset, as it could be misused for
    separating disease vs non-disease by ethnicity instead of by
    disease-bearing SNPs.
    """
    cutoff = 0.05 / 28
    snps_to_remove = []
    with open(self.gnomad_file, 'r') as fin:
        line_num = 0
        for line in fin:
            div = line.split("\t")
            chrm_info = div[0]
            genotype_list = []
            for ethnicity in range(8):  # 8 for number of ethn. in gnomad
                genotype_list.append(div[1].split(",")[ethnicity])
            for i in itertools.combinations(genotype_list, 2):
                n11 = int(i[0].split(" ")[0])
                n12 = int(i[0].split(" ")[1])
                n21 = int(i[1].split(" ")[0])
                n22 = int(i[1].split(" ")[1])
                pval = pvalue(n11, n21, n12, n22).two_tail
                if pval <= cutoff:
                    snps_to_remove.append(chrm_info)
                    break
            line_num += 1
            print(line_num / 15008010, end='\r')  # percent complete
    self.filtered_columns = self.col_translator[
        self.col_translator['chrm_pos_ref_alt'].isin(snps_to_remove)].index
def get_separator_words(toks1):
    """
    Finds the words that separate a list of tokens from a background corpus
    Basically this generates a list of informative/interesting words in a set
    toks1 is a list of words
    Returns a list of separator words
    """
    tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
    if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
        toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
    else:
        essay_corpus = open(ESSAY_CORPUS_PATH).read()
        essay_corpus = sub_chars(essay_corpus)
        toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
        pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
    sep_words = []
    for word in tab_toks1.keys():
        tok1_present = tab_toks1[word]
        if(tok1_present > 2):
            tok1_total = tab_toks1._N
            tok2_present = toks2[word]
            tok2_total = toks2._N
            fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
            if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
                sep_words.append(word)
    sep_words = [w for w in sep_words
                 if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
    return sep_words
def create_count_table(df, Cyt1, Cyt2):
    df_cont = pd.DataFrame(zip(df[Cyt1], df[Cyt2]), columns=['A', 'B'])
    # create contingency table
    df_cont[df_cont > 0] = 1
    # get counts for each condition
    d = df_cont.to_dict()
    A = 0
    B = 0
    AandB = 0
    none = 0
    tup = zip(d['A'].values(), d['B'].values())
    for row in tup:
        if row[0] == 0 and row[1] == 0:
            none = none + 1
        if row[0] == 0 and row[1] == 1:
            B = B + 1
        if row[0] == 1 and row[1] == 0:
            A = A + 1
        if row[0] == 1 and row[1] == 1:
            AandB = AandB + 1
    # Fisher's exact test
    matrix = numpy.matrix([[AandB, B], [A, none]])
    p = pvalue(AandB, B, A, none)
    # output = [Cyt1, Cyt2, p.left_tail, p.right_tail, p.two_tail]
    output = p.two_tail
    return output, Cyt1, Cyt2
def dnds_stat(estimations):
    '''return estimations of windows with dN/dS > 1'''
    filtered_estimations = []
    fname = estimations[0]['file name']
    genedS = float(estimations[0]['dS'])
    for i in range(len(estimations)):
        if 'nan' in estimations[i].values():
            continue
        name = estimations[i]['file name']
        if name != fname:
            fname = estimations[i]['file name']
            genedS = float(estimations[i]['dS'])
        if genedS != 0:
            estimations[i]['dN/dS(whole gene)'] = float(
                estimations[i]['dN']) / genedS
        else:
            continue
        # process numbers with fisher module
        if estimations[i]['whole gene'] == '1' or \
           estimations[i]['dN/dS(whole gene)'] > 1:
            n = round(float(estimations[i]['dN']) * float(estimations[i]['N']))
            N = round(float(estimations[i]['N'])) - n
            s = round(float(estimations[i]['dS']) * float(estimations[i]['S']))
            S = round(float(estimations[i]['S'])) - s
            mat = [[n, N], [s, S]]
            p = pvalue(n, N, s, S)
            estimations[i]['p-value'] = p.two_tail
            filtered_estimations.append(estimations[i])
    return filtered_estimations
def get_vocab(self, input_text, input_scores, max_features):
    train_mat = self.vectorizer1.transform(input_text)
    input_score_med = np.median(input_scores)
    new_scores = [0 if i <= input_score_med else 1 for i in input_scores]
    ind_max_features = math.floor(max_features / max(input_scores))
    all_vocab = []
    all_cols = [np.asarray(train_mat.getcol(i).todense().transpose())[0]
                for i in xrange(0, train_mat.shape[1])]
    for s in xrange(0, max(input_scores)):
        sel_inds = [i for i in xrange(0, len(input_scores)) if input_scores[i] == s]
        out_inds = [i for i in xrange(0, len(input_scores)) if input_scores[i] != s]
        pvalues = []
        for i in xrange(0, len(all_cols)):
            lcol = all_cols[i]
            good_lcol = lcol[sel_inds]
            bad_lcol = lcol[out_inds]
            good_lcol_present = len(good_lcol[good_lcol > 0])
            good_lcol_missing = len(good_lcol[good_lcol == 0])
            bad_lcol_present = len(bad_lcol[bad_lcol > 0])
            bad_lcol_missing = len(bad_lcol[bad_lcol == 0])
            pval = pvalue(good_lcol_present, bad_lcol_present, good_lcol_missing, bad_lcol_missing)
            pvalues.append(pval.two_tail)
        col_inds = list(xrange(0, train_mat.shape[1]))
        p_frame = pd.DataFrame(np.array([col_inds, pvalues]).transpose(), columns=["inds", "pvalues"])
        p_frame = p_frame.sort(['pvalues'], ascending=True)
        getVar = lambda searchList, ind: [searchList[int(i)] for i in ind]
        vocab = getVar(self.vectorizer1.get_feature_names(), p_frame['inds'][:ind_max_features + 2])
        all_vocab.append(vocab)
    return list(set(list(chain.from_iterable(all_vocab))))
def fisherTest(tab, alternative='two-sided'):
    """Fisher's exact test on a 2x2 contingency table.

    Wrapper around fisher.pvalue found in:
    Fast Fisher's Exact Test (Haibao Tang, Brent Pedersen)
    https://pypi.python.org/pypi/fisher/

    Test is performed in C (100x speed-up)

    Parameters
    ----------
    tab : list of lists or 2x2 ndarray
        Each element should contain counts
    alternative : string
        Specifies the alternative hypothesis (similar to scipy.fisher_exact)
        Options: 'two-sided', 'less', 'greater'

    Returns
    -------
    OR : float
        Odds-ratio associated with the 2 x 2 table
    p : float
        P-value associated with the test and the alternative hypothesis"""
    res = fisher.pvalue(tab[0][0], tab[0][1], tab[1][0], tab[1][1])
    OR = (tab[0][0] * tab[1][1]) / (tab[0][1] * tab[1][0])
    if alternative == 'two-sided':
        return (OR, res.two_tail)
    elif alternative == 'less':
        return (OR, res.left_tail)
    elif alternative == 'greater':
        return (OR, res.right_tail)
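A short usage example for the `fisherTest` wrapper above (illustrative counts only, not from the original source), assuming `import fisher` is available.

tab = [[12, 5],
       [4, 9]]
odds_ratio, p_two = fisherTest(tab)                     # two-sided by default
_, p_greater = fisherTest(tab, alternative='greater')   # right tail only
print(odds_ratio, p_two, p_greater)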
def calculate_differential_methylation_fisher_exact(self, weighted=False):
    sum_meth_control = 0
    sum_meth_affected = 0
    sum_cov_control = 0
    sum_cov_affected = 0
    for cpg in self.cpgs:
        if weighted:
            sum_meth_control += cpg.weighted_methylation_control
            sum_meth_affected += cpg.weighted_methylation_affected
            sum_cov_control += cpg.cov_control
            sum_cov_affected += cpg.cov_affected
        else:
            sum_meth_control += cpg.meth_control
            sum_meth_affected += cpg.meth_affected
            sum_cov_control += cpg.cov_control
            sum_cov_affected += cpg.cov_affected
    control = sum_meth_control / sum_cov_control
    affected = sum_meth_affected / sum_cov_affected
    control_methylated = sum_cov_control * control / 100
    control_unmethylated = sum_cov_control - control_methylated
    affected_methylated = sum_cov_affected * affected / 100
    affected_unmethylated = sum_cov_affected - affected_methylated
    try:
        # Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
        p = fisher_exact.pvalue(control_methylated, control_unmethylated,
                                affected_methylated, affected_unmethylated)
        pvalue = p.two_tail
    except:
        oddsratio, pvalue = stats.fisher_exact(
            [(control_methylated, control_unmethylated),
             (affected_methylated, affected_unmethylated)],
            alternative='two-sided')
    return pvalue
def find_label_associated_sequence_p_values(
        comparison_data: ComparisonData, repertoires: List[Repertoire], label: Label):
    sequence_p_values = []
    is_first_class = np.array([
        repertoire.metadata[label.name] for repertoire in repertoires
    ]) == label.positive_class
    for sequence_vector in comparison_data.get_item_vectors(
            [repertoire.identifier for repertoire in repertoires]):
        if sequence_vector.sum() > 1:
            first_class_present = np.sum(sequence_vector[np.logical_and(
                sequence_vector, is_first_class)])
            second_class_present = np.sum(sequence_vector[np.logical_and(
                sequence_vector, np.logical_not(is_first_class))])
            first_class_absent = np.sum(
                np.logical_and(is_first_class, sequence_vector == 0))
            second_class_absent = np.sum(
                np.logical_and(np.logical_not(is_first_class), sequence_vector == 0))
            sequence_p_values.append(
                fisher.pvalue(first_class_present, second_class_present,
                              first_class_absent, second_class_absent).right_tail)
        else:
            sequence_p_values.append(SequenceFilterHelper.INVALID_P_VALUE)
    return sequence_p_values
def run_compare(flat, adir, bdir, context, window, binary, pvalue_cutoff, ratio_range):
    # fh = open('fisher.different.%s.%ibp.gff' % (context, window), 'w')
    fh = sys.stdout
    print >>sys.stderr, "writing to:", fh.name
    print >>fh, "##gff-version 3"
    for chr in flat.seqids:
        try:
            bp_max = len(flat.fasta[chr])
        except KeyError:
            print >>sys.stderr, chr, "not found. skipping"
            continue
        (a_cs, a_ts, a_mask), (b_cs, b_ts, b_mask) = bin_setup(chr, adir, bdir, context)
        for start in xrange(0, bp_max + window, window):
            end = min(start + window, bp_max)
            if start == end:
                continue
            a_t_count = a_ts[start:end].sum()
            a_c_count = a_cs[start:end].sum()
            b_t_count = b_ts[start:end].sum()
            b_c_count = b_cs[start:end].sum()
            p = pvalue(a_t_count, a_c_count, b_t_count, b_c_count)
            pv = float(p.two_tail)
            if not binary and pv > pvalue_cutoff:
                continue
            gc = flat.fasta[chr][start:end].upper()  # original read `f.fasta`; `flat` is the parameter in scope
            gc = gc.count("G") + gc.count("C")
            # if a_tot or b_tot == 0, then use 'na'
            a_tot = float(a_c_count + a_t_count)
            a_methyl = (a_c_count / a_tot) if a_tot != 0 else 0  # None
            b_tot = float(b_c_count + b_t_count)
            b_methyl = (b_c_count / b_tot) if b_tot != 0 else 0  # None
            # strand = "+" if a_methyl > b_methyl else "-"
            strand = "."
            # TODO: use absolute?
            plot = a_methyl - b_methyl if not None in (a_methyl, b_methyl) else 'na'
            # scale by total.
            plot = plot / (a_methyl + b_methyl)
            # print plot, a_methyl, b_methyl
            # if plot == 'na': continue
            if binary:
                if plot != 'na':
                    # original used `==`, which discarded the result; assignment is intended
                    plot = 1 if (ratio_range[0] <= plot <= ratio_range[1]) else 0
            else:
                if not (ratio_range[0] <= plot <= ratio_range[1]):
                    # print >>sys.stderr, "skipping because of ratio range."
                    continue
            if binary and plot != 'na':
                plot = 0 if pv > pvalue_cutoff else 1
            attrs = "p=%.3G;ac=%i;at=%i;bc=%i;bt=%i;gc=%i;plot=%.3G" % \
                (pv, a_c_count, a_t_count, b_c_count, b_t_count, gc, plot)
            accns = flat.get_features_in_region(chr, start + 1, end)
            accns = [a["accn"] for a in accns]
            if accns:
                attrs += ";accns=" + ",".join(accns)
            print >>fh, "\t".join(map(str, [chr, "methylation", "dmc", start + 1, end, plot, strand, ".", attrs]))
def contingent(intervals, domain_name, nodoms_only=False):
    """
    intervals should be all intervals in all genes that contain the domain
    """
    import fisher
    n_domain_variants = sum(len(i.mafs.split(",")) for i in intervals
                            if i.domain == domain_name)
    if nodoms_only:
        n_gene_variants = sum(len(i.mafs.split(",")) for i in intervals
                              if i.domain == ".")
    else:
        n_gene_variants = sum(len(i.mafs.split(",")) for i in intervals
                              if i.domain != domain_name)
    gene = set()
    n_domain_bases, n_gene_bases = 0, 0
    for iv in intervals:
        gene.add(iv.gene)
        starts = map(int, iv.starts.split(","))
        ends = map(int, iv.ends.split(","))
        l = sum(e - s for s, e in zip(starts, ends))
        assert all(e > s for s, e in zip(starts, ends)), domain_name
        if iv.domain == domain_name:
            n_domain_bases += l
        elif nodoms_only and iv.domain == ".":
            n_gene_bases += l
        elif not nodoms_only and iv.domain != domain_name:
            n_gene_bases += l
    tbl = "gene:%d/%d,dom:%d/%d" % (n_gene_variants, n_gene_bases,
                                    n_domain_variants, n_domain_bases)
    p = fisher.pvalue(n_gene_bases, n_gene_variants, n_domain_bases, n_domain_variants)
    denom = float(n_gene_variants) / (n_gene_bases or 1) or 1
    return p.two_tail, (float(n_domain_variants) / (n_domain_bases or 1)) / denom, tbl, gene
def control_comparison(control_cohort, gene_id, sample_hits, sample_size,
                       inheritance_mode, variant_filter, quality_filter):
    """
    Compare the results of num_hits, total against the reference population
    Return dict of 'control_hits', 'fisher_2sided_pvalue'
    """
    cohort = get_population_datastore().get_control_cohort(control_cohort)
    indivs_with_inheritance, gene_variation = get_individuals_with_inheritance_in_gene(
        get_population_datastore(),
        get_reference(),
        cohort,
        inheritance_mode,
        gene_id,
        variant_filter=variant_filter,
        quality_filter=quality_filter
    )
    control_hits = len(indivs_with_inheritance)
    fisher_results = fisher.pvalue(
        sample_hits,
        sample_size,
        control_hits,
        get_population_datastore().get_control_cohort_size(settings.DEFAULT_CONTROL_COHORT)
    )
    return {
        'control_hits': control_hits,
        'fisher_2sided_pvalue': fisher_results.two_tail,
    }
def hypergeom_test(patients, anno_dict, anno_pats, categories=[], sign_thr=0.05):
    # e.g. whether bicluster membership is associated with group membership
    in_bicluster = set(patients).intersection(anno_pats)
    outside_bicluster = anno_pats.difference(set(patients))
    best_p_val = 0.05
    enriched_cat = "NA"
    best_fold_enrichment = 0
    best_overlap = 0
    if len(categories) == 0:
        categories = anno_dict.keys()
    for category in categories:
        in_group = anno_dict[category]
        outside_group = anno_pats.difference(in_group)
        # print(field, category, len(in_bicluster), len(outside_bicluster), len(in_group), len(outside_group))
        # define group membership
        overlap = len(in_bicluster.intersection(in_group))
        outside_both = len(outside_bicluster.intersection(outside_group))
        in_bicluster_outside_group = len(in_bicluster.intersection(outside_group))
        outside_bicluster_in_group = len(set(outside_bicluster).intersection(set(in_group)))
        # right-sided exact Fisher's test
        p_val = pvalue(overlap, in_bicluster_outside_group,
                       outside_bicluster_in_group, outside_both).right_tail
        if p_val < 0.05:
            expected_overlap = float(len(in_group)) / len(anno_pats) * len(in_bicluster)
            fold_enrichment = float(overlap) / expected_overlap
            # print(p_val, category)
            log_neg_pval = -np.log10(p_val)
            if best_p_val < log_neg_pval:
                best_p_val = log_neg_pval
                enriched_cat = category
                best_fold_enrichment = fold_enrichment
                best_overlap = overlap
    return best_p_val, best_fold_enrichment, best_overlap, enriched_cat
def calculate_fisher(PATH_PEAKS, biocond, ip_data, input_data,
                     library_size_ip, library_size_input, window_cutoff):
    with open(PATH_PEAKS + 'Fisher_' + ip_data + '.txt', "r") as ipfile, \
            open(PATH_PEAKS + 'Fisher_' + input_data + '.txt', "r") as inputFile, \
            open(PATH_PEAKS + 'Fisher_' + biocond + '.txt', "w") as bed_file:
        # Read transcript result
        header = ['WindowId', 'Windowcov', 'Windowcov_Input', 'Ratio_windowcov', 'pvalue']
        bed_file.write('\t'.join(header) + '\n')
        ipfile.readline()
        inputFile.readline()
        window_name_to_row = dict()
        for rowInput in inputFile:
            # print(rowInput)
            window_id_input = re.split('\t| *', rowInput)[0]  # rowInput.split('\t')[0]
            window_name_to_row[window_id_input] = rowInput
        # WindowId 0
        # Windowcov 1
        # RPM 2
        index = 1
        for row_ip in ipfile:
            window_id_ip = re.split('\t| *', row_ip)[0]  # row_ip.split('\t')[0]
            row_input = window_name_to_row[window_id_ip]
            row_input = row_input.replace(window_id_ip, '').strip()
            new_row = row_ip.strip() + '\t' + row_input.strip()
            # Calc ratio window
            window_cov = float(
                list(filter(None, re.split('\t| *', row_ip)))[1].strip())  # float(row_ip.split('\t')[1].strip())
            window_input_cov = float(
                re.split('\t| *', row_input)[0].strip())  # float(row_input.split('\t')[0].strip())
            if window_input_cov == 0:
                new_row += '\t' + str(window_cov)
            else:
                ratio_windows = window_cov / window_input_cov
                new_row += '\t' + str(ratio_windows)
            # Calc fisher-test
            # print('library size input')
            # print(library_size_input)
            if window_cov > window_cutoff:
                p = pvalue(int(window_cov), library_size_ip,
                           int(window_input_cov), library_size_input)
                new_row += '\t' + str(p.right_tail)
            else:
                new_row += '\t1'
            # write file
            bed_file.write(new_row + '\n')
            if index % 1000000 == 0:
                print('Windows ' + str(index), '/25000000')
            index += 1
    print("Fisher test calculated")
def fisherExact(line, idx):
    """Apply fisher exact test to appropriate columns of bed line. Columns are
    selected with the indexes in idx
    """
    ## cnt = [[int(line[3]), int(line[4])], [int(line[5]), int(line[6])]]
    fet = fisher.pvalue(int(line[idx[0]]), int(line[idx[1]]),
                        int(line[idx[2]]), int(line[idx[3]]))
    pvalues = [str(round(fet.left_tail, 4)), str(round(fet.right_tail, 4))]
    line.append('\t'.join(pvalues))
    return(line)
def heatmap_v2(chromosomes, pop_counts, num_variants, population_dict, frequency_range,
               exclude, p_value, muted_dir, tag='', output='pval', row=24, col=4, test='fisher'):
    '''
    pairwise comparison of count matrices. Chi2 applied cell-wise.
    p-value or proportion - output argument.
    - v2: count matrices are provided in pop_counts dictionary.
    '''
    if exclude:
        files = read_exclude()
    else:
        files = {}
    refpop, pop = list(pop_counts.keys())
    ratio_grid = np.zeros((row, col))
    sig_x, sig_y = [], []
    for i in range(row):
        for j in range(col):
            chi_array = np.array([
                [pop_counts[pop][i][j], num_variants[pop]],
                [pop_counts[refpop][i][j], num_variants[refpop]]
            ])
            chi_0 = np.sum(chi_array, axis=1)
            chi_1 = np.sum(chi_array, axis=0)
            if chi_0[0] == 0 or chi_0[1] == 0:
                ratio_grid[i][j] = np.nan
                sig_x.append(j + 0.5)
                sig_y.append(i + 0.5)
            elif chi_1[0] == 0 or chi_1[1] == 0:
                ratio_grid[i][j] = 1
            else:
                if test == 'chi2':
                    _, this_pval, _, _ = chi2_contingency(chi_array)
                else:
                    p = pvalue(pop_counts[pop][i][j], num_variants[pop],
                               pop_counts[refpop][i][j], num_variants[refpop])
                    this_pval = p.two_tail
                if output == 'pval':
                    ratio_grid[i][j] = this_pval
                else:
                    ratio_grid[i][j] = (pop_counts[pop][i][j] * num_variants[refpop]
                                        / (num_variants[pop] * pop_counts[refpop][i][j]))
                if this_pval < p_value:
                    sig_x.append(j + 0.5)
                    sig_y.append(i + 0.5)
    return ratio_grid, (sig_x, sig_y)
def calc_gene_overlap_pval(bic, bic2, N):
    g1 = bic["genes"]
    g2 = bic2["genes"]
    g1_g2 = len(g1.intersection(g2))
    g1_only = len(g1.difference(g2))
    g2_only = len(g2.difference(g1))
    p_val = pvalue(g1_g2, g1_only, g2_only,
                   N - g1_g2 - g1_only - g2_only).right_tail
    return p_val
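An illustrative call for `calc_gene_overlap_pval` above (example data only, not from the source): each bicluster is a dict with a `genes` set and `N` is the size of the gene universe.

bic_a = {"genes": set(["TP53", "BRCA1", "MYC", "EGFR"])}
bic_b = {"genes": set(["TP53", "MYC", "KRAS"])}
print(calc_gene_overlap_pval(bic_a, bic_b, N=20000))  # right-tailed overlap p-value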
def computeFisherExact():
    ## compute fisher's exact test, use sum for replicates
    for k in combined_circ_d:  ## for all unique circRNAs
        cc_1 = []
        cc_2 = []
        lc_1 = []
        lc_2 = []
        ufi = combined_circ_d[k][0]
        dfi = "noKey"
        if len(combined_circ_d[k]) == 2:  ## it has two introns listed
            dfi = combined_circ_d[k][1]  ## upstream and downstream flanking introns
        ## get cc_1 and lc_1
        for i in range(numC1):  ## for each replicate
            cVal = 0
            lVal = 0
            if k in c_dic[S1][i]:
                cVal = c_dic[S1][i][k]
            if ufi in lcs1[i]:  ## upstream junction count exists
                lVal = lcs1[i][ufi]
            if dfi in lcs1[i]:  ## downstream junction count exists
                lVal += lcs1[i][dfi]
            cc_1.append(cVal)
            lc_1.append(lVal)
        ## get cc_2 and lc_2
        for i in range(numC2):  ## for each replicate
            cVal = 0
            lVal = 0
            if k in c_dic[S2][i]:
                cVal = c_dic[S2][i][k]
            if ufi in lcs2[i]:  ## upstream junction count exists
                lVal = lcs2[i][ufi]
            if dfi in lcs2[i]:  ## downstream junction count exists
                lVal += lcs2[i][dfi]
            cc_2.append(cVal)
            lc_2.append(lVal)
        counts_d[k] = [cc_1, lc_1, cc_2, lc_2]
        ## print (cc_1, cc_2, lc_1, lc_2)
        n1 = 2 * sum(cc_1)
        n2 = sum(lc_1)
        n3 = 2 * sum(cc_2)
        n4 = sum(lc_2)
        p = fisher.pvalue(n1, n2, n3, n4)
        fp[k] = p.two_tail  ## saving p-value. it has p.left_tail, p.right_tail, and p.two_tail values
    for k in sorted(fp):
        sortedKey.append(k)  ## sort keys for fdr calculation
    logging.debug("Done computing two-tail fisher exact test")
def hypergeom(m, n, n1, n2):
    """
    From Fury et al., www.nslij-genetics.org/wli/pub/ieee-embs06.pdf

    :param m: overlapping genes
    :param n: total genes that could be sampled
    :param n1: number of genes in set 1
    :param n2: number of genes in set 2
    """
    return fisher.pvalue(*_fury_table(m, n, n1, n2)).right_tail
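The helper `_fury_table` is not shown in this collection. A plausible sketch (an assumption, not the original implementation) is the standard 2x2 decomposition of a set-overlap problem, which is also what the consistency check against R's `phyper` further below implies.

def _fury_table(m, n, n1, n2):
    # assumed layout: overlap, set1-only, set2-only, neither
    return (m, n1 - m, n2 - m, n - n1 - n2 + m)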
def count_F(s1, s2):
    if True:
        # return corrcoef([code[x] for x in s1], [code[x] for x in s2])
        l = len(s1)
        c = [x + 10 * y for x, y in zip(s1, s2)]
        x = []
        for n in [0, 1, 10, 11]:
            x.append(c.count(n))
        x = [float(y) for y in x]
        F = fisher.pvalue(x[0], x[1], x[2], x[3]).two_tail
        return F
def significance_on_tuple(sig_tuple):
    _, _, w1_occurrence, w2_occurrence, cooccurrences, n_docs = sig_tuple
    pvalue = fisher.pvalue(
        cooccurrences,
        w2_occurrence - cooccurrences,
        w1_occurrence - cooccurrences,
        (n_docs - w1_occurrence - w2_occurrence + cooccurrences))
    # pvalue = fisher.pvalue(cooccurrences, w2_occurrence-cooccurrences, w1_occurrence, (n_docs-w1_occurrence))
    # print sig_tuple, pvalue.left_tail, pvalue.right_tail
    # return (pvalue.left_tail, pvalue.right_tail, pvalue.two_tail)
    return pvalue.left_tail
def _fisherStrandBias(record):
    try:
        A = record.genotype('TUMOR')['ALT_F1R2']
        B = record.genotype('TUMOR')['ALT_F2R1']
        C = record.genotype('TUMOR')['REF_F1R2']
        D = record.genotype('TUMOR')['REF_F2R1']
        FSB = -math.log10(fisher.pvalue(A, B, C, D).two_tail)
    except:
        FSB = '.'
    return(FSB)
def _test_hypergeom(m, n, n1, n2):
    R_pval = r.phyper(min(n1, n2), n1, n - n1, n2)[0] \
        - r.phyper(m - 1, n1, n - n1, n2)[0]
    f_pval = fisher.pvalue(*_fury_table(m, n, n1, n2)).right_tail
    # at least to 10 sig figs
    R_str = ('%.10f' % R_pval)
    f_str = ('%.10f' % f_pval)
    print 'R:', R_str, 'Fisher:', f_str
    assert R_str == f_str
def hg_test(rlist1, rlist2, t_u, t_w, return_items=False):
    N = len(rlist1)
    overlap = set(rlist1[:t_u]).intersection(set(rlist2[:t_w]))
    overlap_size = len(overlap)
    p_val = pvalue(overlap_size, t_u - overlap_size, t_w - overlap_size,
                   N + overlap_size - t_u - t_w).right_tail
    enrichment = float(overlap_size) / (float((t_u) * (t_w)) / N)
    if return_items:
        return p_val, enrichment, overlap
    else:
        return p_val, enrichment, overlap_size
def find_threshold(t_u, t_w, N, significance_thr):
    '''Find min. possible overlap still passing the significance threshold,
    given t_u, t_w and N.'''
    prev_x = N
    for x in range(min(t_u, t_w), 0, -1):
        p_val = pvalue(x, t_u - x, t_w - x, N - t_u - t_w + x).right_tail
        # print(t_u, t_w, x, p_val)
        if p_val < significance_thr:
            prev_x = x
        else:
            break
    return prev_x
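A small hedged example for `find_threshold` above (numbers invented), assuming the function and the `fisher` package are importable: for two top-50 lists drawn from a universe of 1000 items, it reports the smallest overlap that is still significant at the chosen threshold.

min_overlap = find_threshold(t_u=50, t_w=50, N=1000, significance_thr=0.05)
print(min_overlap)  # smallest overlap x whose right-tail p-value is below 0.05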
def fisher_chi(study_c, study_p, control_c, control_p):
    """takes study count, population count, control count and control total"""
    if study_p <= 1000 and control_p <= 1000:
        p = fisher.pvalue(study_c, study_p, control_c, control_p)
        res = [p.two_tail]
    else:
        v = ro.IntVector([study_c, study_p, control_c, control_p])
        m = ro.r['matrix'](v, 2, 2)
        res = ro.r['chisq.test'](m)[2]
    enrichment = 'e' if 1.0 * study_c / study_p > 1.0 * control_c / control_p else 'p'
    return res[0], enrichment
def run_study_genome_v2(self, association_2_count_dict_foreground,
                        association_2_count_dict_background, foreground_n, background_n):
    """
    ###################################################
    # contingency table general variable names:
    #     foreground       | background        |
    # -------------------------------------------------
    # + a = foregr_count   | c = backgr_count  | r1
    # -------------------------------------------------
    # - b                  | d                 | r2
    # -------------------------------------------------
    #   foregr_n           | backgr_n          | n
    """
    fisher_dict = {}
    len_dict = len(association_2_count_dict_foreground)
    term_arr = np.empty(
        (len_dict, ), dtype=np.dtype('U13')
    )  # cat Functions_table_STRING.txt | cut -f 2 | awk '{print length, $0}' | sort -nr | head -1
    p_value_arr = np.zeros(shape=(len_dict, ), dtype="float64")
    foreground_count_arr = np.zeros(shape=(len_dict, ), dtype="int8")
    for i, (association, foreground_count) in enumerate(
            association_2_count_dict_foreground.items()):
        try:
            background_count = association_2_count_dict_background[association]
        except KeyError:
            self.args_dict[
                "ERROR_association_2_count"] = "ERROR retrieving counts for association {} please contact [email protected] with this error message".format(
                    association)
            return None
        a = foreground_count  # number of proteins associated with given GO-term
        b = foreground_n - foreground_count  # number of proteins not associated with GO-term
        c = background_count
        d = background_n - background_count
        if d < 0:
            d = 0
        ### enriched or overrepresented --> right_tail or greater (but foreground and background are switched)
        try:
            p_val_uncorrected = fisher_dict[(a, b, c, d)]
        except KeyError:
            p_val_uncorrected = pvalue(a, b, c, d).right_tail
            fisher_dict[(a, b, c, d)] = p_val_uncorrected
        term_arr[i] = association
        p_value_arr[i] = p_val_uncorrected
        foreground_count_arr[i] = foreground_count
    df = pd.DataFrame()
    df["term"] = term_arr
    df["p_value"] = p_value_arr
    df["foreground_count"] = foreground_count_arr
    df = multiple_testing.BH_fast_v3(df)
    return df
def run_study_genome(self, association_2_count_dict_foreground,
                     association_2_count_dict_background, foreground_n, background_n):
    """
    ###################################################
    # contingency table general variable names:
    #     foreground       | background        |
    # -------------------------------------------------
    # + a = foregr_count   | c = backgr_count  | r1
    # -------------------------------------------------
    # - b                  | d                 | r2
    # -------------------------------------------------
    #   foregr_n           | backgr_n          | n
    """
    fisher_dict = {}
    term_list, description_list, p_value_list, foreground_count_list = [], [], [], []
    for association, foreground_count in association_2_count_dict_foreground.items():
        try:
            background_count = association_2_count_dict_background[association]
        except KeyError:
            self.args_dict[
                "ERROR_association_2_count"] = "ERROR retrieving counts for association {} please contact [email protected] with this error message".format(
                    association)
            return None
            # background_count = np.nan
        a = foreground_count  # number of proteins associated with given GO-term
        b = foreground_n - foreground_count  # number of proteins not associated with GO-term
        c = background_count
        d = background_n - background_count
        if d < 0:
            d = 0
        ### enriched or overrepresented --> right_tail or greater (but foreground and background are switched)
        try:
            p_val_uncorrected = fisher_dict[(a, b, c, d)]
        except KeyError:
            p_val_uncorrected = pvalue(a, b, c, d).right_tail
            # p_val_uncorrected = stats.fisher_exact([[a, b], [c, d]], alternative='greater')[1]
            fisher_dict[(a, b, c, d)] = p_val_uncorrected
        term_list.append(association)
        p_value_list.append(p_val_uncorrected)
        # foreground_ids_list.append(';'.join(self.association_2_ANs_dict_foreground[association]))  # !!! remove this and add infos after FDR filtering
        foreground_count_list.append(foreground_count)
    # create DataFrame from List compare time setup
    df = pd.DataFrame({
        "term": term_list,
        "p_value": p_value_list,
        # "foreground_ids": foreground_ids_list,  # do later
        "foreground_count": foreground_count_list
    })
    df = multiple_testing.BH_fast_v3(df)
    return df
def rtest(comps, genome, motif, rcounts, nums):
    # rcounts
    print "%s.%s.%s: fisher test on real and perm data" % (comps, genome, motif)
    val_class = rcounts.get("t", 0)
    val_control = rcounts.get("c", 0)
    # num_class = nums.get("t.all", 0)  # v_18
    # num_control = nums.get("c.all", 0)  # v_18
    num_class = nums.get("t", 0)  # v_17
    num_control = nums.get("c", 0)  # v_17
    num_all = float(num_class + num_control)
    val_all = float(val_class + val_control)
    fisher = pvalue(val_class, val_control, num_class - val_class, num_control - val_control).right_tail
    # information gain (g at the end)
    i1 = (num_class / num_all) * math.log(num_class / num_all, 2)
    i2 = (num_control / num_all) * math.log(num_control / num_all, 2)
    i = -(i1 + i2)
    c1_num = max(1, val_all)  # dont allow it to be 0
    if (val_class / c1_num) > 0 and val_control / c1_num > 0:
        c1_num = -((val_class / c1_num) * math.log(val_class / c1_num, 2) + (val_control / c1_num) * math.log(val_control / c1_num, 2))
        c1_num = val_all / num_all * c1_num
    else:
        c1_num = 0
    c2_num = max(1, float((num_class - val_class) + (num_control - val_control)))  # dont allow it to be 0
    if (num_class - val_class) / c2_num > 0 and (num_control - val_control) / c2_num > 0:
        c2_num = -(((num_class - val_class) / c2_num) * math.log((num_class - val_class) / c2_num, 2) + ((num_control - val_control) / c2_num) * math.log((num_control - val_control) / c2_num, 2))
        c2_num = (num_all - val_all) / num_all * c2_num
    else:
        c2_num = 0
    ig = i - (c1_num + c2_num)
    p_emp = []
    for p in range(0, rnamotifs2.config.perms):
        val_class = rcounts.get("%s.p%s" % (event_class, p), 0)
        val_control = rcounts.get("c.p%s" % p, 0)
        num_class = rnamotifs2.perm.ec_dist[p].get(event_class, 0)
        num_control = rnamotifs2.perm.ec_dist[p].get("c", 0)
        val = pvalue(val_class, val_control, num_class - val_class, num_control - val_control).right_tail
        p_emp.append(val)
    return (fisher, p_emp, ig)
def fisher_right_tail(intersection_size, gse_size, query_size, module_size):
    """
    :type query_size: int
    :type gse_size: int
    :type intersection_size: int
    :type module_size: int
    """
    a = intersection_size
    b = module_size - a
    c = query_size - a
    d = gse_size - query_size - b
    return fisher.pvalue(a, b, c, d).right_tail
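An illustrative call for `fisher_right_tail` above (example counts only, not from the source): a query of 40 genes hits 12 members of a 100-gene module inside a 10,000-gene experiment.

p = fisher_right_tail(intersection_size=12, gse_size=10000, query_size=40, module_size=100)
print(p)  # right-tail p-value for enrichment of the query in the module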
def main():
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    go_file = options.go
    gene_file = options.gene
    output = options.output
    pvalue_thresh = float(options.pvalue)
    if output:
        fh_out = open(output, 'w')
    else:
        fh_out = sys.stdout
    verbose = options.verbose
    debug = options.debug
    #-----------------------------------
    annoGeneD = set([j for i in open(go_file)
                     for j in i.split('\t')[2].split(',')])
    geneD = set([i.strip() for i in open(gene_file)])
    geneD = geneD.intersection(annoGeneD)
    geneD_len = len(geneD)
    #------------------------------------
    #--------------------------------
    annoL = []
    header = 1
    for line in open(go_file):
        lineL = line.strip().split('\t')
        if header:
            print >>fh_out, "%s\t%s\tTargetGene\tTargetCount\tTargetTotal\tp\tfracT" \
                % (lineL[0], lineL[1])
            header -= 1
            continue
        # lineL = line.split('\t')
        go_gene = lineL[2].split(',')
        anno_gene = [gene for gene in go_gene if gene in geneD]
        if anno_gene:
            termCount = int(lineL[3])
            totalCount = int(lineL[4])
            annoCount = len(anno_gene)
            p = pvalue(annoCount, termCount - annoCount,
                       geneD_len - annoCount,
                       totalCount - geneD_len + annoCount - termCount)
            p = p.two_tail
            fracT = annoCount * 1.0 / geneD_len / termCount * totalCount
            if fracT > 1 and p <= pvalue_thresh:
                print >>fh_out, "%s\t%s\t%s\t%d\t%d\t%f\t%f" \
                    % (lineL[0], lineL[1], ','.join(anno_gene), annoCount,
                       geneD_len, p, fracT)
    #-------------END reading file----------
    if output:
        fh_out.close()
        os.system("multipleTest.sh -f %s" % output)
    if verbose:
        print >>sys.stderr, \
            "--Successful %s" % strftime(timeformat, localtime())
def _fisher(rule):
    '''
    Fisher's p-value for one rule.
    '''
    N = float(len(rule.kb.examples))
    nX = float(rule.coverage)
    nY = rule.kb.distribution[rule.target]
    nXY = rule.distribution[rule.target]
    nXnotY = nX - nXY
    nnotXY = nY - nXY
    nnotXnotY = N - nXnotY - nnotXY
    return pvalue(nXY, nXnotY, nnotXY, nnotXnotY)
def calc_overlap_pval_J(bic, bic2, all_samples):
    s1 = bic["samples"]
    s2 = bic2["samples"]
    if bic["direction"] != bic2["direction"]:
        s2 = all_samples.difference(s2)
    s1_s2 = len(s1.intersection(s2))
    s1_only = len(s1.difference(s2))
    s2_only = len(s2.difference(s1))
    p_val = pvalue(s1_s2, s1_only, s2_only,
                   len(all_samples) - s1_s2 - s1_only - s2_only).right_tail
    J = 1.0 * s1_s2 / (s1_s2 + s1_only + s2_only)
    return p_val, J
def filtering(control_file, affected_file, filtered_control_file, filtered_affected_file,
              max_pvalue=None, min_cov=None, max_cov=None, min_delta_methylation=None,
              filter_quantil=None):
    control_quantil = None
    affected_quantil = None
    if filter_quantil:
        control_quantil = mquantiles(
            np.loadtxt(control_file, delimiter='\t', usecols=(3,)),
            prob=[filter_quantil])[0]
        affected_quantil = mquantiles(
            np.loadtxt(affected_file, delimiter='\t', usecols=(3,)),
            prob=[filter_quantil])[0]
    non_filtered_sites = 0
    for site_counter, (control_line, affected_line) in enumerate(
            izip(open(control_file), open(affected_file))):
        c_chrom, c_start, c_end, c_cov, c_meth, c_strand = control_line.strip().split('\t')
        a_chrom, a_start, a_end, a_cov, a_meth, a_strand = affected_line.strip().split('\t')
        try:
            assert(c_chrom == a_chrom)
            assert(c_start == a_start)
            assert(c_end == a_end)
            assert(c_strand == a_strand)
        except AssertionError:
            sys.exit('That file needs intersected inputfiles, so that each site is present in both files, affected and control.\n %s : %s \n %s : %s \n %s : %s \n %s : %s \n' % (c_chrom, a_chrom, c_start, a_start, c_end, a_end, c_strand, a_strand))
        c_cov, c_meth, a_cov, a_meth = map(float, [c_cov, c_meth, a_cov, a_meth])
        if min_cov != None and (a_cov < min_cov or c_cov < min_cov):
            continue
        if max_cov != None and (a_cov > max_cov or c_cov > max_cov):
            continue
        if min_delta_methylation != None and abs(a_meth - c_meth) < min_delta_methylation:
            continue
        if filter_quantil and (c_cov > control_quantil or a_cov > affected_quantil):
            continue
        if max_pvalue != None:
            control_methylated = c_cov * c_meth / 100
            control_unmethylated = c_cov - control_methylated
            affected_methylated = a_cov * a_meth / 100
            affected_unmethylated = a_cov - affected_methylated
            try:
                # Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
                p = fisher_exact.pvalue(control_methylated, control_unmethylated,
                                        affected_methylated, affected_unmethylated)
                pvalue = p.two_tail
            except:
                oddsratio, pvalue = stats.fisher_exact(
                    [(control_methylated, control_unmethylated),
                     (affected_methylated, affected_unmethylated)],
                    alternative='two-sided')
            if pvalue > max_pvalue:
                continue
        non_filtered_sites += 1
        filtered_control_file.write(control_line)
        filtered_affected_file.write(affected_line)
    sys.stdout.write("%s from %s filtered.\n" % (site_counter + 1 - non_filtered_sites, site_counter + 1))
    filtered_affected_file.close()
    filtered_control_file.close()
def main():
    lensysargv = len(sys.argv)
    if lensysargv < 4:
        print >>sys.stderr, "Print the result to screen"
        print >>sys.stderr, 'Using python %s filename total_first_col total_second_col two_tail[left_tail, right_tail] head[number of lines needs to skip, default 1] divide_a_value[used when your count larger than 4294967296]' % sys.argv[0]
        sys.exit(0)
    #-----------------------------------
    file = sys.argv[1]
    total_first_col = int(sys.argv[2])
    total_second_col = int(sys.argv[3])
    # print total_first_col, total_second_col
    if lensysargv > 4:
        tail = sys.argv[4]
    else:
        tail = "two_tail"
    if lensysargv > 5:
        head = int(sys.argv[5])
    else:
        head = 1
    if lensysargv > 6:
        scale = int(sys.argv[6])
        total_first_col = total_first_col / scale
        total_second_col = total_second_col / scale
    else:
        scale = 1
    #-----------------------------------------------
    for line in open(file):
        line = line.rstrip()
        if head:
            print "%s\t%s" % (line, 'p')
            head -= 1
            continue
        #----------------------------------
        lineL = line.split()
        q = int(lineL[1]) / scale
        m = int(lineL[2]) / scale
        if q == 0 and m == 0:
            continue
        p = pvalue(q, m, total_first_col - q, total_second_col - m)
        if tail == 'two_tail':
            print "%s\t%s" % (line, p.two_tail)
        elif tail == 'left_tail':
            print "%s\t%s" % (line, p.left_tail)
        elif tail == 'right_tail':
            print "%s\t%s" % (line, p.right_tail)
def temporal_scan(
        baseline_filters, target_filters, analysis_start, analysis_end,
        keylist=None, cur_window=7, ref_window=91, lag=0,
        constant_baseline=False, index=None, time_field=None):
    start = None
    end = None
    if EventDetector.cfg == None:
        EventDetector.load_configuration('config/tad.cfg')
    if start is None:
        start = analysis_start - dt.timedelta(days=cur_window + lag + ref_window - 1)
    if end is None:
        end = analysis_end
    counts = EventDetector.get_counts(
        start, end, baseline_filters, target_filters, keylist,
        index, time_field, constant_baseline)
    if isinstance(counts, str):
        raise Exception(counts)
    elif len(counts) == 0:
        raise Exception('ERROR: No results returned. Valid analysis range specified?')
    kernel_ref = np.ones(ref_window)
    kernel_cur = np.ones(cur_window)
    n_days = (analysis_end - analysis_start).days + 1
    baseline_ref = np.correlate(counts['baseline'], kernel_ref)[:n_days]
    target_ref = np.correlate(counts['target'], kernel_ref)[:n_days]
    baseline_cur = np.correlate(counts['baseline'], kernel_cur)[-n_days:]
    target_cur = np.correlate(counts['target'], kernel_cur)[-n_days:]
    on_date = analysis_start
    results = []
    for si in xrange(n_days):
        p = pvalue(baseline_ref[si], target_ref[si], baseline_cur[si], target_cur[si])
        results.append([
            on_date, baseline_ref[si], target_ref[si],
            baseline_cur[si], target_cur[si],
            p.left_tail, p.two_tail, p.right_tail])
        on_date += dt.timedelta(days=1)
    return results
def test_discrete(a, b):
    # multiple classes, Fisher's exact test, followed by Bonferroni correction
    # returns the smallest p-value for all tested classes
    all_categories = set(flatten(a))
    pvalues = []
    for category in all_categories:
        # calculate number of items with this category
        a1, a0 = get_counts(a, category)
        b1, b0 = get_counts(b, category)
        # we are only interested in enrichment, so right_tail
        pvalue = fisher.pvalue(a1, a0, b1, b0).right_tail
        pvalues.append((pvalue, category))
    # Fisher's exact test plus Bonferroni correction of number of tests
    min_pvalue, min_category = min(pvalues)
    min_pvalue *= len(pvalues)
    return min_pvalue, min_category
def _enrichment(self, genes1, genes2, part=''):
    '''
    computes an enrichment test between two sets of lists of genes
    '''
    len_genes1 = len(genes1)
    len_genes2 = len(genes2)
    dico = self.gsea
    part = '|' + part
    # start fishers
    for ann, annot_genes in self.annot.iteritems():
        p1 = len(set(annot_genes) & genes1)
        p2 = len(set(annot_genes) & genes2)
        n1 = len_genes1 - p1
        n2 = len_genes2 - p2
        dico[ann + part] = {'p1': p1, 'n1': n1,
                            'p2': p2, 'n2': n2,
                            'pv': pvalue(p1, n1, p2, n2).two_tail,  # 3/4 of time spent here
                            'odd': _get_odd_ratio(p1, p2, n1, n2)}
def get_vocab(text, score, max_feats=750, max_feats2=200):
    """
    Uses a fisher test to find words that are significant in that they separate
    high scoring essays from low scoring essays.
    text is a list of input essays.
    score is a list of scores, with score[n] corresponding to text[n]
    max_feats is the maximum number of features to consider in the first pass
    max_feats2 is the maximum number of features to consider in the second (final) pass
    Returns a list of words that constitute the significant vocabulary
    """
    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
    dict_mat = dict.fit_transform(text)
    set_score = numpy.asarray(score, dtype=numpy.int)
    med_score = numpy.median(set_score)
    new_score = set_score
    if(med_score == 0):
        med_score = 1
    new_score[set_score < med_score] = 0
    new_score[set_score >= med_score] = 1
    fish_vals = []
    for col_num in range(0, dict_mat.shape[1]):
        loop_vec = dict_mat.getcol(col_num).toarray()
        good_loop_vec = loop_vec[new_score == 1]
        bad_loop_vec = loop_vec[new_score == 0]
        good_loop_present = len(good_loop_vec[good_loop_vec > 0])
        good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
        bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
        bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])
        fish_val = pvalue(good_loop_present, bad_loop_present,
                          good_loop_missing, bad_loop_missing).two_tail
        fish_vals.append(fish_val)
    cutoff = 1
    if(len(fish_vals) > max_feats2):
        cutoff = sorted(fish_vals)[max_feats2]
    good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1])
                               if fish_vals[num] <= cutoff])
    getVar = lambda searchList, ind: [searchList[i] for i in ind]
    vocab = getVar(dict.get_feature_names(), good_cols)
    return vocab
def allelic_association_comb(self, phenotype_list, genotype_list):
    associations = []
    allele_list = []
    alleles = set()
    for g in genotype_list:
        if g == "":
            allele_list.append(["", ""])
            alleles.add("")
        else:
            allele_list.append(g.split(","))
            for al in g.split(","):
                alleles.add(al)
    for a_al in alleles:
        if True:
            a_al_case = 0
            a_al_control = 0
            b_al_case = 0
            b_al_control = 0
            for phenotype, genotype in zip(phenotype_list, allele_list):
                for allele in genotype:
                    if allele == a_al:
                        if phenotype == 1:
                            a_al_case += 1
                        else:
                            a_al_control += 1
                    if genotype != a_al:
                        if phenotype == 1:
                            b_al_case += 1
                        else:
                            b_al_control += 1
            p_val = pvalue(a_al_case, a_al_control, b_al_case, b_al_control)
            associations.append({
                "a_al": a_al,
                "b_al": "All Others",
                "p_val": round(p_val.two_tail, 4)
            })
    return associations
def get_vocab(self, input_text, input_scores):
    train_mat = self.vectorizer1.transform(input_text)
    input_score_med = np.median(input_scores)
    new_scores = [0 if i <= input_score_med else 1 for i in input_scores]
    pvalues = []
    for i in xrange(0, train_mat.shape[1]):
        lcol = np.asarray(train_mat.getcol(i).todense().transpose())[0]
        good_lcol = lcol[[n for n in xrange(0, len(new_scores)) if new_scores[n] == 1]]
        bad_lcol = lcol[[n for n in xrange(0, len(new_scores)) if new_scores[n] == 0]]
        good_lcol_present = len(good_lcol[good_lcol > 0])
        good_lcol_missing = len(good_lcol[good_lcol == 0])
        bad_lcol_present = len(bad_lcol[bad_lcol > 0])
        bad_lcol_missing = len(bad_lcol[bad_lcol == 0])
        pval = pvalue(good_lcol_present, bad_lcol_present,
                      good_lcol_missing, bad_lcol_missing)
        pvalues.append(pval.two_tail)
    col_inds = list(xrange(0, train_mat.shape[1]))
    p_frame = pd.DataFrame(np.array([col_inds, pvalues]).transpose(), columns=["inds", "pvalues"])
    p_frame = p_frame.sort(['pvalues'], ascending=True)
    getVar = lambda searchList, ind: [searchList[int(i)] for i in ind]
    vocab = getVar(self.vectorizer1.get_feature_names(), p_frame['inds'][:2000])
    return vocab