def motif_corr(motif, n=1000):
    """find correlated columns in motif, correcting for multiple hypothesis testing"""
    ps = [mi_permute(col1, col2, p_value=True, n=n,
                     mi_method=lambda xs, ys: mi(xs, ys, correct=False))
          for (col1, col2) in choose2(transpose(motif))]
    q = fdr(ps)
    if q is None:
        return None
    else:
        L = len(motif[0])
        return [((i, j), p) for (i, j), p in zip(choose2(range(L)), ps) if p <= q]
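# choose2 and fdr are assumed project helpers; minimal sketches of plausible
# implementations (not necessarily the originals): choose2 yields all unordered pairs,
# and fdr returns a Benjamini-Hochberg p-value cutoff (None if nothing passes).
def choose2_sketch(xs):
    """All unordered pairs (xs[i], xs[j]) with i < j."""
    xs = list(xs)
    return [(xs[i], xs[j]) for i in range(len(xs)) for j in range(i + 1, len(xs))]

def fdr_sketch(ps, alpha=0.05):
    """Benjamini-Hochberg cutoff: largest p-value that passes, or None."""
    m = len(ps)
    passing = [p for k, p in enumerate(sorted(ps), start=1) if p <= alpha * k / float(m)]
    return max(passing) if passing else None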
def pairwise_mi_ref(ps, M=None):
    w = int(log(len(ps), 4))
    dimer_freqs = defaultdict(lambda: np.zeros(16))
    dimer_base_index = {"".join(comb): k for k, comb in enumerate(product("ACGT", "ACGT"))}
    psfm = marginalize(ps, M)
    for k, (kmer, p) in enumerate(zip(make_kmers(w), ps)):
        for (i, j) in choose2(range(w)):
            dimer = kmer[i] + kmer[j]
            dimer_freqs[(i, j)][dimer_base_index[dimer]] += p
    return sum(dimer_freqs[(i, j)][k] * log2(dimer_freqs[(i, j)][k] / (psfm[i][k // 4] * psfm[j][k % 4]))
               for k in range(16)
               for (i, j) in choose2(range(w)))
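# make_kmers and marginalize are assumed helpers; plausible sketches (the role of the M
# argument to marginalize is not clear from this function, so it is ignored here):
from itertools import product as iter_product
from math import log
import numpy as np

def make_kmers_sketch(w):
    """All 4**w DNA words of length w, in the lexicographic order assumed above."""
    return ["".join(comb) for comb in iter_product("ACGT", repeat=w)]

def marginalize_sketch(ps, M=None):
    """Collapse a distribution over w-mers into per-column base frequencies (a PSFM)."""
    w = int(log(len(ps), 4))
    psfm = np.zeros((w, 4))
    base_index = {b: i for i, b in enumerate("ACGT")}
    for kmer, p in zip(make_kmers_sketch(w), ps):
        for col, b in enumerate(kmer):
            psfm[col][base_index[b]] += p
    return psfm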
def analyze_motif(motif, trials=1000):
    cols = transpose(motif)
    L = len(cols)
    ps = []
    for col1, col2 in choose2(cols):
        actual_mi = dna_mi(col1, col2)
        perm_mis = [dna_mi(col1, permute(col2)) for i in xrange(trials)]
        p = percentile(actual_mi, perm_mis)
        # print p
        ps.append(p)
    q = fdr(ps)
    correlated_pairs = [(i, j) for (i, j), p in zip(choose2(range(L)), ps) if p < q]
    num_correlated = len(correlated_pairs)
    print "correlated column pairs:", num_correlated, "%1.2f" % (num_correlated / choose(L, 2))
    return correlated_pairs
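# dna_mi, permute, and percentile are assumed permutation-test helpers; rough sketches
# of what they likely do (not necessarily the originals):
import random
from math import log
from collections import Counter

def dna_mi_sketch(col1, col2):
    """Plug-in mutual information (bits) between two aligned DNA columns."""
    n = float(len(col1))
    joint = Counter(zip(col1, col2))
    m1, m2 = Counter(col1), Counter(col2)
    return sum((c / n) * log((c / n) / ((m1[b1] / n) * (m2[b2] / n)), 2)
               for (b1, b2), c in joint.items())

def permute_sketch(xs):
    """Shuffled copy of a column."""
    xs = list(xs)
    random.shuffle(xs)
    return xs

def percentile_sketch(x, xs):
    """One-sided p-value: fraction of permuted statistics >= the observed value."""
    return sum(1 for y in xs if y >= x) / float(len(xs))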
def expected_mi(L, k):
    """expected MI between two columns, given site length L and k mismatches per site"""
    L = float(L)
    q = k / L  # prob mismatch
    p = 1 - q
    match = lambda b: b == "A"
    mismatch = lambda b: b != "A"

    def joint(b1, b2):
        if match(b1):
            if match(b2):
                return (L - k) / L * (L - k - 1) / (L - 1)
            else:
                return (L - k) / L * k / (L - 1) / 3.0
        else:
            if match(b2):
                return k / L * (L - k) / (L - 1) / 3.0
            else:
                return k / L * (k - 1) / (L - 1) / 9.0

    def marg(b):
        return p if match(b) else q / 3

    return sum(joint(b1, b2) * log2(joint(b1, b2) / (marg(b1) * marg(b2))) if joint(b1, b2) else 0
               for b1, b2 in choose2("ACGT"))
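# Hypothetical sanity checks for expected_mi.  Note that the sum above runs over
# choose2("ACGT"), i.e. the six unordered pairs of distinct bases; if the full MI over
# all 16 ordered base pairs were intended, the iteration would be product("ACGT", repeat=2).
def expected_mi_examples():
    """With k = 0 mismatches the columns are perfectly conserved, so expected MI is 0."""
    assert abs(expected_mi(10, 0)) < 1e-12
    return expected_mi(10, 3)  # small positive value from sampling-without-replacement dependence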
def interpret_main_experiment(results_dict):
    taus = sorted(results_dict.keys())
    print taus
    data = [(tau, f, motif_ic(extract_sites(s)), total_motif_mi(extract_sites(s)))
            for tau in taus
            for (s, f) in results_dict[tau][0]]
    cols = transpose(data)
    names = "tau,f,motif_ic,total_motif_mi".split(",")
    for (i, name1), (j, name2) in choose2(list(enumerate(names))):
        xs = cols[i]
        ys = cols[j]
        print name1, name2, pearsonr(xs, ys), spearmanr(xs, ys)
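# total_motif_mi is assumed to be the sum of pairwise column MIs over a motif; a plausible
# sketch in terms of helpers used elsewhere in this file (dna_mi, choose2, transpose):
def total_motif_mi_sketch(motif):
    cols = transpose(motif)
    return sum(dna_mi(col1, col2) for col1, col2 in choose2(cols))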
def analyze_column_frequencies():
    """Do columnwise frequencies reveal stable patterns that could be explained by
    amino acid preferences?"""
    def dna_freqs(xs):
        return [xs.count(b) / float(len(xs)) for b in "ACGT"]
    all_freqs = concat([map(dna_freqs, transpose(getattr(tfdf_obj, tf))) for tf in tfdf_obj.tfs])
    cols = transpose(all_freqs)
    for k, (i, j) in enumerate(choose2(range(4))):
        plt.subplot(4, 4, k + 1)  # subplot indices are 1-based
        plt.scatter(cols[i], cols[j])
def get_pairwise_freqs(motif, pc=1/16.0):
    cols = transpose(motif)
    L = len(cols)
    N = len(motif)
    fs = [{(b1, b2): 0 for (b1, b2) in dinucs} for _ in range(int(choose(L, 2)))]
    for f, (col1, col2) in zip(fs, choose2(cols)):
        for b1, b2 in zip(col1, col2):
            f[b1, b2] += 1
        for b1, b2 in dinucs:
            f[b1, b2] += pc
            f[b1, b2] /= float(N + 16 * pc)
    return fs
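# dinucs is assumed to be a module-level list of all 16 ordered base pairs, e.g.
# dinucs = [(b1, b2) for b1 in "ACGT" for b2 in "ACGT"].  A hypothetical usage sketch:
def pairwise_freqs_example():
    """Sanity check: each pseudocounted pairwise distribution should sum to ~1."""
    motif = ["ACGTACGTAC"] * 20  # toy motif: 20 identical 10-bp sites
    fs = get_pairwise_freqs(motif)
    return all(abs(sum(f.values()) - 1.0) < 1e-9 for f in fs)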
def analyze_all_pvals_at_once(org_obj=Escherichia_coli):
    """conclusion: fdr-adjusted p-values identify 25 significantly correlated
    column-pairs in 3753 pairwise tests (~0.7%)."""
    ps = [mi_permute(col1, col2, p_value=True, n=1000,
                     mi_method=lambda xs, ys: mi(xs, ys, correct=False))
          for tf in tqdm(org_obj.tfs)
          for (col1, col2) in choose2(transpose(getattr(org_obj, tf)))]
    q_bh = fdr(ps)
    q_bhy = bhy(ps)
    print "bh procedure: %s/%s" % (len(filter(lambda p: p <= q_bh, ps)), len(ps))
    print "bhy procedure: %s/%s" % (len(filter(lambda p: p <= q_bhy, ps)), len(ps))
    return ps
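# bhy is assumed to implement the Benjamini-Hochberg-Yekutieli procedure (BH with the
# harmonic-series correction for arbitrary dependence); a plausible sketch:
def bhy_sketch(ps, alpha=0.05):
    """BHY cutoff: largest p-value passing alpha * k / (m * c(m)), or None."""
    m = len(ps)
    c_m = sum(1.0 / i for i in range(1, m + 1))
    passing = [p for k, p in enumerate(sorted(ps), start=1) if p <= alpha * k / (m * c_m)]
    return max(passing) if passing else None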
def log_fitness_approx(matrix, motif, G, terms=2):
    n = len(motif)
    eps = [score_seq(matrix, site) for site in motif]
    fgs = [exp(-ep) for ep in eps]
    Zf = sum(fgs)
    Zb = Zb_from_matrix(matrix, G)
    Z = Zf + Zb
    zeroth_term = log(n + Zb) * (terms >= 0)
    first_term = (-1 / (n + Zb) * sum(eps)) * (terms >= 1)
    second_term = (1 / 2.0 * 1 / (n + Zb)**2 *
                   ((n + Zb - 1) * sum(ep**2 for ep in eps)
                    - sum(epi * epj for epi, epj in choose2(eps)))) * (terms >= 2)
    print zeroth_term, first_term, second_term
    # first_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps)))
    # second_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps)) + 1/2.0*1/(n+Zb)**2*((n)))
    return -sum(eps) - n * (zeroth_term + first_term + second_term)
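# score_seq and Zb_from_matrix are assumed energy-model helpers; plausible sketches,
# treating matrix as a list of per-position length-4 energy rows indexed by A,C,G,T:
from math import exp

def score_seq_sketch(matrix, site):
    """Total binding energy of a site under a position energy matrix."""
    base_index = {b: i for i, b in enumerate("ACGT")}
    return sum(row[base_index[b]] for row, b in zip(matrix, site))

def Zb_from_matrix_sketch(matrix, G):
    """Approximate background partition function: G random sites with independent
    uniform base frequencies, each contributing the mean Boltzmann weight."""
    mean_site_weight = 1.0
    for row in matrix:
        mean_site_weight *= sum(exp(-ep) for ep in row) / 4.0
    return G * mean_site_weight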
def analyze_composition_of_correlated_columns(obj, ps):
    p_idx = 0
    cor_adj_counts = defaultdict(int)
    cor_nonadj_counts = defaultdict(int)
    uncor_counts = defaultdict(int)
    fdr_cutoff = 0
    for tf in obj.tfs:
        motif = getattr(obj, tf)
        cols = transpose(motif)
        for (i, col1), (j, col2) in choose2(list(enumerate(cols))):
            if ps[p_idx] <= fdr_cutoff:
                print tf, i, j
                for pair in zip(cols[i], cols[j]):
                    if i + 1 == j:
                        cor_adj_counts[pair] += 1
                    else:
                        cor_nonadj_counts[pair] += 1
                # print mi_table(col1, col2)
            else:
                for pair in zip(cols[i], cols[j]):
                    uncor_counts[pair] += 1
            p_idx += 1
    cor_adj_N = float(sum(cor_adj_counts.values()))
    cor_nonadj_N = float(sum(cor_nonadj_counts.values()))
    uncor_N = float(sum(uncor_counts.values()))
    print "bases, adj, nonadj, noncor | adj freq, nonadj freq | noncor freq | adj OR, nonadj OR"
    # XXX split into adj_uncor, nonadj_uncor
    for b1, b2 in sorted(cor_adj_counts.keys()):
        cor_adj_freq = cor_adj_counts[(b1, b2)] / cor_adj_N
        cor_nonadj_freq = cor_nonadj_counts[(b1, b2)] / cor_nonadj_N
        uncor_freq = uncor_counts[(b1, b2)] / uncor_N
        cor_adj_OR = cor_adj_freq / uncor_freq
        cor_nonadj_OR = cor_nonadj_freq / uncor_freq
        _, adj_p, _, _ = stats.chi2_contingency(np.array([[uncor_N, uncor_counts[(b1, b2)]],
                                                          [cor_adj_N, cor_adj_counts[(b1, b2)]]]))
        _, non_adj_p, _, _ = stats.chi2_contingency(np.array([[uncor_N, uncor_counts[(b1, b2)]],
                                                              [cor_nonadj_N, cor_nonadj_counts[(b1, b2)]]]))
        print b1, b2, cor_adj_counts[b1, b2], cor_nonadj_counts[b1, b2], uncor_counts[b1, b2], "|", \
            fmt(cor_adj_freq), fmt(cor_nonadj_freq), "|", fmt(uncor_freq), "|", \
            fmt(cor_adj_OR), significance(adj_p), fmt(cor_nonadj_OR), significance(non_adj_p)
    return cor_adj_counts, cor_nonadj_counts, uncor_counts
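# fmt and significance are assumed formatting helpers; plausible sketches:
def fmt_sketch(x):
    """Compact fixed-point formatting for table output."""
    return "%1.3f" % x

def significance_sketch(p):
    """Conventional star notation for p-values."""
    return "***" if p < 0.001 else "**" if p < 0.01 else "*" if p < 0.05 else "ns"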
def site_sampling_methods_study(n=50, num_motifs=10, plot=True):
    """Validate that the three proposed sampling methods:
    - brute force
    - rejection sampling
    - Metropolis-Hastings
    do in fact sample from the same distribution."""
    L = 10
    sigma = 1
    matrix = sample_matrix(L, sigma)
    Ne = 5
    mu = -10
    print "bf"
    t0 = time.time()
    bf_motifs = [sample_motif_bf(matrix, mu, Ne, n, verbose=True) for i in trange(num_motifs)]
    bf_time = time.time() - t0
    print "ar"
    t0 = time.time()
    ar_motifs = [sample_motif_ar(matrix, mu, Ne, n) for i in range(num_motifs)]
    ar_time = time.time() - t0
    print "mh"
    t0 = time.time()
    mh_motifs = [sample_motif_mh(matrix, mu, Ne, n) for i in range(num_motifs)]
    mh_time = time.time() - t0
    icss = mmap(motif_ic, [bf_motifs, ar_motifs, mh_motifs])
    print "ics:", map(mean_ci, icss)
    print "time per motif:", [t / num_motifs for t in [bf_time, ar_time, mh_time]]
    if plot:
        plt.boxplot(icss)
    for xs, ys in choose2(icss):
        print mannwhitneyu(xs, ys)
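# mmap and mean_ci are assumed utility helpers; plausible sketches:
def mmap_sketch(f, xss):
    """Map f over each element of a list of lists."""
    return [map(f, xs) for xs in xss]

def mean_ci_sketch(xs, z=1.96):
    """Mean with a normal-approximation 95% confidence interval."""
    m = sum(xs) / float(len(xs))
    sd = (sum((x - m)**2 for x in xs) / float(len(xs) - 1))**0.5
    half = z * sd / (len(xs)**0.5)
    return (m, m - half, m + half)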
def Psi(xs):
    """return probability weight associated with configuration, up to Z.
    P(xs) = Psi(xs)/Z"""
    single_terms = product(psi_jjp(x) for x in xs)
    pair_terms = product(psi_jjp(xj, xjp) for (xj, xjp) in choose2(xs))
    return single_terms * pair_terms
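# Note: product here has to be a numerical product over the generator (it cannot be
# itertools.product, which pairwise_mi_ref uses for Cartesian products), and psi_jjp is
# assumed to accept either one argument (single-site factor) or two (pairwise factor).
# A minimal numerical-product helper, under those assumptions:
def product_sketch(xs):
    """Multiply the elements of an iterable together."""
    result = 1.0
    for x in xs:
        result *= x
    return result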
def score(model, site):
    return sum(wi[(b1, b2)] for wi, (b1, b2) in zip(model, choose2(site)))
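# A hypothetical usage sketch for the pairwise model scored above: model is assumed to be
# a list of {(b1, b2): weight} dicts, one per column pair, ordered like choose2(site).
def score_usage_example():
    toy_model = [{(b1, b2): 0.0 for b1 in "ACGT" for b2 in "ACGT"} for _ in range(3)]
    toy_model[0][("A", "C")] = -1.0   # pair (pos 0, pos 1) = (A, C) gets weight -1
    return score(toy_model, "ACG")    # -1.0: only the first pair term is nonzero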
def alo2(ps):
    return sum(ps) - sum(pi * pj for pi, pj in choose2(ps))
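# alo2 appears to be a second-order inclusion-exclusion approximation to the probability
# of at least one event under independence: P(any) ~ sum(p_i) - sum_{i<j} p_i * p_j.
# An exact reference for comparison (hypothetical helper, assuming independence):
def alo_exact(ps):
    """Exact P(at least one event): 1 - prod(1 - p_i)."""
    q = 1.0
    for p in ps:
        q *= (1.0 - p)
    return 1.0 - q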