def analyze_mi_tests(prok_tests, euk_tests):
    """Plot the proportion of significant column-pair tests against distance.

    For each kingdom a significance threshold is obtained from ``fdr`` over
    all of that kingdom's test values.  Tests are then grouped by the
    separation ``j - i`` of the two columns they compare, and for separations
    1..19 the fraction of tests at or below the threshold is drawn with
    bootstrap error bars on the current matplotlib axes.

    Args:
        prok_tests: per-motif lists of test values for the prokaryotic
            motifs.  Assumed to be ordered like ``choose2`` over each motif's
            columns, parallel to the module-level ``prok_motifs``.
        euk_tests: the same for the module-level ``euk_motifs``.

    Returns:
        None.  The function only draws on the current axes.
    """
    max_dist = 19  # separations 1..19 are plotted

    def binom_ci(xs):
        """return width of error bar"""
        if not xs:
            # Same convention as analyze_mi_tests2: an empty bin gets a
            # zero-width error bar instead of failing inside mean()/bs().
            return (0, 0)
        bs_means = sorted([mean(bs(xs)) for _ in range(1000)])
        mu = mean(xs)
        # Entries 25 and 975 of 1000 sorted resample means bracket the
        # central 95% of the resampling distribution.
        return (mu - bs_means[25], bs_means[975] - mu)

    def significant_by_distance(tests, motifs, q):
        """Group the flags ``t <= q`` by column separation in a single pass."""
        ds = [[j - i
               for (i, coli), (j, colj) in choose2(list(enumerate(transpose(motif))))]
              for motif in motifs]
        bins = [[] for _ in range(max_dist)]
        for t, d in zip(concat(tests), concat(ds)):
            if 1 <= d <= max_dist:
                bins[d - 1].append(t <= q)
        return bins

    prok_q = fdr(concat(prok_tests))
    euk_q = fdr(concat(euk_tests))
    prok_bins = significant_by_distance(prok_tests, prok_motifs, prok_q)
    euk_bins = significant_by_distance(euk_tests, euk_motifs, euk_q)

    # trange is kept so the slow bootstrap step still shows a progress bar.
    prok_cis = [binom_ci(prok_bins[i - 1]) for i in trange(1, max_dist + 1)]
    euk_cis = [binom_ci(euk_bins[i - 1]) for i in trange(1, max_dist + 1)]

    distances = range(1, max_dist + 1)
    plt.errorbar(distances,
                 [mean(xs) if xs else 0 for xs in prok_bins],
                 yerr=transpose(prok_cis),
                 label="Prokaryotic Motifs",
                 capthick=1)
    plt.errorbar(distances,
                 [mean(xs) if xs else 0 for xs in euk_bins],
                 yerr=transpose(euk_cis),
                 label="Eukaryotic Motifs",
                 capthick=1)
    plt.xlabel("Distance (bp)", fontsize="large")
    plt.ylabel("Proportion of Significant Correlations", fontsize="large")
    plt.legend(fontsize='large')
def on_off_experiment2(num_motifs=100, filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """compare MI vs Gini on biological_motifs

    For every motif listed in ``tfdf.tfs`` a set of on-off spoof motifs is
    generated with ``spoof_on_off_motif``.  ``motif_gini`` and
    ``total_motif_mi`` are evaluated on each spoof, and the Pearson
    correlation between the two statistics is computed per biological motif.
    The correlation coefficients are scattered against their p-values on a
    log y-axis, with a dashed line at the threshold returned by ``fdr``.

    NOTE: the same function name is defined again further down this file;
    when the whole module is loaded the later definition is the one that
    stays bound.

    Args:
        num_motifs: forwarded to ``spoof_on_off_motif``.
        filename: forwarded to ``maybesave`` once the figure is drawn.

    Returns:
        None.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
                for motif in bio_motifs]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    cors, ps = [], []
    # Iterate over the statistics computed above; the original looped over
    # `ginises`/`mises`, names this function never defines.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # `filename` is not a scatter property, so it is no longer passed here;
    # saving is handled by maybesave() at the end.
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q], linestyle='--', label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 2])
    plt.title("Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs")
    maybesave(filename)
def make_correlation_structure_by_length():
    """Draw one distance-vs-significance curve per order of magnitude of motif length.

    The module-level ``euk_motifs`` are split by ``len(motif)`` into the bins
    [10, 100), [100, 1000), [1000, 10000) and [10000, 100000).  Each bin's
    tests and motifs are handed to ``analyze_mi_tests2`` together with a
    threshold computed by ``fdr`` over all of ``euk_tests``.
    """
    q = fdr(concat(euk_tests))
    plt.close()  # close the current figure before the curves are drawn

    motif_sizes = [len(motif) for motif in euk_motifs]
    index_groups = []
    for exponent in range(1, 5):
        lower = 10**exponent
        upper = 10**(exponent + 1)
        index_groups.append(
            indices_where(motif_sizes, lambda size: lower <= size < upper))

    for position, js in tqdm(enumerate(index_groups)):
        # The first group holds lengths from 10**1 upward, hence position + 1.
        curve_label = "10**%s" % (position + 1)
        analyze_mi_tests2(rslice(euk_tests, js),
                          rslice(euk_motifs, js),
                          label=curve_label,
                          q=q)
def make_correlation_structure_by_cluster_figure():
    """Draw one distance-vs-significance curve per (rounded mean) cluster count.

    ``cluster_motif`` is run three times over every motif in the module-level
    ``euk_motifs``.  For each motif the lengths of the three results are
    averaged and rounded, and motifs whose rounded mean is 1..5 are grouped.
    Each group is passed to ``analyze_mi_tests2`` with a threshold computed
    by ``fdr`` over all of ``euk_tests``.
    """
    from motif_clustering import cluster_motif

    q = fdr(concat(euk_tests))

    clustering_runs = []
    for _ in range(3):
        clustering_runs.append([cluster_motif(motif) for motif in tqdm(euk_motifs)])
    plt.close()  # get rid of output from cluster_motif

    # One row per run, one column per motif; transpose to get per-motif triples.
    counts_per_run = [[len(clusters) for clusters in run] for run in clustering_runs]
    mean_lens = [round(mean(per_motif)) for per_motif in transpose(counts_per_run)]

    index_groups = [indices_where(mean_lens, lambda x: x == target)
                    for target in range(1, 6)]
    for position, js in tqdm(enumerate(index_groups)):
        analyze_mi_tests2(rslice(euk_tests, js),
                          rslice(euk_motifs, js),
                          label=str(position + 1),
                          q=q)
def analyze_motif(motif, trials=1000):
    """Find column pairs of a motif whose mutual information is significant.

    Every pair of columns (as produced by ``choose2`` over
    ``transpose(motif)``) is scored with ``dna_mi`` and compared, via
    ``percentile``, against ``trials`` scores obtained after permuting the
    second column.  The resulting values are thresholded with ``fdr``.

    NOTE: the same function name is defined again further down this file;
    when the whole module is loaded the later definition is the one that
    stays bound.

    Args:
        motif: collection of sites; ``transpose(motif)`` must yield its columns.
        trials: number of permutations per column pair.

    Returns:
        List of ``(i, j)`` column-index pairs whose value is below the
        ``fdr`` threshold.  A one-line summary is also printed.
    """
    cols = transpose(motif)
    L = len(cols)
    ps = []
    for col1, col2 in choose2(cols):
        actual_mi = dna_mi(col1, col2)
        perm_mis = [dna_mi(col1, permute(col2)) for _ in range(trials)]
        ps.append(percentile(actual_mi, perm_mis))
    q = fdr(ps)
    correlated_pairs = [(i, j)
                        for (i, j), p in zip(choose2(range(L)), ps)
                        if p < q]
    num_correlated = len(correlated_pairs)
    # Force true division (an int-returning choose() would floor the ratio
    # to 0 under Python 2) and avoid dividing by zero when L < 2.
    num_pairs = choose(L, 2)
    fraction = num_correlated / float(num_pairs) if num_pairs else 0.0
    print("correlated column pairs: %s %1.2f" % (num_correlated, fraction))
    return correlated_pairs
def analyze_motif(motif, trials=1000):
    """Find column pairs of a motif whose mutual information is significant.

    Every pair of columns (as produced by ``choose2`` over
    ``transpose(motif)``) is scored with ``dna_mi`` and compared, via
    ``percentile``, against ``trials`` scores obtained after permuting the
    second column.  The resulting values are thresholded with ``fdr``.

    NOTE: an earlier definition with the same name exists in this file; this
    later one is the definition that stays bound once the module is loaded.

    Args:
        motif: collection of sites; ``transpose(motif)`` must yield its columns.
        trials: number of permutations per column pair.

    Returns:
        List of ``(i, j)`` column-index pairs whose value is below the
        ``fdr`` threshold.  A one-line summary is also printed.
    """
    cols = transpose(motif)
    L = len(cols)

    ps = []
    for col1, col2 in choose2(cols):
        actual_mi = dna_mi(col1, col2)
        perm_mis = [dna_mi(col1, permute(col2)) for _ in range(trials)]
        ps.append(percentile(actual_mi, perm_mis))

    q = fdr(ps)
    correlated_pairs = [
        (i, j) for (i, j), p in zip(choose2(range(L)), ps) if p < q
    ]
    num_correlated = len(correlated_pairs)

    # Force true division (an int-returning choose() would floor the ratio
    # to 0 under Python 2) and avoid dividing by zero when L < 2.
    num_pairs = choose(L, 2)
    if num_pairs:
        fraction = num_correlated / float(num_pairs)
    else:
        fraction = 0.0
    print("correlated column pairs: %s %1.2f" % (num_correlated, fraction))
    return correlated_pairs
def analyze_mi_tests2(tests, motifs, q=None, label=None):
    """Plot the proportion of significant column-pair tests against distance.

    Tests are grouped by the separation ``j - i`` of the two columns they
    compare.  For separations 1..19 the fraction of tests at or below ``q``
    is drawn on the current matplotlib axes with bootstrap error bars.

    Args:
        tests: per-motif lists of test values.  Assumed to be ordered like
            ``choose2`` over each motif's columns, parallel to ``motifs``.
        motifs: the motifs the tests were computed from.
        q: significance threshold.  When None it is computed with ``fdr``
            over all values in ``tests``.  Previously a supplied value was
            always overwritten, so callers could not share one threshold
            across several subsets.
        label: legend label for the curve.

    Returns:
        None.  The function only draws on the current axes.
    """
    if q is None:
        q = fdr(concat(tests))

    ds = [[j - i
           for (i, coli), (j, colj) in choose2(list(enumerate(transpose(motif))))]
          for motif in motifs]

    def binom_ci(xs):
        """return width of error bar"""
        bs_means = sorted([mean(bs(xs)) for _ in range(1000)])
        mu = mean(xs)
        # Entries 25 and 975 of 1000 sorted resample means bracket the
        # central 95% of the resampling distribution.
        return (mu - bs_means[25], bs_means[975] - mu)

    # Single pass over the flattened tests instead of re-concatenating and
    # re-scanning them once per distance.
    tests_by_dist = [[] for _ in range(19)]
    for t, d in zip(concat(tests), concat(ds)):
        if 1 <= d <= 19:
            tests_by_dist[d - 1].append(t <= q)

    # Empty bins are drawn at 0 with a zero-width error bar.
    mean_vals = [mean(xs) if xs else 0 for xs in tests_by_dist]
    cis = [binom_ci(xs) if xs else (0, 0) for xs in tests_by_dist]

    plt.errorbar(range(1, 20), mean_vals, yerr=transpose(cis), label=label, capthick=1)
    plt.xlabel("Distance (bp)", fontsize="large")
    plt.ylabel("Proportion of Significant Correlations", fontsize="large")
    plt.legend()
def analyze_correlation_positions(all_tests, alpha="fdr"):
    """Collect relative positions of significant adjacent-column tests.

    For each motif's test list the number of columns ``L`` is recovered as
    the first value below 50 with ``choose(L, 2) == len(tests)``.  For every
    adjacent column pair ``(i, i + 1)`` whose test value is at or below
    ``alpha``, the relative position ``i / L`` is recorded, a uniformly drawn
    control position is recorded alongside it, and the point is scattered on
    the current matplotlib axes.

    Args:
        all_tests: per-motif lists of test values, assumed to be ordered like
            ``choose2(range(L))``.
        alpha: significance threshold, or the string ``"fdr"`` to compute it
            with ``fdr`` over every value in ``all_tests``.

    Returns:
        ``(ds, d_controls)``: the observed relative positions and the
        matching random control positions.

    Raises:
        ValueError: if a test list's length is not ``choose(L, 2)`` for any
            ``L`` in ``range(50)``.
    """
    if alpha == "fdr":
        alpha = fdr(concat(all_tests))
    print("alpha: %s" % (alpha,))

    ds = []
    d_controls = []
    for tests in all_tests:
        K = len(tests)
        L = find(lambda l: round(choose(l, 2)) == K, range(50))
        if L is None:
            # Carry the offending count in the exception rather than printing
            # it separately and raising an empty Exception.
            raise ValueError(
                "cannot infer motif width: %d tests is not choose(L, 2) "
                "for any L in range(50)" % K)
        for k, (i, j) in enumerate(choose2(range(L))):
            if j == i + 1 and tests[k] <= alpha:
                d = i / float(L)
                ds.append(d)
                # Control: a random adjacent-pair start index, same scale.
                d_controls.append(random.randrange(L - 1) / float(L))
                plt.scatter(d, tests[k])
    return ds, d_controls
def on_off_experiment2(num_motifs=100,
                       filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """compare MI vs Gini on biological_motifs

    For every motif listed in ``tfdf.tfs`` a set of on-off spoof motifs is
    generated with ``spoof_on_off_motif``.  ``motif_gini`` and
    ``total_motif_mi`` are evaluated on each spoof, and the Pearson
    correlation between the two statistics is computed per biological motif.
    The correlation coefficients are scattered against their p-values on a
    log y-axis, with a dashed line at the threshold returned by ``fdr``.

    NOTE: an earlier definition with the same name exists in this file; this
    later one is the definition that stays bound once the module is loaded.

    Args:
        num_motifs: forwarded to ``spoof_on_off_motif``.
        filename: forwarded to ``maybesave`` once the figure is drawn.

    Returns:
        None.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [
        spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
        for motif in bio_motifs
    ]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    cors, ps = [], []
    # Iterate over the statistics computed above; the original looped over
    # `ginises`/`mises`, names this function never defines.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # `filename` is not a scatter property, so it is no longer passed here;
    # saving is handled by maybesave() at the end.
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q],
             linestyle='--',
             label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 2])
    plt.title(
        "Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs"
    )
    maybesave(filename)
def _tally_canonical_digrams(tests_by_motif, motifs, q):
    """Count canonical dimers over all, adjacent, and significant adjacent column pairs."""
    all_counts = defaultdict(int)
    adj_counts = defaultdict(int)
    corr_counts = defaultdict(int)
    for tests, motif in tqdm(zip(tests_by_motif, motifs)):
        indexed_cols = list(enumerate(transpose(motif)))
        for test, ((i, coli), (j, colj)) in zip(tests, choose2(indexed_cols)):
            for bi, bj in transpose((coli, colj)):
                # Collapse each dimer with wc() of itself onto whichever of
                # the two sorts first, so both orientations share one key.
                rev_comp = tuple(wc((bi, bj)))
                if (bi, bj) > rev_comp:
                    bi, bj = rev_comp
                all_counts[(bi, bj)] += 1
                if j == i + 1:
                    adj_counts[(bi, bj)] += 1
                    # NOTE(review): nesting reconstructed from the
                    # "Corr. Adj." legend label -- significant pairs are
                    # counted only when the columns are adjacent; confirm.
                    if test <= q:
                        corr_counts[(bi, bj)] += 1
    return all_counts, adj_counts, corr_counts


def _digram_frequencies(counts, canonical_digrams):
    """Return (frequencies, 1.96-sigma binomial error bars) in canonical order."""
    total = float(sum(counts.values()))
    ps = normalize([counts[dg] for dg in canonical_digrams])
    yerr = [1.96 * sqrt(1.0 / total * p * (1 - p)) for p in ps]
    return ps, yerr


def _plot_digram_panel(subplot_code, canonical_digrams, series, palette, ylabel, ymax):
    """Draw one grouped bar panel (all / adjacent / correlated-adjacent)."""
    ax = plt.subplot(subplot_code)
    positions = list(range(len(canonical_digrams)))
    (all_ps, all_yerr), (adj_ps, adj_yerr), (corr_ps, corr_yerr) = series
    plt.bar([x - 0.2 for x in positions], all_ps,
            label="All Column-Pairs", width=0.2, yerr=all_yerr, color=palette[0])
    plt.bar(positions, adj_ps,
            label="Adj. Column-Pairs", width=0.2, yerr=adj_yerr, color=palette[1])
    plt.bar([x + 0.2 for x in positions], corr_ps, alpha=1, capstyle='butt',
            label="Corr. Adj. Column-Pairs", width=0.2, yerr=corr_yerr,
            color=palette[3])
    ax.set_xticks(positions)
    ax.set_xticklabels(["".join(dg) for dg in canonical_digrams], fontsize='large')
    plt.xlim(-0.5, 10.5)
    plt.ylim(0, ymax)
    plt.ylabel(ylabel, fontsize='large')
    plt.legend(loc='upper right')


def analyze_correlated_digrams_canonical(prok_tests, euk_tests, filename=None):
    """Plot canonical dimer frequencies for prokaryotic and eukaryotic motifs.

    Dimers are formed from the bases that each site contributes to a pair of
    motif columns, and each dimer is merged with its ``wc`` counterpart.
    Three frequency distributions are drawn per kingdom, each with 1.96-sigma
    binomial error bars: all column pairs, adjacent column pairs, and
    adjacent column pairs whose test value is at or below the ``fdr``
    threshold.  Prokaryotes are drawn in the top panel and eukaryotes in the
    bottom panel.

    Args:
        prok_tests: per-motif lists of test values, assumed parallel to the
            module-level ``prok_motifs`` and ordered like ``choose2`` over
            each motif's columns.
        euk_tests: the same for the module-level ``euk_motifs``.
        filename: forwarded to ``maybesave`` after both panels are drawn.

    Returns:
        None.
    """
    digrams = [(b1, b2) for b1 in "ACGT" for b2 in "ACGT"]
    canonical_digrams = sorted(set(min(dg, tuple(wc(dg))) for dg in digrams))

    prok_q = fdr(concat(prok_tests))
    euk_q = fdr(concat(euk_tests))

    prok_series = [
        _digram_frequencies(counts, canonical_digrams)
        for counts in _tally_canonical_digrams(prok_tests, prok_motifs, prok_q)
    ]
    euk_series = [
        _digram_frequencies(counts, canonical_digrams)
        for counts in _tally_canonical_digrams(euk_tests, euk_motifs, euk_q)
    ]

    palette = sns.cubehelix_palette(4)
    _plot_digram_panel(211, canonical_digrams, prok_series, palette,
                       "Prokaryotic Frequency", 0.3)
    _plot_digram_panel(212, canonical_digrams, euk_series, palette,
                       "Eukaryotic Frequency", 0.2)
    maybesave(filename)