Ejemplo n.º 1
0
def gini_pred_vs_obs_plot(filename=None):
    """Scatter predicted against observed Gini coefficients.

    Reads the module-level ``pred_ginis`` and ``obs_ginis`` sequences, adds a
    fixed r^2 annotation and a dashed y = x reference line, then passes
    ``filename`` to ``maybesave``.
    """
    plt.scatter(np.array(pred_ginis), np.array(obs_ginis))
    plt.xlabel("Predicted Gini coeff")
    plt.ylabel("Observed Gini coeff")
    # NOTE(review): the r^2 value is a hard-coded annotation, not computed here.
    plt.text(0.8, 0.2, "r^2 = 0.25")
    # Keyword was misspelled 'lienstyle', which matplotlib rejects as an
    # unknown Line2D property.
    plt.plot([0, 1], [0, 1], linestyle='--')
    maybesave(filename)
def plot_results_dict_gini_qq(results_dict,filename=None):
    bios = []
    maxents = []
    uniforms = []
    for i,k in enumerate(results_dict):
        g1,g2,tf = k.split("_")
        genome = g1 + "_" + g2
        bio_motif = extract_tfdf_sites(genome,tf)
        bio_ic = motif_ic(bio_motif)
        bio_gini = motif_gini(bio_motif)
        d = results_dict[k]
        bios.append(bio_gini)
        maxents.append(mean(d['maxent']['motif_gini']))
        uniforms.append(mean(d['uniform']['motif_gini']))
    plt.scatter(bios,maxents,label='ME')
    plt.scatter(bios,uniforms,label='TURS',color='g')
    minval = min(bios+maxents+uniforms)
    maxval = max(bios+maxents+uniforms)
    plt.plot([minval,maxval],[minval,maxval],linestyle='--')
    plt.xlabel("Observed Gini Coefficient")
    plt.ylabel("Mean Sampled Gini Coefficient")
    plt.legend(loc='upper left')
    print "bio vs maxent:",pearsonr(bios,maxents)
    print "bio vs uniform:",pearsonr(bios,uniforms)
    maybesave(filename)
Ejemplo n.º 3
0
def main(prok_motifs, euk_motifs, filename='motif_summary_stats.eps'):
    """Draw a seaborn pair plot of per-motif summary statistics.

    One row is built per motif (eukaryotic motifs first, then prokaryotic)
    with the columns 'Domain', 'N' (log base 10 of the motif length),
    'L (bp)' (length of the motif's first element), 'IC (bits)'
    (``motif_ic``) and 'IGC' (``motif_gini``).  The grid is coloured by
    'Domain' and the figure is passed to ``maybesave``.
    """
    sns.set(style="darkgrid", color_codes=True)

    # Row order must match the concatenation order below.
    domains = ["Eukaryotic"] * len(euk_motifs) + ["Prokaryotic"] * len(prok_motifs)
    all_motifs = euk_motifs + prok_motifs

    # Column insertion order fixes the row/column positions in the pair grid.
    stats = pd.DataFrame()
    stats['Domain'] = domains
    stats['N'] = [log(len(m)) / log(10) for m in all_motifs]
    stats['L (bp)'] = [len(m[0]) for m in all_motifs]
    stats['IC (bits)'] = [motif_ic(m) for m in all_motifs]
    stats['IGC'] = [motif_gini(m) for m in all_motifs]

    grid = sns.pairplot(stats,
                        hue='Domain',
                        markers=['s', 'o'],
                        palette='cubehelix')

    # Index 3 is the IGC column/row: clamp it to the same range on every panel.
    for k in range(4):
        grid.axes[k][3].set_xlim(-0.01, 0.6)
        grid.axes[3][k].set_ylim(-0.01, 0.6)

    # 'N' is stored as a base-10 exponent, so label its ticks as powers of ten.
    exponents = range(1, 5)
    power_labels = ["$10^%i$" % e for e in exponents]
    grid.axes[0][0].set_yticks(exponents)
    grid.axes[0][0].set_yticklabels(power_labels)
    grid.axes[3][0].set_xticks(exponents)
    grid.axes[3][0].set_xticklabels(power_labels)
    maybesave(filename)
def redo_ic_igc_plot(filename=None):
    xmin, xmax, ymin, ymax = 0, 0.6, 0, 0.6
    marker_size = 40
    sns.set_style('white')
    #sns.set_style('darkgrid')
    plt.subplot(1,2,1)
    # plt.xlim(0,0.4)
    # plt.ylim(0,0.6)
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    for x,y,p in zip(prok_maxent_ginis, prok_ginis, prok_patterns):
        plt.scatter(x,y,color=color_dict[p],marker=marker_dict[p],s=marker_size)
    plt.plot([0,1],[0,1],linestyle='--',color='black')
    print "prok maxent"
    print pearsonr(prok_maxent_ginis,prok_ginis)
    plt.xlabel("MaxEnt IGC",fontsize='large')
    plt.ylabel("Prokaryotic IGC",fontsize='large')
    plt.subplot(1,2,2)
    # plt.xlim(0,0.4)
    # plt.ylim(0,0.6)
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    print "euk maxent:"
    print pearsonr(euk_maxent_ginis,euk_ginis)
    for x,y,p in zip(euk_maxent_ginis, euk_ginis, euk_patterns):
        plt.scatter(x,y,color=color_dict[p],marker=marker_dict[p],s=marker_size)
    plt.plot([0,1],[0,1],linestyle='--',color='black')
    plt.ylabel("Eukaroytic IGC",fontsize='large')
    plt.xlabel("MaxEnt IGC",fontsize='large')
    maybesave(filename)
def sigma_Ne_contour_plot(filename=None):
    sigmas = np.linspace(0,5,20)
    Nes = np.linspace(1,20,20)
    L = 10
    n = 50
    copies = 10*n
    trials = 100
    motifss = [[[(sample_motif(sigma, Ne, L, copies, n))
               for i in range(trials)]
          for sigma in sigmas] for Ne in tqdm(Nes)]
    occ_M = [[expected_occupancy(sigma, Ne, L, copies)
          for sigma in sigmas] for Ne in tqdm(Nes)]
    print "ic_M"
    ic_M = mmap(lambda ms:mean(map(motif_ic,ms)),motifss)
    print "gini_M"
    gini_M = mmap(lambda ms:mean(map(motif_gini,ms)),motifss)
    print "mi_M"
    mi_M = mmap(lambda ms:mean(map(total_motif_mi,ms)),tqdm(motifss))
    plt.subplot(2,2,1)
    plt.contourf(sigmas,Nes,occ_M,cmap='jet')
    plt.colorbar()
    plt.subplot(2,2,2)
    plt.contourf(sigmas,Nes,ic_M,cmap='jet')
    plt.colorbar()
    plt.subplot(2,2,3)
    plt.contourf(sigmas,Nes,gini_M,cmap='jet')
    plt.colorbar()
    plt.subplot(2,2,4)
    plt.contourf(sigmas,Nes,mi_M,cmap='jet')
    plt.colorbar()
    maybesave(filename)
def interpret_biological_experiment2(results_dict,filename=None):
    """Box-plot spoofed statistics per TF with the biological value overlaid.

    ``results_dict`` is indexed as ``results_dict[tf][source][stat_name]``,
    where ``source`` is 'bio' or the name of a spoofing method.  The figure
    has one row per spoofing method and one column per statistic (both in
    sorted order).  Within each panel the TFs are ordered by their biological
    value of that panel's statistic; spoofed values are drawn as box plots
    and the biological value as a red triangle.  The figure is passed to
    ``maybesave``.
    """
    # Any entry works as a template: source and statistic names are read
    # from the first one only.
    first_entry = next(iter(results_dict.values()))
    spoof_names = sorted(k for k in first_entry if k != 'bio')
    stat_names = sorted(first_entry['bio'])
    n_rows, n_cols = len(spoof_names), len(stat_names)

    def order_tfs_by(stat_name):
        return sorted(results_dict.keys(),
                      key=lambda tf: results_dict[tf]["bio"][stat_name])

    for spoof_idx, spoof_name in enumerate(spoof_names):
        for stat_idx, stat_name in enumerate(stat_names):
            # Row-major panel index.  The row stride was previously the
            # literal 3, which is only right with exactly three statistics.
            plt.subplot(n_rows, n_cols, spoof_idx * n_cols + stat_idx + 1)
            # NOTE(review): row labels assume the sorted spoof names are the
            # MaxEnt and uniform samplers, in that order.
            if spoof_idx == 0 and stat_idx == 0:
                plt.ylabel("MaxEnt Sampling")
            if spoof_idx == 1 and stat_idx == 0:
                plt.ylabel("Uniform Sampling")
            ordered_tfs = order_tfs_by(stat_name)
            spoof_vals = [results_dict[tf][spoof_name][stat_name]
                          for tf in ordered_tfs]
            bio_vals = [results_dict[tf]["bio"][stat_name]
                        for tf in ordered_tfs]
            plt.boxplot(spoof_vals)
            # Box plots sit at x = 1..n, so the overlay uses the same positions.
            plt.scatter(range(1, len(bio_vals) + 1), bio_vals,
                        marker='^', color='r')
            plt.title(stat_name)
            # Hide x ticks and tick labels.  Booleans are used because the
            # string 'off' is not a documented value for these arguments.
            plt.tick_params(
                axis='x',
                which='both',
                bottom=False,
                top=False,
                labelbottom=False)
    maybesave(filename)
Ejemplo n.º 7
0
def L_vs_sigma_plot(filename=None, with_bio=False):
    """Plot site length L against sigma with two reference curves.

    Draws ``crit_L`` over sigma in [0.1, 10] ("Binding Transition") and a
    horizontal line at ``log(G, 2) / 2`` ("Info Theory Threshold"), where
    ``G`` is a module-level value.  When ``with_bio`` is true, the motifs
    returned by ``extract_motif_object_from_tfdf`` are added as a scatter of
    (mean ``sd`` over ``make_pssm(motif)``, site length).  The figure is
    passed to ``maybesave``.
    """
    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    info_threshold = log(G, 2) / 2
    plt.plot([min_sigma, max_sigma],
             [info_threshold, info_threshold],
             linestyle='--',
             label="Info Theory Threshold")
    if with_bio:
        # Only sigma and L are plotted.  The unused per-motif statistics
        # (site counts, IC, IC density, Gini, MI density) are no longer
        # computed.
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        Ls = [len(motif[0]) for motif in motifs]
        sigmas = [mean(map(sd, make_pssm(motif))) for motif in motifs]
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
def sigma_Ne_contour_plot(filename=None):
    sigmas = np.linspace(0, 5, 20)
    Nes = np.linspace(1, 20, 20)
    L = 10
    n = 50
    copies = 10 * n
    trials = 100
    motifss = [[[(sample_motif(sigma, Ne, L, copies, n))
                 for i in range(trials)] for sigma in sigmas]
               for Ne in tqdm(Nes)]
    occ_M = [[expected_occupancy(sigma, Ne, L, copies) for sigma in sigmas]
             for Ne in tqdm(Nes)]
    print "ic_M"
    ic_M = mmap(lambda ms: mean(map(motif_ic, ms)), motifss)
    print "gini_M"
    gini_M = mmap(lambda ms: mean(map(motif_gini, ms)), motifss)
    print "mi_M"
    mi_M = mmap(lambda ms: mean(map(total_motif_mi, ms)), tqdm(motifss))
    plt.subplot(2, 2, 1)
    plt.contourf(sigmas, Nes, occ_M, cmap='jet')
    plt.colorbar()
    plt.subplot(2, 2, 2)
    plt.contourf(sigmas, Nes, ic_M, cmap='jet')
    plt.colorbar()
    plt.subplot(2, 2, 3)
    plt.contourf(sigmas, Nes, gini_M, cmap='jet')
    plt.colorbar()
    plt.subplot(2, 2, 4)
    plt.contourf(sigmas, Nes, mi_M, cmap='jet')
    plt.colorbar()
    maybesave(filename)
def gini_vs_mi_comparison(filename=None):
    """Scatter MI per column pair against Gini for two motif collections.

    The left panel ("Prokaryotic Motifs") uses the module-level
    ``bio_motifs``; the right panel ("Eukaryotic Motifs") uses the
    module-level ``jaspar_motifs``.  The figure is passed to ``maybesave``.

    NOTE(review): the downsampled ``euk_motifs`` list and both
    ``total_motif_mi`` series are computed but never plotted.  Confirm
    whether the right panel was meant to use the downsampled motifs.
    """
    sys.path.append("/home/pat/jaspar")
    from parse_jaspar import euk_motifs
    # Motifs with more than 200 entries are replaced by a 200-entry sample.
    euk_motifs = [sample(200, m, replace=False) if len(m) > 200 else m
                  for m in euk_motifs]
    prok_ginis = map(motif_gini, bio_motifs)
    prok_mis = map(total_motif_mi, tqdm(bio_motifs))
    prok_mipps = map(motif_mi_pp, tqdm(bio_motifs))
    eu_ginis = map(motif_gini, jaspar_motifs)
    eu_mis = map(total_motif_mi, tqdm(jaspar_motifs))
    eu_mipps = map(motif_mi_pp, tqdm(jaspar_motifs))

    def draw_panel(slot, ginis, mipps, heading, ylabel=None):
        # Both panels share axis limits so they can be compared directly.
        plt.subplot(1, 2, slot)
        plt.scatter(ginis, mipps)
        plt.xlabel("Gini Coefficient")
        if ylabel is not None:
            plt.ylabel(ylabel)
        plt.title(heading)
        plt.xlim(-.1, .7)
        plt.ylim(-0.1, 0.7)

    draw_panel(1, prok_ginis, prok_mipps, "Prokaryotic Motifs",
               ylabel="MI (bits / column pair)")
    # The right panel carries no y-axis label.
    draw_panel(2, eu_ginis, eu_mipps, "Eukaryotic Motifs")
    plt.suptitle("Mutual Information vs Gini Coefficient")
    maybesave(filename)
Ejemplo n.º 10
0
def make_plot(filename=None):
    trials_per_iteration = 3
    iterations = [10**i for i in [0,1,2,3,4]]
    n          = 50
    L          = 10
    desired_ic = 10
    tv         = 0.01
    correction_per_col = 3/(2*log(2)*n)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    beta       = find_beta_for_mean_motif_ic(n,L,desired_ic_for_beta)
    epsilon    = 0.1
    alpha = exp(-2*beta*epsilon)
    opt_iterations = int(ceil(log(tv)/log(1-alpha)))
    opt_epsilon = 1/(2*beta)
    print "optimum iterations:", opt_iterations
    print "optimum epsilon:", opt_epsilon
    results = {}
    for iteration in iterations:
        print "starting on:", iteration
        motifs = [uniform_motif_with_ic_imh(n,L,desired_ic,epsilon=epsilon,beta=beta,iterations=iteration)[-1]
                  for trial in trange(trials_per_iteration)]
        ics = map(motif_ic, motifs)
        results[iteration] = ics
    opt_ics = [uniform_motif_imh_tv(n,L,desired_ic,beta=beta,epsilon=epsilon)
               for trial in range(trials_per_iteration)]
    icss = [results[iteration] for iteration in iterations]
    plt.boxplot(icss + [opt_ics])
    maybesave(filename)
def bio_detector_experiment_prok_euk(filename=None,pickle_filename=None):
    """Draw a ROC curve for detecting biological motifs by Gini percentile.

    With ``pickle_filename=None`` the percentiles are recomputed from the
    MaxEnt spoofs stored in "prok_euk_ic_gini_experiment.pkl" and written to
    "bio_detector_experiment_prok_euk.pkl".  Otherwise the four percentile
    lists are loaded from ``pickle_filename``.

    Positives are the percentiles of each biological motif's Gini among its
    spoofs' Ginis.  Negatives are the percentiles of each spoof set's first
    member among the same set.  The figure is passed to ``maybesave``.

    NOTE: both inputs are unpickled; only point this at trusted files.
    """
    def percentiles(bio_ginis, spoof_sets):
        # Spoof Ginis are computed once and shared by both percentile lists.
        spoof_ginis = [[motif_gini(s) for s in spoofs] for spoofs in spoof_sets]
        positives = [percentile(g, sg) for g, sg in zip(bio_ginis, spoof_ginis)]
        negatives = [percentile(sg[0], sg) for sg in spoof_ginis]
        return positives, negatives

    if pickle_filename is None:
        prok_sites = bio_motifs
        # Bind the downsampled list to a new name.  Assigning to `euk_motifs`
        # here made it function-local, so reading the module-level
        # `euk_motifs` in the comprehension raised UnboundLocalError.
        euk_sites = [motif if len(motif) <= 200 else sample(200,motif,replace=False)
                     for motif in euk_motifs]
        with open("prok_euk_ic_gini_experiment.pkl") as f:
            # The uniform spoofs are stored in the file but not used here.
            (prok_maxents, prok_uniforms, euk_maxents, euk_uniforms) = cPickle.load(f)
        prok_ps, prok_neg_ps = percentiles(map(motif_gini, prok_sites), prok_maxents)
        euk_ps, euk_neg_ps = percentiles(map(motif_gini, euk_sites), euk_maxents)
        with open("bio_detector_experiment_prok_euk.pkl",'w') as f:
            cPickle.dump((prok_ps,euk_ps,prok_neg_ps,euk_neg_ps),f)
    else:
        with open(pickle_filename) as f:
            (prok_ps,euk_ps,prok_neg_ps,euk_neg_ps) = cPickle.load(f)
    sns.set_style('white')
    sns.set_palette(sns.cubehelix_palette(3))
    roc_curve(prok_ps + euk_ps,prok_neg_ps + euk_neg_ps,color='black')
    plt.xlabel("FPR",fontsize='large')
    plt.ylabel("TPR",fontsize='large')
    maybesave(filename)
Ejemplo n.º 12
0
def on_off_experiment2(num_motifs=100,filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """Compare MI against Gini on on-off spoofs of the biological motifs.

    For each motif in the module-level ``tfdf``, ``spoof_on_off_motif``
    generates ``num_motifs`` spoofs.  The Pearson correlation between the
    spoofs' ``motif_gini`` and ``total_motif_mi`` values is computed per
    motif, and the correlations are plotted against their p-values on a log
    y-axis together with the level returned by ``fdr``.  The figure is
    passed to ``maybesave``.
    """
    bio_motifs = [getattr(tfdf,tf) for tf in tfdf.tfs]
    spoofses = [spoof_on_off_motif(motif,num_motifs=num_motifs,trials=1) for motif in bio_motifs]
    spoof_ginises = mmap(motif_gini,tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi,tqdm(spoofses))
    cors, ps = [],[]
    # Previously iterated over the undefined names `ginises` / `mises`.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis,mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # `filename` is not a scatter() argument; saving is done by maybesave below.
    plt.scatter(cors,ps)
    plt.plot([-1,1],[q,q],linestyle='--',label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1,1])
    plt.ylim([10**-4,2])
    plt.title("Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs")
    maybesave(filename)
def make_occupancy_figure():
    """Plot a matrix of mean ``occ2`` values and save it.

    ``occ_matrix`` is called with G = 5e6, 100 sigma values in [0, 10] and an
    occupancy function that averages 1000 ``occ2`` evaluations per call.  The
    result is drawn with ``plot_matrix`` and "basic_occupancy1000.png" is
    passed to ``maybesave``.
    """
    replicates = 1000

    def mean_occ(*args):
        # Average repeated occ2 evaluations for the same arguments.
        return mean(occ2(*args) for _ in range(replicates))

    sigma_grid = np.linspace(0, 10, 100)
    occs2 = occ_matrix(G=5 * 10**6, occ_f=mean_occ, sigmas=sigma_grid)
    plot_matrix(occs2, show=False)
    plt.title("Mean Occupancy of Random Gaussian Ensembles of PWMs")
    maybesave("basic_occupancy1000.png")
Ejemplo n.º 14
0
def plot_matrix_chain(mc,true_ll,filename=None):
    """Plot the log-likelihood trace of a chain against a reference level.

    The second element of every entry in ``mc`` is plotted per iteration,
    with ``true_ll`` as a dashed horizontal line of the same length.  The
    figure is passed to ``maybesave``.
    """
    trace = [entry[1] for entry in mc]
    reference = [true_ll] * len(trace)
    plt.plot(trace)
    plt.plot(reference, label='True Log-likelihood', linestyle='--')
    plt.ylabel("Log-likelihood")
    plt.xlabel("Iteration")
    plt.legend(loc='lower right')
    maybesave(filename)
Ejemplo n.º 15
0
def graph_acceptance_ratio(filename):
    """Plot ``ncp`` acceptance ratios for copy numbers 0 through 10.

    Five million weights ``exp(-gauss(0, 5))`` are drawn and normalised.
    ``ncp(ps, i)`` is evaluated for i = 0..10, each result being passed
    through ``show``.  The values are plotted on a log y-axis and the figure
    is passed to ``maybesave``.
    """
    num_sites = 5000000
    max_copies = 10
    weights = [exp(-random.gauss(0,5)) for _ in xrange(num_sites)]
    ps = normalize(weights)
    ars = []
    for copy_number in range(max_copies + 1):
        ars.append(show(ncp(ps, copy_number)))
    plt.plot(ars)
    plt.semilogy()
    plt.title("Acceptance Ratio vs. Copy number for 5*10^6 LN(0,5) sites")
    plt.xlabel("Copy Number")
    plt.ylabel("Acceptance Ratio")
    maybesave(filename)
def discrete_parallelogram_plot(filename=None):
    """Scatter pairwise MI against IC for MaxEnt motifs.

    ``maxent_motifs_with_ic(200, 10, ic, 10)`` is called for 100 target IC
    values evenly spaced over [0.5, 19.5]; the results are flattened with
    ``concat`` and each motif's ``motif_ic`` is plotted against its
    ``total_motif_mi``.  The figure is passed to ``maybesave``.
    """
    ic_targets = np.linspace(0.5, 19.5, 100)
    batches = [maxent_motifs_with_ic(200, 10, target, 10)
               for target in tqdm(ic_targets)]
    motifs = concat(batches)
    ics = [motif_ic(m) for m in motifs]
    mis = [total_motif_mi(m) for m in motifs]
    plt.scatter(ics, mis)
    plt.title("IC vs Pairwise MI for MaxEnt Motifs")
    plt.xlabel("IC (bits)")
    plt.ylabel("Pairwise MI (bits)")
    maybesave(filename)
Ejemplo n.º 17
0
def mi_pred_vs_obs_plot(filename=None):
    """Scatter predicted against observed MI density.

    The module-level ``pred_mis`` and ``obs_mis`` are divided element-wise by
    ``sizes`` before plotting.  A dashed diagonal from (0, 0) to (0.3, 0.3)
    and a fixed r^2 annotation are added, and the figure is passed to
    ``maybesave``.
    """
    comparisons = np.array(sizes)
    predicted_density = np.array(pred_mis) / comparisons
    observed_density = np.array(obs_mis) / comparisons
    plt.scatter(predicted_density, observed_density)
    plt.ylabel("Observed MI Density (bits/comparison)")
    plt.xlabel("Predicted MI Density (bits/comparison)")
    diagonal = [0, 0.3]
    plt.plot(diagonal, diagonal, linestyle='--')
    # NOTE(review): the r^2 value is a hard-coded annotation, not computed here.
    plt.text(0.05, 0.2, "r^2 = 0.98")
    maybesave(filename)
Ejemplo n.º 18
0
def viz_sample(sample,filename=None):
    """Visualize a sample trajectory.

    The top panel shows ``transpose(sample)`` as an image; the bottom panel
    shows ``hamiltonian`` evaluated on every element of ``sample``.  The
    figure is passed to ``maybesave``.
    """
    # Top: the transposed trajectory, labelled by position on the y-axis.
    plt.subplot(2, 1, 1)
    trajectory_image = transpose(sample)
    plt.imshow(trajectory_image, interpolation='nearest', aspect='auto')
    plt.ylabel("Position")

    # Bottom: energy of each state along the trajectory.
    plt.subplot(2, 1, 2)
    energies = [hamiltonian(state) for state in sample]
    plt.plot(energies)
    plt.xlabel("Iteration")
    plt.ylabel("Energy")
    maybesave(filename)
def mu_approx_fig(filename=None):
    """Plot ``mu_from`` and ``approx_mu`` against copy number.

    Both functions are evaluated with the module-level ``G``, sigma = 1 and
    L = 10 over 100 copy numbers linearly spaced in [1, 10**5], and drawn on
    a log x-axis.  The figure is passed to ``maybesave``.
    """
    sigma = 1
    L = 10
    copy_range = np.linspace(1,10**5,100)
    plt.plot(*pl(lambda copy_num:mu_from(G,sigma,L,copy_num=copy_num),copy_range),label="Exact")
    plt.plot(*pl(lambda copy_num:approx_mu(G,sigma,L,copy_num=copy_num),copy_range),label="Approx")
    plt.xlabel("Copy number")
    # Raw string: "\m" is not a valid escape sequence in a normal literal.
    plt.ylabel(r"$\mu$")
    plt.semilogx()
    # 'ul' is not a legend location matplotlib recognises; spell it out.
    plt.legend(loc='upper left')
    plt.title("Exact vs. Approximate Chemical Potential")
    maybesave(filename)
def bio_detector_experiment(filename=None):
    """Use high Gini to detect biological motifs.

    Positives: ``percentile`` of each biological motif's Gini among the Ginis
    of 100 MaxEnt spoofs of that motif.  Negatives: the first spoof of each
    motif is itself spoofed 100 times and scored the same way.  Both lists
    are passed to ``roc_curve`` and the figure to ``maybesave``.
    """
    spoofs_per_motif = 100

    bio_ginis = map(motif_gini, bio_motifs)
    maxent_spoofs = [spoof_motifs_maxent(m, num_motifs=spoofs_per_motif)
                     for m in tqdm(bio_motifs)]
    maxent_ginis = mmap(motif_gini, maxent_spoofs)
    ps = zipWith(percentile, bio_ginis, maxent_ginis)

    # Negative controls are scored against their own fresh spoofs.
    neg_controls = map(first, maxent_spoofs)
    neg_control_spoofs = [spoof_motifs_maxent(m, num_motifs=spoofs_per_motif)
                          for m in tqdm(neg_controls)]
    neg_control_ginis = map(motif_gini, neg_controls)
    nc_ps = zipWith(percentile, neg_control_ginis,
                    mmap(motif_gini, neg_control_spoofs))

    roc_curve(ps, nc_ps)
    plt.xlabel("FPR", fontsize='large')
    plt.ylabel("TPR", fontsize='large')
    maybesave(filename)
def plot_matrices(*args, **kwargs):
    print kwargs
    n = len(args)
    for i, arg in enumerate(args):
        plt.subplot(1, n, i + 1)
        if "labels" in kwargs:
            plt.title(kwargs["labels"][i])
        plt.imshow(arg, interpolation='none')
        #plot_matrix(arg,show=False,xlabel=None,ylabel=None)
        plt.xlabel("Binding Site Length")
        plt.ylabel("Standard Deviation of Weight Matrix")
    plt.colorbar(label='Occupancy')
    fname = kwargs.get('fname', None)
    maybesave(fname)
def make_sigma_infty_asymptote_figure():
    """Plot mean ``occ2`` occupancy against length for sigma = 100.

    For each L in 1..19 the mean of 100 ``occ2(sigma, L, G=5e6)`` values is
    plotted, together with a vertical dashed line at L = 11.12 and a
    horizontal dashed line at 1/2.  "sigma_infty_asymptote.png" is passed to
    ``maybesave``.
    """
    sigma = 100
    Ls = range(1, 20)
    replicates = 100

    def mean_occupancy(L):
        return mean(occ2(sigma, L, G=5 * 10**6) for _ in range(replicates))

    plt.plot(*pl(mean_occupancy, Ls), label='Occupancy')
    plt.xlabel("Length")
    plt.ylabel("Occupancy")
    # NOTE(review): 11.12 is a hard-coded critical length; its derivation is
    # not shown here.
    critical_length = 11.12
    plt.plot([critical_length, critical_length], [0, 1],
             linestyle='--',
             label='Predicted Critical Length')
    half_line = [0.5] * len(Ls)
    plt.plot(Ls, half_line, linestyle='--', label="occ = 1/2")
    plt.legend(loc='upper left')
    plt.title("Mean Occupancy for sigma = 100")
    maybesave("sigma_infty_asymptote.png")
def plot_results_dict_gini_vs_ic(results_dict,filename=None):
    """Scatter Gini against IC for biological and sampled motifs.

    Every key of ``results_dict`` is split on "_" into exactly three fields;
    the first two form the genome name and the third the TF name, both passed
    to ``extract_tfdf_sites``.  For each key three points are drawn: the
    biological motif (blue), the mean of the 'maxent' samples (green) and
    the mean of the 'uniform' samples (red).  The figure is passed to
    ``maybesave``.
    """
    for i, key in enumerate(results_dict):
        g1, g2, tf = key.split("_")
        bio_motif = extract_tfdf_sites(g1 + "_" + g2, tf)
        bio_point = (motif_ic(bio_motif), motif_gini(bio_motif))
        sampled = results_dict[key]
        points = [
            (bio_point[0], bio_point[1], 'b', "Bio"),
            (mean(sampled['maxent']['motif_ic']),
             mean(sampled['maxent']['motif_gini']), 'g', 'ME'),
            (mean(sampled['uniform']['motif_ic']),
             mean(sampled['uniform']['motif_gini']), 'r', "TURS"),
        ]
        # Only the first key's points carry a label, so the legend lists
        # each series once.
        is_first = (i == 0)
        for x, y, colour, series in points:
            plt.scatter(x, y, color=colour, label=series if is_first else "")
    plt.ylabel("Gini Coefficient")
    plt.xlabel("IC (bits)")
    plt.legend()
    maybesave(filename)
Ejemplo n.º 24
0
def interpret_chain(chain, motif, filename=None):
    """Plot several per-iteration diagnostics of a chain on one axis.

    Every series is passed through ``logmod`` before plotting.  The series
    are: mean ``score_seq(x[0], site)`` over the motif's sites, ``x[1]``,
    ``x[2]``, ``log_fhat``, ``log_ZM_hack``, their difference, and mean
    ``occs(x, motif)``, for each chain element ``x``.  The figure is passed
    to ``maybesave``.
    """
    N = len(motif)
    log_fhats = [log_fhat(theta, motif) for theta in chain]
    log_Zs = [log_ZM_hack(theta, N) for theta in chain]
    log_ps = [fhat - Z for fhat, Z in zip(log_fhats, log_Zs)]

    def trace(values, label):
        # All traces share one axis, so each is compressed with logmod.
        plt.plot(map(logmod, values), label=label)

    trace([mean(score_seq(state[0], site) for site in motif) for state in chain],
          "Mean Site Energy (kBT)")
    trace([state[1] for state in chain], "$\mu$ (kBT)")
    trace([state[2] for state in chain], "$Ne$")
    trace(log_fhats, "log fhat")
    trace(log_Zs, "log_ZM")
    trace(log_ps, "log p")
    trace([mean(occs(state, motif)) for state in chain], "Mean Occupancy")
    plt.legend(loc='right', fontsize='large')
    plt.xlabel("Iteration", fontsize='large')
    maybesave(filename)
def mu_approx_fig(filename=None):
    """Plot ``mu_from`` and ``approx_mu`` against copy number.

    Both functions are evaluated with the module-level ``G``, sigma = 1 and
    L = 10 over 100 copy numbers linearly spaced in [1, 10**5], and drawn on
    a log x-axis.  The figure is passed to ``maybesave``.
    """
    sigma = 1
    L = 10
    copy_range = np.linspace(1, 10**5, 100)
    plt.plot(*pl(lambda copy_num: mu_from(G, sigma, L, copy_num=copy_num),
                 copy_range),
             label="Exact")
    plt.plot(*pl(lambda copy_num: approx_mu(G, sigma, L, copy_num=copy_num),
                 copy_range),
             label="Approx")
    plt.xlabel("Copy number")
    # Raw string: "\m" is not a valid escape sequence in a normal literal.
    plt.ylabel(r"$\mu$")
    plt.semilogx()
    # 'ul' is not a legend location matplotlib recognises; spell it out.
    plt.legend(loc='upper left')
    plt.title("Exact vs. Approximate Chemical Potential")
    maybesave(filename)
def rfreq_rseq_experiment(obj,filename="rfreq_vs_rseq_in_sefas_collection.png"):
    """Scatter motif information content against log2(G / n).

    For every name in ``obj.tfs`` the motif ``getattr(obj, name)`` contributes
    one point: x = log2(G / len(motif)) with G = 5e6, y = ``motif_ic(motif)``.
    A dashed y = x line and a dashed vertical line at log2(G / 500) are
    added.  The figure is passed to ``maybesave``.
    """
    G = 5.0*10**6
    min_rfreq = log2(G/500)
    motifs = [getattr(obj, tf) for tf in obj.tfs]
    Rfreqs = [log(G/len(motif), 2) for motif in motifs]
    Rseqs = [motif_ic(motif) for motif in motifs]
    plt.scatter(Rfreqs, Rseqs)
    plt.ylabel("Motif Information Content (bits)")
    plt.xlabel("log(G/n) (bits)")
    plt.plot([0, 20], [0, 20], linestyle='--', label='Theory')
    plt.plot([min_rfreq, min_rfreq], [0, 30], linestyle='--',
             label='Maximum Plausible Regulon Size')
    plt.title("Motif Information Content vs. Search Difficulty")
    plt.legend(loc='upper left')
    maybesave(filename)
Ejemplo n.º 27
0
def on_off_experiment2(num_motifs=100,
                       filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """Compare MI against Gini on on-off spoofs of the biological motifs.

    For each motif in the module-level ``tfdf``, ``spoof_on_off_motif``
    generates ``num_motifs`` spoofs.  The Pearson correlation between the
    spoofs' ``motif_gini`` and ``total_motif_mi`` values is computed per
    motif, and the correlations are plotted against their p-values on a log
    y-axis together with the level returned by ``fdr``.  The figure is
    passed to ``maybesave``.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [
        spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
        for motif in bio_motifs
    ]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))
    cors, ps = [], []
    # Previously iterated over the undefined names `ginises` / `mises`.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # `filename` is not a scatter() argument; saving is done by maybesave below.
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q],
             linestyle='--',
             label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 2])
    plt.title(
        "Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs"
    )
    maybesave(filename)
def controlling_for_gc_experiment():
    """Compare mean replicate IGC with and without the %GC-controlled spoofer.

    For every module-level prokaryotic motif and every eukaryotic motif
    (passed through ``downsample(200, motif)``), 1000 spoofs are generated
    with ``spoof_maxent_motifs`` and with ``spoof_maxent_motifs_gc``.  The
    four spoof collections are pickled to the working directory.  The mean
    ``motif_gini`` per motif is then scattered, plain against GC-controlled,
    in one panel per domain, and "control-gc.eps" is passed to ``maybesave``.
    """
    euk_downmotifs = [downsample(200, motif) for motif in euk_motifs]
    prok_spoofses = [spoof_maxent_motifs(motif, 1000) for motif in tqdm(prok_motifs)]
    euk_spoofses = [spoof_maxent_motifs(motif, 1000) for motif in tqdm(euk_downmotifs)]
    prok_spoofses_gc = [spoof_maxent_motifs_gc(motif, 1000) for motif in tqdm(prok_motifs)]
    euk_spoofses_gc = [spoof_maxent_motifs_gc(motif, 1000) for motif in tqdm(euk_downmotifs)]

    # Persist the raw spoofs before doing any analysis.
    dumps = [
        ("prok_spoofses.pkl", prok_spoofses),
        ("euk_spoofses.pkl", euk_spoofses),
        ("prok_spoofses_gc.pkl", prok_spoofses_gc),
        ("euk_spoofses_gc.pkl", euk_spoofses_gc),
    ]
    for path, payload in dumps:
        with open(path, 'w') as f:
            cPickle.dump(payload, f)

    def mean_ginis(spoofses):
        return [mean(map(motif_gini, spoofs)) for spoofs in tqdm(spoofses)]

    # The Ginis of the biological motifs were previously computed here but
    # never used.
    prok_spoof_ginis = mean_ginis(prok_spoofses)
    euk_spoof_ginis = mean_ginis(euk_spoofses)
    prok_spoof_gc_ginis = mean_ginis(prok_spoofses_gc)
    euk_spoof_gc_ginis = mean_ginis(euk_spoofses_gc)

    sns.set_style('white')
    palette = sns.cubehelix_palette(3)
    sns.set_palette(palette)
    # Only the prokaryotic scatter carries a legend label, as before.
    panels = [
        (prok_spoof_ginis, prok_spoof_gc_ginis, "Prokaryotic Motifs",
         {'label': 'Prokaryotic Motifs'}),
        (euk_spoof_ginis, euk_spoof_gc_ginis, "Eukaryotic Motifs", {}),
    ]
    for panel_idx, (plain, controlled, heading, extra) in enumerate(panels):
        plt.subplot(1, 2, panel_idx + 1)
        plt.plot([0, 0.5], [0, 0.5], linestyle='--', color='black')
        plt.scatter(plain, controlled,
                    color=palette[1], edgecolor='black', **extra)
        plt.xlim(0, 0.5)
        plt.ylim(0, 0.5)
        plt.xlabel("Mean Replicate IGC")
        plt.ylabel("Mean %GC-controlled Replicate IGC")
        plt.title(heading)
    maybesave("control-gc.eps")
Ejemplo n.º 29
0
def interpret_chain(chain, motif, filename=None):
    """Overlay logmod-transformed diagnostics for every element of a chain.

    Plotted series, in legend order: mean ``score_seq(x[0], site)`` over the
    motif's sites, ``x[1]``, ``x[2]``, ``log_fhat``, ``log_ZM_hack``,
    ``log_fhat - log_ZM_hack`` and mean ``occs(x, motif)``, for each chain
    element ``x``.  The figure is passed to ``maybesave``.
    """
    site_count = len(motif)
    log_fhats = [log_fhat(theta, motif) for theta in chain]
    log_Zs = [log_ZM_hack(theta, site_count) for theta in chain]
    log_ps = [numer - denom for numer, denom in zip(log_fhats, log_Zs)]

    def draw(series, label):
        # logmod keeps series of very different magnitudes on one axis.
        plt.plot(map(logmod, series), label=label)

    mean_energies = [mean(score_seq(link[0], site) for site in motif)
                     for link in chain]
    draw(mean_energies, "Mean Site Energy (kBT)")
    draw([link[1] for link in chain], "$\mu$ (kBT)")
    draw([link[2] for link in chain], "$Ne$")
    draw(log_fhats, "log fhat")
    draw(log_Zs, "log_ZM")
    draw(log_ps, "log p")
    draw([mean(occs(link, motif)) for link in chain], "Mean Occupancy")
    plt.legend(loc='right', fontsize='large')
    plt.xlabel("Iteration", fontsize='large')
    maybesave(filename)
def make_sigma_0_figure(sigma=0.1, fname="sigma_0.png"):
    G = 5 * 10**6

    def critical_L(sigma):
        return log(G) / (sigma * (1 - sigma / 2.0))

    Lstar = critical_L(sigma)
    print "Lstar:", Lstar
    Ls = range(1, int(2 * Lstar))
    plt.plot(*pl(
        lambda L: mean(occ2(sigma, L, G=5 * 10**6) for i in range(100)), Ls),
             label='Occupancy')
    plt.ylabel("Occupancy")
    plt.xlabel("Length")
    plt.plot([Lstar, Lstar], [0, 1],
             linestyle='--',
             label='Predicted Critical Length')
    plt.plot(Ls, [0.5] * len(Ls), linestyle='--', label="occ = 1/2")
    plt.legend(loc='upper left')
    plt.title("Mean Occupancy for sigma = %s" % sigma)
    maybesave(fname)
Ejemplo n.º 31
0
def illustrate_rho_partitioning(filename=None, Ne=2):
    n = 10
    L = 10
    sigma = 1
    nu = Ne - 1
    rhos = range(n * L + 1)
    log_fs = [predict_log_f(rho, n, L, sigma=sigma) for rho in rhos]
    log_ws = [log_w(n, L, rho) for rho in rhos]
    mean_ics = [mean_ic_from_rho(rho, n, L) for rho in rhos]
    plt.plot(map(exp, log_fs), label="Mean Fitness")
    plt.plot(map(exp, log_ws), label="Degeneracy")
    plt.plot(mean_ics, label="Mean IC")
    ps = normalize(map(exp, [nu * lf + lw for lf, lw in zip(log_fs, log_ws)]))
    integrand = [ic * p for ic, p in zip(mean_ics, ps)]
    print sum(ps)
    plt.plot(ps, label="Probability")
    plt.plot(integrand, label="Integrand")
    plt.semilogy()
    plt.xlabel("Rho (Mutational distance from optimal genotype)")
    plt.legend()
    maybesave(filename)
Ejemplo n.º 32
0
def interpret_results_dict(results_dict, filename=None, annotate=False):
    ic_in_range = 0
    ic_lower = 0
    ic_upper = 0
    fnames = "motif_ic motif_gini total_motif_mi".split()
    rel_tfs = [tf for tf in tfdf.tfs if motif_ic(getattr(tfdf, tf)) > 5]
    for tf in rel_tfs:
        motif = getattr(tfdf, tf)
        print tf
        for fname_idx, fname in enumerate(fnames):
            f = eval(fname)
            bio_stat = f(motif)
            lb, ub = mean_ci(results_dict[tf][fname])
            in_range = (lb <= bio_stat <= ub)
            if fname == 'motif_ic':
                ic_in_range += (lb < bio_stat < ub)
                ic_lower += (bio_stat < lb)
                ic_upper += (ub < bio_stat)
            print fname, bio_stat, "(%1.2f, %1.2f)" % (lb, ub), in_range
    print "motif ic in range:", ic_in_range / float(len(rel_tfs))
    print "motif ic lower:", ic_lower / float(len(rel_tfs))
    print "motif ic higher:", ic_upper / float(len(rel_tfs))
    for fname_idx, fname in enumerate(fnames):
        f = eval(fname)
        plt.subplot(1, len(fnames), fname_idx + 1)
        plt.title(fname)
        bio_stats = [f(getattr(tfdf, tf)) for tf in rel_tfs]
        sim_stats = [mean(results_dict[tf][fname]) for tf in rel_tfs]
        pred_obs(zip(bio_stats, sim_stats), show=False)
        if annotate:
            for s, xy in zip(rel_tfs, zip(bio_stats, sim_stats)):
                plt.annotate(s=s, xy=xy)
        plt.xlabel("Biological Value")
        plt.ylabel("Simulated Value")
        r, p = pearsonr(bio_stats, sim_stats)
        print fname, r, r**2, p
    plt.tight_layout()
    maybesave(filename)
Ejemplo n.º 33
0
def occ_matrix_analysis(n=10, occ_matrices=None, filename=None):
    """Show one occupancy image per Ne value in a square grid of panels.

    Args:
        n: forwarded to ``predict_stat`` and ``mean_occ_from_rho``.
        occ_matrices: optional precomputed data, indexed as
            ``occ_matrices[Ne_index][sigma_index][L_index]``.  When None it
            is computed here with ``predict_stat`` over 25 Ne values in
            [1, 5], 50 sigma values in [0, 20] and L = 1..29.
        filename: passed to ``maybesave``.

    Returns:
        The (possibly freshly computed) ``occ_matrices``, so the caller can
        pass it back in on a later call.
    """
    Ls = range(1, 30)
    sigmas = (np.linspace(0, 20, 50))
    Nes = np.linspace(1, 5, 25)
    num_plots = len(Nes)
    # Smallest square grid that can hold one panel per Ne value.
    rc = int(ceil(sqrt(num_plots)))
    if occ_matrices is None:
        # NOTE(review): the lambda reads L and sigma when it is called, not
        # when it is created; this is only correct if predict_stat invokes T
        # before the comprehension advances -- confirm.
        occ_matrices = [[[
            predict_stat(
                n,
                L,
                sigma=sigma,
                Ne=Ne,
                T=lambda rho: mean_occ_from_rho(rho, n, L, sigma=sigma))
            for L in Ls
        ] for sigma in sigmas] for Ne in tqdm(Nes)]
    fig, axes = plt.subplots(nrows=rc, ncols=rc, sharex=True, sharey=True)
    for i, ax in zip(range(len(Nes)), axes.flat):
        #for i,ax in enumerate(Nes):
        # Transpose so L indexes rows, then reverse the row order.
        # All panels share the colour scale [0, 1].
        im = ax.imshow(np.matrix(occ_matrices[i]).transpose()[::-1],
                       interpolation='none',
                       aspect='auto',
                       vmin=0,
                       vmax=1)
        #ax.set_xticks(Ls)
        #ax.set_xticks()
        #plt.tick_params(axis='x',pad=15)
        #plt.xticks(rotation=90)
        ax.axis('off')
        #ax.set_yticks(sigmas)
    # cax,kw = mpl.colorbar.make_axes([ax for ax in axes.flat])
    # plt.colorbar(im, cax=cax, **kw)
    # One colorbar for the whole grid, taken from the last image drawn.
    fig.colorbar(im, ax=axes.ravel().tolist())
    #plt.set_xticks(Ls)
    #maxes = [max(map(max,mat)) for mat in occ_matrices]
    #print maxes
    maybesave(filename)
    return occ_matrices
def prokaryotic_gini_comparison(filename=None):
    """Compare mean Gini of GLE spoofs with MaxEnt and uniform spoofs.

    Each motif in the module-level ``bio_motifs`` is spoofed with
    ``spoof_motifs_maxent``, ``spoof_motifs_uniform`` and ``spoof_motif_gle``
    (each called with the argument 10).  The mean ``motif_gini`` per motif is
    scattered as MaxEnt vs. GLE (left panel) and uniform vs. GLE (right
    panel).  The figure is passed to ``maybesave``.
    """
    maxent_spoofs = [spoof_motifs_maxent(motif,10,verbose=True)
                     for motif in tqdm(bio_motifs,desc='bio_motifs')]
    uniform_spoofs = [spoof_motifs_uniform(motif,10,verbose=True)
                      for motif in tqdm(bio_motifs,desc='bio_motifs')]
    # The on-off spoofs were previously generated here but never used.
    gle_spoofs = [concat([spoof_motif_gle(motif,10,verbose=True) for _ in range(1)])
                  for motif in tqdm(bio_motifs,desc='bio_motifs')]

    def mean_ginis(spoof_sets):
        return [mean(map(motif_gini, spoofs)) for spoofs in spoof_sets]

    maxent_ginis = mean_ginis(maxent_spoofs)
    uniform_ginis = mean_ginis(uniform_spoofs)
    gle_ginis = mean_ginis(gle_spoofs)
    plt.subplot(1,2,1)
    scatter(maxent_ginis,gle_ginis)
    plt.xlabel("MaxEnt")
    plt.ylabel("GLE")
    plt.subplot(1,2,2)
    plt.xlabel("TU")
    scatter(uniform_ginis,gle_ginis)
    plt.suptitle("Gini Coefficients for GLE Simulations vs. MaxEnt, TU Distributions")
    maybesave(filename)
Ejemplo n.º 35
0
def make_bin_col_plot(filename=None):
    """Call ``plot_bin_col`` twice, sorted in blue then unsorted in green.

    Both calls use the same arguments apart from ``sort`` and ``color``;
    ``filename`` is forwarded to ``maybesave`` afterwards.
    """
    for sort_flag, colour in ((True, 'b'), (False, 'g')):
        # The interpolated grid is rebuilt for each call, as in the original.
        grid = interpolate(.01, 10, 100)
        plot_bin_col(10, grid, 100, sort=sort_flag, color=colour)
    maybesave(filename)
Ejemplo n.º 36
0
def main_experiment(generate_data=False, filename="spoof-statistics-rmsd.pdf"):
    """Plot observed vs. spoofed IC and MI for four motif models.

    Spoof motifs are produced for the module-level ``prok_motifs`` and for a
    subsampled copy of ``euk_motifs`` using four models (row labels MaxEnt,
    PSFM, APW and Bayes).  A 4x4 grid is then drawn: one row per model and
    the columns Prok IC, Euk IC, Prok MI, Euk MI.  Each panel scatters the
    per-motif mean spoof statistic against the observed statistic and is
    annotated with r^2 and the RMSD between the two.

    generate_data -- when true, sample the spoofs and pickle them (together
        with the chains and the eukaryotic submotifs) into the current
        directory; otherwise load the pickles written by an earlier run.
    filename -- forwarded to ``maybesave`` once the figure is laid out.
    """
    iterations = 10000
    replicates = 10
    # Pickle keys in the order they are written to disk.
    persisted_keys = ["chains", "bayes_spoofs", "maxent_spoofs",
                      "psfm_spoofs", "apw_spoofs"]
    # (pickle key, row label) for the grid rows, top to bottom.
    rows = [("maxent_spoofs", "MaxEnt"), ("psfm_spoofs", "PSFM"),
            ("apw_spoofs", "APW"), ("bayes_spoofs", "Bayes")]

    def pickle_name(domain, key):
        """File name used for one pickled artifact, e.g. prok_chains.pkl."""
        return "%s_%s.pkl" % (domain, key)

    def save(domain, key, obj):
        """Pickle ``obj`` under the name derived from domain and key."""
        with open(pickle_name(domain, key), 'w') as f:
            cPickle.dump(obj, f)

    def load(domain, key):
        """Load one artifact previously written by ``save``."""
        with open(pickle_name(domain, key)) as f:
            return cPickle.load(f)

    def build_spoofs(motifs):
        """Sample spoofs for ``motifs`` from all four models."""
        chains = [posterior_chain2(motif, iterations=iterations)
                  for motif in tqdm(motifs)]
        # Keep every 500th sample from the second half of each chain;
        # // keeps the slice index an int.
        bayes_spoofs = [[motif_from_theta(theta, len(motif))
                         for theta in tqdm(chain[iterations // 2::500])]
                        for chain, motif in tqdm(zip(chains, motifs))]
        psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
        psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                        for _ in range(replicates)]
                       for psfm, motif in zip(psfms, motifs)]
        maxent_spoofs = [spoof_maxent_motifs(motif, replicates)
                         for motif in tqdm(motifs)]
        apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
        apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                       for __ in range(replicates)]
                      for apw, motif in tqdm(zip(apws, motifs))]
        return {"chains": chains,
                "bayes_spoofs": bayes_spoofs,
                "maxent_spoofs": maxent_spoofs,
                "psfm_spoofs": psfm_spoofs,
                "apw_spoofs": apw_spoofs}

    def spoof_means(stat, spoof_sets):
        """Mean of ``stat`` over the spoofs belonging to each motif."""
        return [mean([stat(spoof) for spoof in spoofs])
                for spoofs in tqdm(spoof_sets)]

    def draw_panel(index, observed, spoofed, lims, text_xy, xticks, yticks):
        """Draw one annotated scatter panel at grid position ``index``.

        A tick argument of None leaves matplotlib's default ticks, an empty
        list hides the ticks, and any other list is used for both the tick
        positions and their labels.
        """
        plt.subplot(4, 4, index)
        for set_ticks, ticks in ((plt.xticks, xticks), (plt.yticks, yticks)):
            if ticks is None:
                continue
            if ticks:
                set_ticks(ticks, ticks)
            else:
                set_ticks([])
        # Spoof means go first and observed values second, as before.
        r, _ = scatter(spoofed, observed)
        rmsd = sqrt(mean(zipWith(lambda x, y: (x - y)**2, observed, spoofed)))
        # Limits are applied after plotting so the explicit values always win.
        plt.xlim(*lims)
        plt.ylim(*lims)
        x_text, y_text = text_xy
        plt.text(x_text, y_text, s='$r^2$ = %1.3f' % (r**2))
        plt.text(x_text, y_text * 0.8, s='$RMSD$ = %1.3f' % rmsd)

    if generate_data:
        prok = build_spoofs(prok_motifs)
        euk_submotifs = [subsample(motif) for motif in euk_motifs]
        euk = build_spoofs(euk_submotifs)
        for key in persisted_keys:
            save("prok", key, prok[key])
        save("euk", "submotifs", euk_submotifs)
        for key in persisted_keys:
            save("euk", key, euk[key])
    else:
        # Only the spoofs and submotifs feed the figure, so the (unused)
        # chains are not read back from disk.
        spoof_keys = [key for key, _ in rows]
        prok = dict((key, load("prok", key)) for key in spoof_keys)
        euk_submotifs = load("euk", "submotifs")
        euk = dict((key, load("euk", key)) for key in spoof_keys)

    # Per-metric settings; "column" is the grid column of the Prok panel and
    # the matching Euk panel sits immediately to its right.
    ic = {"name": "IC", "stat": motif_ic_per_col, "column": 1,
          "lims": (-.1, 2.6), "text_xy": (-0.05, 2.2),
          "ticks": [0, 0.5, 1, 1.5, 2]}
    mi = {"name": "MI", "stat": mi_per_col, "column": 3,
          "lims": (-0.05, 1), "text_xy": (-0.05, 0.85),
          "ticks": [0, 0.25, 0.5, 0.75, 1]}
    # The eukaryotic spoofs were generated from the submotifs, so the observed
    # eukaryotic statistics must be computed from the submotifs as well.
    domains = [("Prok", prok_motifs, prok), ("Euk", euk_submotifs, euk)]

    sns.set_style('dark')
    last_row = len(rows) - 1
    for offset, (domain, motifs, spoofs) in enumerate(domains):
        is_prok = (offset == 0)
        observed = {}
        for metric in (ic, mi):
            observed[metric["name"]] = [metric["stat"](motif)
                                        for motif in motifs]
        for row, (key, row_label) in enumerate(rows):
            for metric in (ic, mi):
                if row < last_row:
                    xticks = []
                elif metric is ic and not is_prok:
                    # NOTE(review): the bottom Euk IC panel has always kept
                    # matplotlib's default x ticks; preserved here.
                    xticks = None
                else:
                    xticks = metric["ticks"]
                # Only the prokaryotic columns carry y tick labels.
                yticks = metric["ticks"] if is_prok else []
                draw_panel(4 * row + metric["column"] + offset,
                           observed[metric["name"]],
                           spoof_means(metric["stat"], spoofs[key]),
                           metric["lims"], metric["text_xy"], xticks, yticks)
                if row == last_row:
                    plt.xlabel("%s %s" % (domain, metric["name"]),
                               fontsize='large')
                if is_prok and metric is ic:
                    # The model name labels the row once, on the left column.
                    plt.ylabel(row_label, fontsize='large')
    plt.tight_layout()
    maybesave(filename)
def interpret_gle_evo_sim_spoofs(bio_motifs_, spoofs_,filename=None):
    """Scatter simulated against observed IC, Gini and pairwise MI.

    ``spoofs_[i]`` must hold the simulated motifs for ``bio_motifs_[i]``, and
    every entry is expected to contain as many motifs as ``spoofs_[0]``; the
    assertion below fails otherwise.  Three panels are drawn (motif IC, Gini
    coefficient, MI per column pair), with one point per simulated motif
    coloured by the pattern ``find_pattern`` reports for its biological
    motif.

    bio_motifs_ -- biological motifs, one per entry of ``spoofs_``.
    spoofs_ -- list of lists of simulated motifs.
    filename -- forwarded to ``maybesave``.
    """
    trials_per_motif = len(spoofs_[0])
    sim_motifs = concat(spoofs_)
    expected = len(bio_motifs_) * trials_per_motif
    print("%d %d" % (expected, len(sim_motifs)))
    assert expected == len(sim_motifs)

    def per_trial(values):
        """Repeat each per-motif value once per trial to align with sim_motifs."""
        return [value for value in values for _ in range(trials_per_motif)]

    # Observed statistics are computed once per biological motif and then
    # repeated, rather than recomputed for every trial.
    bio_ics = per_trial([motif_ic(motif) for motif in bio_motifs_])
    sim_ics = [motif_ic(motif) for motif in sim_motifs]
    bio_ginis = per_trial([motif_gini(motif) for motif in bio_motifs_])
    sim_ginis = [motif_gini(motif) for motif in sim_motifs]
    lens = [len(motif[0]) for motif in bio_motifs_]
    print("finding mutual information")
    bio_mis = per_trial([total_motif_mi(motif)/choose(l,2)
                         for (l, motif) in tqdm(zip(lens, bio_motifs_))])
    # Simulated MI is normalised by the width of the matching biological motif.
    sim_mis = [total_motif_mi(motif)/choose(l,2)
               for (l, motif) in tqdm(zip(per_trial(lens), sim_motifs))]
    print("finding motif structures")
    bio_patterns = per_trial([find_pattern(motif)[0]
                              for motif in tqdm(bio_motifs_)])
    pattern_colors = {'direct-repeat':'g','inverted-repeat':'b','single-box':'r'}
    colors = [pattern_colors[p] for p in bio_patterns]

    def panel(index, title, observed, simulated):
        """Draw one observed-vs-simulated scatter with shared axis limits."""
        plt.subplot(1, 3, index)
        plt.title(title)
        scatter(observed, simulated, color=colors, line_color='black')
        limits = tuple(find_limits(observed, simulated))
        plt.xlim(*limits)
        plt.ylim(*limits)

    panel(1, "Motif IC (bits)", bio_ics, sim_ics)
    plt.ylabel("Simulated")
    panel(2, "Gini Coefficient", bio_ginis, sim_ginis)
    plt.xlabel("Observed")
    panel(3, "Pairwise MI per pair (bits)", bio_mis, sim_mis)
    # The data points carry no labels, so add empty labelled scatters as
    # legend handles; limits are already fixed, so they do not move the axes.
    for pattern in sorted(pattern_colors):
        plt.scatter([], [], color=pattern_colors[pattern], label=pattern)
    plt.legend()
    plt.tight_layout()
    maybesave(filename)
Ejemplo n.º 38
0
def main_experiment(generate_data=False, filename="spoof-statistics-rmsd.pdf"):
    """Plot observed vs. spoofed IC and MI for four motif models.

    Spoof motifs are produced for the module-level ``prok_motifs`` and for a
    subsampled copy of ``euk_motifs`` using four models (row labels MaxEnt,
    PSFM, APW and Bayes).  A 4x4 grid is then drawn: one row per model and
    the columns Prok IC, Euk IC, Prok MI, Euk MI.  Each panel scatters the
    per-motif mean spoof statistic against the observed statistic and is
    annotated with r^2 and the RMSD between the two.

    generate_data -- when true, sample the spoofs and pickle them (together
        with the chains and the eukaryotic submotifs) into the current
        directory; otherwise load the pickles written by an earlier run.
    filename -- forwarded to ``maybesave`` once the figure is laid out.
    """
    iterations = 10000
    replicates = 10
    # Pickle keys in the order they are written to disk.
    persisted_keys = ["chains", "bayes_spoofs", "maxent_spoofs",
                      "psfm_spoofs", "apw_spoofs"]
    # (pickle key, row label) for the grid rows, top to bottom.
    rows = [("maxent_spoofs", "MaxEnt"), ("psfm_spoofs", "PSFM"),
            ("apw_spoofs", "APW"), ("bayes_spoofs", "Bayes")]

    def pickle_name(domain, key):
        """File name used for one pickled artifact, e.g. prok_chains.pkl."""
        return "%s_%s.pkl" % (domain, key)

    def save(domain, key, obj):
        """Pickle ``obj`` under the name derived from domain and key."""
        with open(pickle_name(domain, key), 'w') as f:
            cPickle.dump(obj, f)

    def load(domain, key):
        """Load one artifact previously written by ``save``."""
        with open(pickle_name(domain, key)) as f:
            return cPickle.load(f)

    def build_spoofs(motifs):
        """Sample spoofs for ``motifs`` from all four models."""
        chains = [
            posterior_chain2(motif, iterations=iterations)
            for motif in tqdm(motifs)
        ]
        # Keep every 500th sample from the second half of each chain;
        # // keeps the slice index an int.
        bayes_spoofs = [[
            motif_from_theta(theta, len(motif))
            for theta in tqdm(chain[iterations // 2::500])
        ] for chain, motif in tqdm(zip(chains, motifs))]
        psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
        psfm_spoofs = [[[
            sample_from_psfm(psfm) for _ in range(len(motif))
        ] for _ in range(replicates)] for psfm, motif in zip(psfms, motifs)]
        maxent_spoofs = [
            spoof_maxent_motifs(motif, replicates) for motif in tqdm(motifs)
        ]
        apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
        apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                       for __ in range(replicates)]
                      for apw, motif in tqdm(zip(apws, motifs))]
        return {
            "chains": chains,
            "bayes_spoofs": bayes_spoofs,
            "maxent_spoofs": maxent_spoofs,
            "psfm_spoofs": psfm_spoofs,
            "apw_spoofs": apw_spoofs,
        }

    def spoof_means(stat, spoof_sets):
        """Mean of ``stat`` over the spoofs belonging to each motif."""
        return [
            mean([stat(spoof) for spoof in spoofs])
            for spoofs in tqdm(spoof_sets)
        ]

    def draw_panel(index, observed, spoofed, lims, text_xy, xticks, yticks):
        """Draw one annotated scatter panel at grid position ``index``.

        A tick argument of None leaves matplotlib's default ticks, an empty
        list hides the ticks, and any other list is used for both the tick
        positions and their labels.
        """
        plt.subplot(4, 4, index)
        for set_ticks, ticks in ((plt.xticks, xticks), (plt.yticks, yticks)):
            if ticks is None:
                continue
            if ticks:
                set_ticks(ticks, ticks)
            else:
                set_ticks([])
        # Spoof means go first and observed values second, as before.
        r, _ = scatter(spoofed, observed)
        rmsd = sqrt(
            mean(zipWith(lambda x, y: (x - y)**2, observed, spoofed)))
        # Limits are applied after plotting so the explicit values always win.
        plt.xlim(*lims)
        plt.ylim(*lims)
        x_text, y_text = text_xy
        plt.text(x_text, y_text, s='$r^2$ = %1.3f' % (r**2))
        plt.text(x_text, y_text * 0.8, s='$RMSD$ = %1.3f' % rmsd)

    if generate_data:
        prok = build_spoofs(prok_motifs)
        euk_submotifs = [subsample(motif) for motif in euk_motifs]
        euk = build_spoofs(euk_submotifs)
        for key in persisted_keys:
            save("prok", key, prok[key])
        save("euk", "submotifs", euk_submotifs)
        for key in persisted_keys:
            save("euk", key, euk[key])
    else:
        # Only the spoofs and submotifs feed the figure, so the (unused)
        # chains are not read back from disk.
        spoof_keys = [key for key, _ in rows]
        prok = dict((key, load("prok", key)) for key in spoof_keys)
        euk_submotifs = load("euk", "submotifs")
        euk = dict((key, load("euk", key)) for key in spoof_keys)

    # Per-metric settings; "column" is the grid column of the Prok panel and
    # the matching Euk panel sits immediately to its right.
    ic = {
        "name": "IC",
        "stat": motif_ic_per_col,
        "column": 1,
        "lims": (-.1, 2.6),
        "text_xy": (-0.05, 2.2),
        "ticks": [0, 0.5, 1, 1.5, 2],
    }
    mi = {
        "name": "MI",
        "stat": mi_per_col,
        "column": 3,
        "lims": (-0.05, 1),
        "text_xy": (-0.05, 0.85),
        "ticks": [0, 0.25, 0.5, 0.75, 1],
    }
    # The eukaryotic spoofs were generated from the submotifs, so the observed
    # eukaryotic statistics must be computed from the submotifs as well.
    domains = [("Prok", prok_motifs, prok), ("Euk", euk_submotifs, euk)]

    sns.set_style('dark')
    last_row = len(rows) - 1
    for offset, (domain, motifs, spoofs) in enumerate(domains):
        is_prok = (offset == 0)
        observed = {}
        for metric in (ic, mi):
            observed[metric["name"]] = [
                metric["stat"](motif) for motif in motifs
            ]
        for row, (key, row_label) in enumerate(rows):
            for metric in (ic, mi):
                if row < last_row:
                    xticks = []
                elif metric is ic and not is_prok:
                    # NOTE(review): the bottom Euk IC panel has always kept
                    # matplotlib's default x ticks; preserved here.
                    xticks = None
                else:
                    xticks = metric["ticks"]
                # Only the prokaryotic columns carry y tick labels.
                yticks = metric["ticks"] if is_prok else []
                draw_panel(4 * row + metric["column"] + offset,
                           observed[metric["name"]],
                           spoof_means(metric["stat"], spoofs[key]),
                           metric["lims"], metric["text_xy"], xticks, yticks)
                if row == last_row:
                    plt.xlabel("%s %s" % (domain, metric["name"]),
                               fontsize='large')
                if is_prok and metric is ic:
                    # The model name labels the row once, on the left column.
                    plt.ylabel(row_label, fontsize='large')
    plt.tight_layout()
    maybesave(filename)
Ejemplo n.º 39
0
def plot_fragments(fragments,filename=None):
    """Plot the result of ``map_fragments(fragments)`` as a labelled line plot.

    The finished figure is handed to ``maybesave`` together with
    ``filename``.
    """
    read_map = map_fragments(fragments)
    plt.plot(read_map)
    plt.title("Simulated ChIP-Seq Read Map")
    plt.xlabel("Genomic coordinate")
    plt.ylabel("Read Density")
    maybesave(filename)
def _plot_igc_panel(position, predicted_ginis, observed_ginis, patterns,
                    color_dict, marker_dict, marker_size, limits):
    """Draw one predicted-vs-observed scatter panel of a 2x2 figure.

    ``position`` selects the cell of the 2x2 subplot grid and ``limits`` is
    ``(xmin, xmax, ymin, ymax)``.  Every point is coloured and shaped by its
    entry in ``patterns``; the dashed diagonal marks predicted == observed.
    The panel is left as the current axes so the caller can add labels.
    """
    xmin, xmax, ymin, ymax = limits
    plt.subplot(2, 2, position)
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    # One scatter call per point so each point can carry its own marker.
    for x, y, p in zip(predicted_ginis, observed_ginis, patterns):
        plt.scatter(x, y, color=color_dict[p], marker=marker_dict[p], s=marker_size)
    plt.plot([0, 1], [0, 1], linestyle='--', color='black')


def prok_euk_ic_gini_experiment(filename=None,pickle_filename=None):
    """Plot observed vs. spoofed motif Gini coefficients (figure 3 in gini paper).

    When ``pickle_filename`` is None the experiment is run from scratch:
    maxent and uniform spoofs are generated for every prokaryotic and
    eukaryotic motif, the raw spoofs are pickled to
    "prok_euk_ic_gini_experiment.pkl", and the derived per-motif statistics
    are pickled to "prok_euk_ic_gini_all_data.pkl" as a 16-tuple.
    Otherwise a 16-tuple of the same layout is loaded from
    ``pickle_filename``.

    In both cases a 2x2 figure is drawn (rows: prokaryotic / eukaryotic,
    columns: maxent / uniform spoofs) comparing mean spoofed Gini
    coefficients with the observed ones, the Pearson correlation of each
    panel is printed, and the figure is passed to ``maybesave(filename)``.
    """
    if pickle_filename is None:
        # NOTE(review): machine-specific location of parse_jaspar.
        jaspar_dir = "/home/pat/jaspar"
        if jaspar_dir not in sys.path:
            sys.path.append(jaspar_dir)
        from parse_jaspar import euk_motifs
        prok_motifs = bio_motifs
        # Motifs with more than 200 sites are cut down to 200 via sample().
        euk_motifs = [motif if len(motif) <= 200 else sample(200,motif,replace=False)
                      for motif in euk_motifs]
        print("prok maxents")
        prok_maxents = [spoof_motifs_maxent(motif,num_motifs=100) for motif in tqdm(prok_motifs)]
        print("prok uniforms")
        prok_uniforms = [spoof_motifs_uniform(motif,num_motifs=100) for motif in tqdm(prok_motifs)]
        print("euk maxents")
        euk_maxents = [spoof_motifs_maxent(motif,num_motifs=100) for motif in tqdm(euk_motifs)]
        print("euk uniforms")
        euk_uniforms = [spoof_motifs_uniform(motif,num_motifs=100) for motif in tqdm(euk_motifs)]
        # Binary mode is the portable way to write pickles.
        with open("prok_euk_ic_gini_experiment.pkl",'wb') as f:
            cPickle.dump((prok_maxents, prok_uniforms, euk_maxents, euk_uniforms), f)

        def mean_over_spoofs(stat, spoof_sets):
            # Average `stat` over the spoofs generated for each motif.
            return [mean([stat(spoof) for spoof in spoofs]) for spoofs in spoof_sets]

        prok_ics = [motif_ic(motif) for motif in prok_motifs]
        prok_ginis = [motif_gini(motif) for motif in prok_motifs]
        euk_ics = [motif_ic(motif) for motif in euk_motifs]
        euk_ginis = [motif_gini(motif) for motif in euk_motifs]
        prok_maxent_ics = mean_over_spoofs(motif_ic, prok_maxents)
        prok_maxent_ginis = mean_over_spoofs(motif_gini, prok_maxents)
        prok_uniform_ics = mean_over_spoofs(motif_ic, prok_uniforms)
        prok_uniform_ginis = mean_over_spoofs(motif_gini, prok_uniforms)
        euk_maxent_ics = mean_over_spoofs(motif_ic, euk_maxents)
        euk_maxent_ginis = mean_over_spoofs(motif_gini, euk_maxents)
        euk_uniform_ics = mean_over_spoofs(motif_ic, euk_uniforms)
        euk_uniform_ginis = mean_over_spoofs(motif_gini, euk_uniforms)
        prok_patterns = [find_pattern(motif)[0] for motif in tqdm(prok_motifs)]
        euk_patterns = [find_pattern(motif)[0] for motif in tqdm(euk_motifs)]
        with open("prok_euk_ic_gini_all_data.pkl",'wb') as f:
            cPickle.dump((prok_motifs, euk_motifs,
                          prok_ics,prok_ginis,
                          prok_maxent_ics,prok_maxent_ginis,
                          prok_uniform_ics,prok_uniform_ginis,
                          euk_ics,euk_ginis,
                          euk_maxent_ics,euk_maxent_ginis,
                          euk_uniform_ics,euk_uniform_ginis,
                          prok_patterns, euk_patterns),f)
    else:
        # NOTE(review): unpickling can execute arbitrary code; only pass
        # pickle files that come from a trusted source.
        with open(pickle_filename,'rb') as f:
            (prok_motifs, euk_motifs,
             prok_ics, prok_ginis,
             prok_maxent_ics, prok_maxent_ginis,
             prok_uniform_ics, prok_uniform_ginis,
             euk_ics, euk_ginis,
             euk_maxent_ics, euk_maxent_ginis,
             euk_uniform_ics, euk_uniform_ginis,
             prok_patterns,euk_patterns) = cPickle.load(f)

    # Colour and marker for each pattern name used to index the dicts below.
    pattern_names = "direct-repeat inverted-repeat single-box".split()
    color_dict = dict(zip(pattern_names, sns.cubehelix_palette(3)))
    marker_dict = dict(zip(pattern_names, "o x ^".split()))
    limits = (0, 0.6, 0, 0.6)  # xmin, xmax, ymin, ymax shared by all panels
    marker_size = 10
    sns.set_style('white')

    # (grid position, printed tag, predicted, observed, patterns, xlabel, ylabel)
    panels = [
        (1, "prok maxent", prok_maxent_ginis, prok_ginis, prok_patterns,
         None, "Prokaryotic IGC"),
        (2, "prok uniform", prok_uniform_ginis, prok_ginis, prok_patterns,
         None, None),
        (3, "euk maxent:", euk_maxent_ginis, euk_ginis, euk_patterns,
         "MaxEnt IGC", "Eukaryotic IGC"),
        (4, "euk uniform", euk_uniform_ginis, euk_ginis, euk_patterns,
         "TU IGC", None),
    ]
    for position, tag, predicted, observed, patterns, xlabel, ylabel in panels:
        print(tag)
        print(pearsonr(predicted, observed))
        _plot_igc_panel(position, predicted, observed, patterns,
                        color_dict, marker_dict, marker_size, limits)
        # Labels go on the panel that was just drawn (the current axes).
        if ylabel is not None:
            plt.ylabel(ylabel,fontsize='large')
        if xlabel is not None:
            plt.xlabel(xlabel,fontsize='large')
    maybesave(filename)
Ejemplo n.º 41
0
def grand_spoofing_experiment(prok_motifs, euk_motifs):
    """Compare observed motif MI with the mean MI of three kinds of spoofs.

    For every prokaryotic motif, and for a ``subsample`` of every
    eukaryotic motif, ten spoofs are requested from each of
    ``spoof_maxent_motifs``, ``spoof_motif_cftp_occ`` and
    ``spoof_oo_motifs``.  Each list of spoofs is pickled to a file in the
    current directory ("prok_maxent_spoofs", "euk_maxent_spoofs",
    "prok_cftp_spoofs", "euk_cftp_spoofs", "prok_oo_spoofs",
    "euk_oo_spoofs").  A 1x3 figure of predicted (mean spoof) vs. observed
    ``mi_per_col`` is then drawn and passed to
    ``maybesave("mi-spoof-plot.eps")``.
    """
    # should we subsample once or each time??
    # NOTE(review): as written, every use of a eukaryotic motif draws a
    # fresh subsample, including the "observed" MI values below.
    prok_maxent_spoofs = [spoof_maxent_motifs(motif,10) for motif in tqdm(prok_motifs)]
    euk_maxent_spoofs = [spoof_maxent_motifs(subsample(motif), 10) for motif in tqdm(euk_motifs)]
    prok_cftp_spoofs = [spoof_motif_cftp_occ(motif,10) for motif in tqdm(prok_motifs)]
    euk_cftp_spoofs = [spoof_motif_cftp_occ(subsample(motif),10) for motif in tqdm(euk_motifs)]
    prok_oo_spoofs = [spoof_oo_motifs(motif,10) for motif in tqdm(prok_motifs)]
    euk_oo_spoofs = [spoof_oo_motifs(subsample(motif),10) for motif in tqdm(euk_motifs)]

    spoofs_by_file = [
        ("prok_maxent_spoofs", prok_maxent_spoofs),
        ("euk_maxent_spoofs", euk_maxent_spoofs),
        ("prok_cftp_spoofs", prok_cftp_spoofs),
        ("euk_cftp_spoofs", euk_cftp_spoofs),
        ("prok_oo_spoofs", prok_oo_spoofs),
        ("euk_oo_spoofs", euk_oo_spoofs),
    ]
    for path, spoofs in spoofs_by_file:
        # Binary mode is the portable way to write pickles.
        with open(path,'wb') as f:
            cPickle.dump(spoofs, f)
    # The in-memory spoofs are used directly from here on.  Reloading them
    # from differently named "*.pkl" files would either fail or silently
    # substitute stale data for the results computed above.

    def mean_mi(spoof_sets):
        # Mean mi_per_col over the spoofs generated for each motif.
        return [mean([mi_per_col(spoof) for spoof in spoofs])
                for spoofs in tqdm(spoof_sets)]

    prok_mis = [mi_per_col(motif) for motif in prok_motifs]
    prok_maxent_mis = mean_mi(prok_maxent_spoofs)
    euk_subsamples = [subsample(motif) for motif in euk_motifs]
    euk_mis = [mi_per_col(motif) for motif in euk_subsamples]
    euk_maxent_mis = mean_mi(euk_maxent_spoofs)
    prok_cftp_mis = mean_mi(prok_cftp_spoofs)
    euk_cftp_mis = mean_mi(euk_cftp_spoofs)
    prok_oo_mis = mean_mi(prok_oo_spoofs)
    euk_oo_mis = mean_mi(euk_oo_spoofs)

    # (panel title, prokaryotic predictions, eukaryotic predictions)
    panels = [
        ("MaxEnt", prok_maxent_mis, euk_maxent_mis),
        ("Gaussian Linear Ensemble", prok_cftp_mis, euk_cftp_mis),
        ("Match-Mismatch", prok_oo_mis, euk_oo_mis),
    ]
    for index, (title, prok_predicted, euk_predicted) in enumerate(panels, 1):
        plt.subplot(1,3,index)
        scatter(prok_predicted,
                prok_mis)
        scatter(euk_predicted,
                euk_mis,color='g')
        plt.xlabel("Predicted MI",fontsize='large')
        plt.ylabel("Observed MI",fontsize='large')
        plt.title(title,fontsize='large')
    plt.tight_layout()
    maybesave("mi-spoof-plot.eps")
Ejemplo n.º 42
0
def _canonical_digram_frequencies(tests, motifs, q, canonical_digrams):
    """Tally canonical digram frequencies over the column pairs of ``motifs``.

    ``tests`` holds, per motif, one test value for each column pair in the
    order produced by ``choose2`` over the enumerated motif columns.  A
    digram is replaced by its ``wc`` counterpart when that one sorts lower,
    and is counted for all column pairs, for adjacent pairs (j == i + 1),
    and for pairs whose test value is <= ``q``.

    Returns ``(ps, adj_ps, corr_ps, yerr, adj_yerr, corr_yerr)``: normalized
    frequencies in ``canonical_digrams`` order and matching error-bar sizes.
    """
    all_counts = defaultdict(int)
    corr_counts = defaultdict(int)
    adj_counts = defaultdict(int)
    for motif_tests, motif in tqdm(zip(tests, motifs)):
        column_pairs = choose2(list(enumerate(transpose((motif)))))
        for test, ((i,coli),(j,colj)) in zip(motif_tests, column_pairs):
            for bi,bj in transpose((coli,colj)):
                rev_comp = tuple(wc((bi,bj)))
                if (bi, bj) > rev_comp:
                    bi, bj = rev_comp
                all_counts[(bi,bj)] += 1
                if j == i + 1:
                    adj_counts[(bi,bj)] += 1
                if test <= q:
                    corr_counts[(bi,bj)] += 1

    def frequencies_with_error(counts):
        total = float(sum(counts.values()))
        ps = normalize([counts[dg] for dg in canonical_digrams])
        # 1.96 standard errors of a proportion; presumably a 95%
        # normal-approximation interval.
        yerr = [1.96*sqrt(1.0/total*p*(1-p)) for p in ps]
        return ps, yerr

    ps, yerr = frequencies_with_error(all_counts)
    adj_ps, adj_yerr = frequencies_with_error(adj_counts)
    corr_ps, corr_yerr = frequencies_with_error(corr_counts)
    return ps, adj_ps, corr_ps, yerr, adj_yerr, corr_yerr


def _plot_digram_bars(ax, canonical_digrams, palette, frequencies):
    """Draw the three grouped bar series for one domain on the current axes.

    ``frequencies`` is the 6-tuple returned by
    ``_canonical_digram_frequencies``; ``ax`` must be the current axes, since
    the bars are drawn through ``plt`` and the ticks are set on ``ax``.
    """
    ps, adj_ps, corr_ps, yerr, adj_yerr, corr_yerr = frequencies
    positions = range(len(canonical_digrams))
    plt.bar([x-0.2 for x in positions], ps, label="All Column-Pairs",width=0.2,yerr=yerr,color=palette[0])
    plt.bar([x for x in positions],adj_ps,label="Adj. Column-Pairs",
            width=0.2,yerr=adj_yerr,color=palette[1])
    plt.bar([x+0.2 for x in positions],corr_ps,alpha=1,
            capstyle='butt',label="Corr. Adj. Column-Pairs",width=0.2,yerr=corr_yerr,color=palette[3])
    ax.set_xticks([x for x in positions])
    ax.set_xticklabels( ["".join(dg) for dg in canonical_digrams],fontsize='large')


def analyze_correlated_digrams_canonical(prok_tests, euk_tests, filename=None):
    """Bar-plot canonical digram frequencies for prokaryotic and eukaryotic motifs.

    ``prok_tests`` / ``euk_tests`` hold per-motif test values for each column
    pair; ``fdr`` over all of a domain's values gives the threshold at or
    below which a column pair is counted as correlated.  The motifs
    themselves are NOT parameters: the function reads the module-level
    names ``prok_motifs`` and ``euk_motifs``.

    The figure has two stacked panels (prokaryotic on top, eukaryotic
    below), each showing digram frequencies over all column pairs, adjacent
    column pairs and correlated column pairs.  The figure is passed to
    ``maybesave(filename)``.
    """
    digrams = [(b1,b2) for b1 in "ACGT" for b2 in "ACGT"]
    canonical_digrams = sorted(list(set([min(dg,tuple(wc(dg))) for dg in digrams])))
    prok_q = fdr(concat(prok_tests))
    euk_q = fdr(concat(euk_tests))
    prok_frequencies = _canonical_digram_frequencies(
        prok_tests, prok_motifs, prok_q, canonical_digrams)
    euk_frequencies = _canonical_digram_frequencies(
        euk_tests, euk_motifs, euk_q, canonical_digrams)

    palette = sns.cubehelix_palette(4)
    ax = plt.subplot(211)
    _plot_digram_bars(ax, canonical_digrams, palette, prok_frequencies)
    plt.xlim(-0.5,10.5)
    plt.ylim(0,0.3)
    plt.ylabel("Prokaryotic Frequency",fontsize='large')
    plt.legend(loc='upper right')

    ax2 = plt.subplot(212)
    _plot_digram_bars(ax2, canonical_digrams, palette, euk_frequencies)
    plt.xlim(-0.5,10.5)
    plt.ylim(0,0.2)
    plt.ylabel("Eukaryotic Frequency",fontsize='large')
    plt.legend(loc='upper right')
    maybesave(filename)
Ejemplo n.º 43
0
def plot_energy_matrix(matrix,filename=None):
    """Show ``matrix`` as an image after shifting each row by its maximum.

    Every entry has the maximum of its own row subtracted, the shifted rows
    are transposed, and the result is drawn with nearest-neighbour
    interpolation plus a colorbar.  The figure is then passed to
    ``maybesave(filename)``.
    """
    shifted_rows = []
    for row in matrix:
        shifted_rows.append([entry - max(row) for entry in row])
    plt.imshow(transpose(shifted_rows), interpolation='nearest')
    plt.colorbar()
    maybesave(filename)