def motif_corr(motif,n=1000):
    """Identify significantly correlated column pairs in a motif.

    Computes a permutation p-value (n permutations) for the mutual
    information of every column pair, then applies an FDR correction.
    Returns a list of ((i, j), p) tuples for pairs passing the cutoff,
    or None when the FDR procedure yields no cutoff.
    """
    uncorrected_mi = lambda xs,ys: mi(xs,ys,correct=False)
    ps = []
    for col1, col2 in choose2(transpose(motif)):
        ps.append(mi_permute(col1,col2,p_value=True,n=n,mi_method=uncorrected_mi))
    q = fdr(ps)
    if q is None:
        return None
    L = len(motif[0])
    return [(pair, p) for pair, p in zip(choose2(range(L)), ps) if p <= q]
# Example #2
def pairwise_mi_ref(ps,M=None):
    """Reference implementation of total pairwise mutual information.

    ps is a probability distribution over all 4^w kmers (w inferred from
    len(ps)); M is passed through to marginalize.  Accumulates joint
    dinucleotide frequencies for every column pair (i, j) and sums the
    MI contributions against the marginal frequencies psfm.
    """
    w = int(log(len(ps),4))
    dimer_freqs = defaultdict(lambda: np.zeros(16))
    dimer_base_index = {"".join(comb):k for k,comb in enumerate(product("ACGT","ACGT"))}
    psfm = marginalize(ps,M)
    for kmer,p in zip(make_kmers(w),ps):
        for (i,j) in choose2(range(w)):
            dimer = kmer[i] + kmer[j]
            dimer_freqs[(i,j)][dimer_base_index[dimer]] += p
    # Skip zero-probability dimers: 0*log(0) -> 0 by convention, and log(0)
    # would otherwise raise a math domain error.  Use floor division (k//4)
    # so the marginal index stays integral under true-division semantics.
    return sum(dimer_freqs[(i,j)][k]*log2(dimer_freqs[(i,j)][k]/(psfm[i][k//4]*psfm[j][k%4]))
               for k in range(16) for (i,j) in choose2(range(w))
               if dimer_freqs[(i,j)][k])
def analyze_motif(motif, trials=1000):
    cols = transpose(motif)
    L = len(cols)
    ps = []
    for col1, col2 in (choose2(cols)):
        actual_mi = dna_mi(col1,col2)
        perm_mis = [dna_mi(col1,permute(col2)) for i in xrange(trials)]
        p = percentile(actual_mi, perm_mis)
        #print p
        ps.append(p)
    q = fdr(ps)
    correlated_pairs = [(i,j) for (i,j),p in zip(choose2(range(L)),ps) if p < q]
    num_correlated = len(correlated_pairs)
    print "correlated column pairs:", num_correlated, "%1.2f" % ((num_correlated)/choose(L,2))
    return correlated_pairs
def expected_mi(L, k):
    """Expected MI between two columns, given site length L and k mismatches per site.

    Models a motif whose consensus base is "A": each site carries exactly
    k mismatches over L positions, so the joint distribution of two
    positions follows sampling without replacement; mismatches are spread
    uniformly over the three non-consensus bases.
    """
    L = float(L)
    q = k / L  # prob mismatch
    p = 1 - q
    match = lambda b: b == "A"

    def joint(b1, b2):
        # joint probability of observing bases (b1, b2) at two positions
        if match(b1):
            if match(b2):
                return (L - k) / L * (L - k - 1) / (L - 1)
            else:
                return (L - k) / L * k / (L - 1) / 3.0
        else:
            if match(b2):
                return (k) / L * (L - k) / (L - 1) / 3.0
            else:
                return (k) / L * (k - 1) / (L - 1) / 9.0

    def marg(b):
        # marginal probability of a single base
        return p if match(b) else q / 3

    def term(b1, b2):
        # evaluate joint once per pair (was computed up to 3x);
        # 0 * log(0) -> 0 by convention
        j = joint(b1, b2)
        return j * log2(j / (marg(b1) * marg(b2))) if j else 0

    return sum(term(b1, b2) for b1, b2 in choose2("ACGT"))
def analyze_motif(motif, trials=1000):
    cols = transpose(motif)
    L = len(cols)
    ps = []
    for col1, col2 in (choose2(cols)):
        actual_mi = dna_mi(col1, col2)
        perm_mis = [dna_mi(col1, permute(col2)) for i in xrange(trials)]
        p = percentile(actual_mi, perm_mis)
        #print p
        ps.append(p)
    q = fdr(ps)
    correlated_pairs = [(i, j) for (i, j), p in zip(choose2(range(L)), ps)
                        if p < q]
    num_correlated = len(correlated_pairs)
    print "correlated column pairs:", num_correlated, "%1.2f" % (
        (num_correlated) / choose(L, 2))
    return correlated_pairs
def interpret_main_experiment(results_dict):
    taus = sorted(results_dict.keys())
    print taus
    data = [(tau,f,motif_ic(extract_sites(s)),total_motif_mi(extract_sites(s)))
            for tau in taus for (s,f) in results_dict[tau][0]]
    cols = transpose(data)
    names = "tau,f,motif_ic,total_motif_mi".split(",")
    for (i,name1),(j,name2) in choose2(list(enumerate(names))):
        xs = cols[i]
        ys = cols[j]
        print name1,name2,pearsonr(xs,ys),spearmanr(xs,ys)
def analyze_column_frequencies():
    """Do columnwise frequencies reveal stable patterns that could be
explained by amino acid preferences?

    Pools per-column base frequencies over all TFs in tfdf_obj and
    scatter-plots every pair of the four base-frequency dimensions.
    """
    def dna_freqs(xs):
        # relative frequency of each base in column xs
        return [xs.count(b)/float(len(xs)) for b in "ACGT"]
    all_freqs = concat([map(dna_freqs,transpose(getattr(tfdf_obj,tf)))
                         for tf in tfdf_obj.tfs])
    # hoisted out of the loop: the transposed frequency table is invariant
    cols = transpose(all_freqs)
    for k,(i,j) in enumerate(choose2(range(4))):
        # matplotlib subplot indices are 1-based; k starts at 0, so the
        # original subplot(4,4,k) raised on the first iteration
        plt.subplot(4,4,k+1)
        plt.scatter(cols[i],cols[j])
def get_pairwise_freqs(motif, pc=1/16.0):
    """Return smoothed dinucleotide frequency dicts, one per column pair.

    Each dict maps (b1, b2) -> frequency, computed from the paired
    columns of motif with pseudocount pc added to every dinucleotide
    before normalization.
    """
    cols = transpose(motif)
    L = len(cols)
    N = len(motif)
    n_pairs = int(choose(L,2))
    fs = [{pair: 0 for pair in dinucs} for _ in range(n_pairs)]
    denom = float(N + 16*pc)
    for f, (col1, col2) in zip(fs, choose2(cols)):
        for observed in zip(col1, col2):
            f[observed] += 1
        for pair in dinucs:
            f[pair] = (f[pair] + pc) / denom
    return fs
# Example #9
def interpret_main_experiment(results_dict):
    taus = sorted(results_dict.keys())
    print taus
    data = [(tau, f, motif_ic(extract_sites(s)),
             total_motif_mi(extract_sites(s))) for tau in taus
            for (s, f) in results_dict[tau][0]]
    cols = transpose(data)
    names = "tau,f,motif_ic,total_motif_mi".split(",")
    for (i, name1), (j, name2) in choose2(list(enumerate(names))):
        xs = cols[i]
        ys = cols[j]
        print name1, name2, pearsonr(xs, ys), spearmanr(xs, ys)
def analyze_all_pvals_at_once(org_obj=Escherichia_coli):
    """conclusion: fdr-adjusted p-values identify 25 significantly
    correlated column-pairs in 3753 pairwise tests (0.5%).  
    """
    ps = [mi_permute(col1,col2,p_value=True,n=1000,mi_method=lambda xs,ys:mi(xs,ys,correct=False))
          for tf in tqdm(org_obj.tfs)
          for (col1,col2) in (choose2(transpose(getattr(org_obj,tf))))]
    q_bh = fdr(ps)
    q_bhy = bhy(ps)
    print "bh procedure: %s/%s" % (len(filter(lambda p:p <= q_bh,ps)),len(ps))
    print "bhy procedure: %s/%s" % (len(filter(lambda p:p <= q_bhy,ps)),len(ps))
    return ps
# Example #11
def get_pairwise_freqs(motif, pc=1 / 16.0):
    """For every pair of motif columns, build a pseudocounted, normalized
    dinucleotide frequency table keyed by (b1, b2)."""
    cols = transpose(motif)
    L = len(cols)
    N = len(motif)
    total = float(N + 16 * pc)
    tables = [dict.fromkeys(dinucs, 0) for _ in range(int(choose(L, 2)))]
    for table, (col1, col2) in zip(tables, choose2(cols)):
        # tally observed dinucleotides from the paired columns
        for duo in zip(col1, col2):
            table[duo] += 1
        # smooth and normalize in one pass
        for duo in dinucs:
            table[duo] = (table[duo] + pc) / total
    return tables
def log_fitness_approx(matrix,motif,G,terms=2):
    n = len(motif)
    eps = [score_seq(matrix,site) for site in motif]
    fgs = [exp(-ep) for ep in eps]
    Zf = sum(fgs)
    Zb = Zb_from_matrix(matrix,G)
    Z = Zf + Zb
    zeroth_term = log(n+Zb) * (terms >= 0)
    first_term = (-1/(n+Zb)*sum(eps)) * (terms >= 1)
    second_term = 1/2.0*1/(n+Zb)**2*((n + Zb - 1)*sum(ep**2 for ep in eps) -
                                     sum(epi*epj for epi,epj in choose2(eps))) * (terms >= 2)
    print zeroth_term,first_term,second_term
    # first_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps)))
    # second_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps)) + 1/2.0*1/(n+Zb)**2*((n)))
    return -sum(eps) - n*(zeroth_term + first_term + second_term)
def analyze_composition_of_correlated_columns(obj,ps):
    """Compare base-pair composition of correlated vs uncorrelated column pairs.

    Walks every TF motif in obj, consuming the flat p-value list ps in the
    same pair order, and tallies (b1, b2) base-pair counts into three bins:
    correlated-adjacent, correlated-nonadjacent, and uncorrelated.  Prints
    a per-dinucleotide table of counts, frequencies, odds ratios and
    chi-square significance, and returns the three count dicts.

    NOTE(review): ps must have been produced by iterating obj.tfs and
    choose2 of each motif's columns in exactly this order -- confirm
    against the p-value producer (cf. analyze_all_pvals_at_once).
    """
    p_idx = 0
    cor_adj_counts = defaultdict(int)
    cor_nonadj_counts = defaultdict(int)
    uncor_counts = defaultdict(int)
    # NOTE(review): fdr_cutoff is assigned but never used; the test below
    # hard-codes 0, so only exactly-zero p-values count as "correlated".
    fdr_cutoff = 0
    for tf in obj.tfs:
        motif = getattr(obj,tf)
        cols = transpose(motif)
        for (i,col1),(j,col2) in choose2(list(enumerate(cols))):
            if ps[p_idx] <= 0:
                print tf,i,j
                for pair in zip(cols[i],cols[j]):
                    # adjacent means consecutive column indices
                    if i + 1 == j:
                        cor_adj_counts[pair] += 1
                    else:
                        cor_nonadj_counts[pair] += 1
                #print mi_table(col1,col2)
            else:
                for pair in zip(cols[i],cols[j]):
                    uncor_counts[pair] += 1
            p_idx += 1
    cor_adj_N = float(sum(cor_adj_counts.values()))
    cor_nonadj_N = float(sum(cor_nonadj_counts.values()))
    uncor_N = float(sum(uncor_counts.values()))
    # all_N = float(sum(all_counts.values()))
    # print "---"
    # for b1,b2 in sorted(counts.keys()):
    #     
    #     print b1,b2,"freq:",fmt(counts[(b1,b2)]/N),"background:",fmt(all_counts[(b1,b2)]/all_N),"OR:",fmt(counts[(b1,b2)]/N/(all_counts[(b1,b2)]/all_N)),p
    print "bases, adj, nonadj, noncor | adj freq, nonadj freq | noncor freq| adj OR, nonadj OR"
    # XXX split into adj_uncor, nonadj_uncor
    for b1,b2 in sorted(cor_adj_counts.keys()):
        # NOTE(review): the odds ratios below divide fmt() outputs -- this
        # only works if fmt returns a number, not a formatted string; verify
        # fmt's return type.
        cor_adj_freq = fmt(cor_adj_counts[(b1,b2)]/cor_adj_N)
        cor_nonadj_freq = fmt(cor_nonadj_counts[(b1,b2)]/cor_nonadj_N)
        uncor_freq = fmt(uncor_counts[(b1,b2)]/uncor_N)
        cor_adj_OR = fmt(cor_adj_freq/uncor_freq)
        cor_nonadj_OR = fmt(cor_nonadj_freq/uncor_freq)
        # chi-square tests compare each correlated bin against the
        # uncorrelated background for this dinucleotide
        _,adj_p,_,_ = stats.chi2_contingency(np.array([[uncor_N,uncor_counts[(b1,b2)]],
                                                       [cor_adj_N,cor_adj_counts[(b1,b2)]]]))
        _,non_adj_p,_,_ = stats.chi2_contingency(np.array([[uncor_N,uncor_counts[(b1,b2)]],
                                                       [cor_nonadj_N,cor_nonadj_counts[(b1,b2)]]]))
        print b1,b2,cor_adj_counts[b1,b2],cor_nonadj_counts[b1,b2],uncor_counts[b1,b2],"|",cor_adj_freq,cor_nonadj_freq,"|",uncor_freq,"|",cor_adj_OR, significance(adj_p),cor_nonadj_OR,significance(non_adj_p)
    return cor_adj_counts, cor_nonadj_counts, uncor_counts
def expected_mi(L,k):
    """expected MI between two columns, given site length L and k mismatches per site

    Consensus base is "A"; each site has exactly k mismatches over L
    positions, so the two-position joint follows sampling without
    replacement, with mismatches split evenly over the 3 other bases.
    """
    L = float(L)
    q = k/L # prob mismatch
    p = 1 - q
    match = lambda b:b=="A"
    def joint(b1,b2):
        # joint probability of observing (b1,b2) at two positions
        if match(b1):
             if match(b2):
                 return (L-k)/L * (L-k-1)/(L-1)
             else:
                 return (L - k)/L * k/(L - 1) / 3.0
        else:
            if match(b2):
                return (k)/L * (L-k)/(L-1) / 3.0
            else:
                return (k)/L * (k-1)/(L-1) / 9.0
    def marg(b):
        return p if match(b) else q/3
    def term(b1,b2):
        # evaluate joint once per pair (was computed up to 3x);
        # 0*log(0) -> 0 by convention
        j = joint(b1,b2)
        return j*log2(j/(marg(b1)*marg(b2))) if j else 0
    return sum(term(b1,b2) for b1,b2 in choose2("ACGT"))
def site_sampling_methods_study(n=50, num_motifs=10, plot=True):
    """validate that the three proposed sampling methods:

    brute force
    rejection sampling
    metropolis hastings

    do in fact sample from the same distribution
    """

    L = 10
    sigma = 1
    matrix = sample_matrix(L, sigma)
    Ne = 5
    mu = -10
    print "bf"
    t0 = time.time()
    bf_motifs = [sample_motif_bf(matrix, mu, Ne, n,verbose=True)
                 for i in trange(num_motifs)]
    bf_time = time.time() - t0
    print "ar"
    t0 = time.time()
    ar_motifs = [sample_motif_ar(matrix, mu, Ne, n)
                 for i in range(num_motifs)]
    ar_time = time.time() - t0
    print "mh"
    t0 = time.time()
    mh_motifs = [sample_motif_mh(matrix, mu, Ne, n)
                 for i in range(num_motifs)]
    mh_time = time.time() - t0
    icss = mmap(motif_ic,[bf_motifs, ar_motifs, mh_motifs])
    print "ics:", map(mean_ci, icss)
    print "time per motif:", [t/num_motifs
                              for t in [bf_time, ar_time, mh_time]]
    if plot:
        plt.boxplot(icss)
    for xs, ys in choose2(icss):
        print mannwhitneyu(xs,ys)
# Example #16
def Psi(xs):
    """return probability weight associated with configuration, up to
    Z.  P(xs) = Psi(xs)/Z

    The weight is the product of single-site factors psi_jjp(x) times
    the product of pairwise coupling factors psi_jjp(xj, xjp) over all
    position pairs.
    """
    # NOTE(review): `product` here must be a numeric product over a
    # generator (not itertools.product, which returns an iterator and
    # would break the multiplication below), and psi_jjp must accept
    # both one argument (field term) and two (coupling term) -- confirm
    # against their definitions elsewhere in the project.
    single_terms = product(psi_jjp(x) for x in xs)
    pair_terms = product(psi_jjp(xj,xjp) for (xj,xjp) in choose2(xs))
    return  single_terms * pair_terms 
# Example #17
def score(model, site):
    """Score a site by summing, for each pair of site positions, the
    corresponding weight table's entry for that base pair."""
    total = 0
    for weights, pair in zip(model, choose2(site)):
        total += weights[pair]
    return total
def score(model, site):
    """Sum each position-pair's weight from model: the k-th weight table
    is indexed by the k-th (b1, b2) pair produced by choose2(site)."""
    paired = zip(model, choose2(site))
    return sum(weights[pair] for weights, pair in paired)
# Example #19
def alo2(ps):
    """Second-order inclusion-exclusion approximation to P(at least one
    event), given independent event probabilities ps: sum of the
    probabilities minus the sum of all pairwise products."""
    first_order = sum(ps)
    second_order = sum(pi * pj for pi, pj in choose2(ps))
    return first_order - second_order