-
Notifications
You must be signed in to change notification settings - Fork 0
/
pvalues.py
133 lines (119 loc) · 5.32 KB
/
pvalues.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""Use maxent sampling to derive pvalues for motifs based on ic"""
from maxent_sampling import *
from utils import log_sum, random_motif, mean, random_site
from matplotlib import pyplot as plt
from math import log, exp, sqrt, erf, pi
from tqdm import *
from scipy.special import erfcx
def ic_log_pvalue(N, L, des_ic, verbose=False, trials=100, method="ub"):
print des_ic
correction_per_col = 3/(2*log(2)*N)
K = L * correction_per_col # correction per motif
ic_for_beta = des_ic + K
tolerance = 10**-10
beta = find_beta_for_mean_motif_ic(N,L,ic_for_beta,tolerance=tolerance,verbose=verbose) # correct val of beta
countses = enumerate_counts(N)
entropies = np.array(map(entropy_from_counts, countses))
iterator = tqdm(countses) if verbose else countses
log_cols = np.array(map(log_counts_to_cols, iterator))
log_Zq = log_sum(log_cols + -beta*entropies)*L
log_Zp = N*L*log(4)
#log_prefactor = log_Zq - log_Zp + beta*2*L
log_prefactor = log_Zq - log_Zp + beta*(2*L-K)
if method == "UB":
log_expectation_ub = (-beta*(des_ic))
log_pval_ub = log_prefactor + log_expectation_ub
return log_pval_ub - log(2)
elif method == "analytic":
mu, sigma = calc_params(N, L, beta)
log_expectation = log(compute_expectation_spec(beta, mu, sigma))
log_pval = log_prefactor + log_expectation
return log_pval
else:
ms = maxent_motifs(N, L, des_ic, trials, beta=beta)
ics = map(motif_ic, ms)
print "des_ic, mean ics:", des_ic, mean(ics)
log_expectation = log_sum([-beta*ic for ic in ics if ic > des_ic]) - log(trials) # Xxx loss of precision
log_pval = log_prefactor + log_expectation
return log_pval
def calc_params(N, L, beta):
"""compute ic sd of maxent motifs"""
countses = enumerate_counts(N)
entropies = np.array(map(entropy_from_counts, countses))
iterator = tqdm(countses)
log_cols = np.array(map(log_counts_to_cols, iterator))
log_phats = np_log_normalize(log_cols + -beta*entropies)
ps = np.exp(log_phats)
correction_per_col = 3/(2*log(2)*N)
K = L * correction_per_col # correction per motif
mu = ps.dot(entropies)
sigma_sq = ps.dot(entropies**2) - mu**2
return 2*L - (mu*L + K), sqrt(sigma_sq*L)
def validation_plot(L=10, N=50, ref_trials=1000):
check_points = np.linspace(0,10,10)
#ics_ref = sorted([motif_ic(random_motif(L, N)) for i in range(ref_trials)])
ics_ref = sorted([motif_ic(random_motif(L, N), correct=True) for i in trange(ref_trials)])
plt.plot(ics_ref, 1 - np.linspace(0,1,len(ics_ref)),label="Empirical Complementary CDF", marker='o',linestyle='')
plt.plot(check_points, [exp(ic_log_pvalue(N, L, ic, method="MC")) for ic in check_points],
label="Importance Sampling Estimate")
plt.plot(check_points, [exp(ic_log_pvalue(N, L, ic, method="UB")) for ic in check_points],
label="Analytic Upper Bound")
plt.plot(check_points, [exp(ic_log_pvalue(N, L, ic, method="analytic")) for ic in check_points],
label="Analytic P-value")
plt.semilogy()
plt.legend()
plt.xlabel("Information Content (bits)")
plt.ylabel("P-value")
plt.xlim(0,1.2)
plt.show()
#return ics_ref
def test_integral(beta, mu, sigma, trials=10000):
obs = mean(exp(beta*x) for x in [random.gauss(mu, sigma) for i in trange(trials)])
pred = exp(beta**2*sigma**2/2.0 + beta*mu)
return pred, obs
def compute_expectation(beta, mu, sigma):
"""compute \int(exp(-beta*x)*phi(x;mu,sigma^2)) from mu to infty"""
print beta, mu, sigma
# we do this because the first term is liable to overflow
second_term = erfc((beta*sigma)/(sqrt(2)))
if second_term > 0:
first_term = 1/2.0 * exp(beta**2*sigma**2/2.0 + -beta*mu)
return first_term * second_term
else:
return 0
def compute_expectation_spec(beta, mu, sigma):
"""compute \int(exp(-beta*x)*phi(x;mu,sigma^2)) from mu to infty"""
print beta, mu, sigma
# we do this because the first term is liable to overflow
u = beta*sigma/sqrt(2)
prefactor = 1/2.0 * exp(-beta*mu)
# first_term_ref = exp(u**2)
# second_term_ref = erfc(u)
# ans_ref = prefactor * first_term * second_term
ans = prefactor * erfcx(u)
# print ans_ref, ans
return ans
def compute_log_expectation(beta, mu, sigma):
"""compute \int(exp(-beta*x)*phi(x;mu,sigma^2)) from mu to infty"""
print beta, mu, sigma
# we do this because the first term is liable to overflow
log_second_term = log1p(-erf((beta*sigma**2)/(sigma*sqrt(2))))
log_first_term = 1/2.0 * exp(beta**2*sigma**2/2.0 + -beta*mu)
return first_term * second_term
def approx_expectation(beta, mu, sigma, trials=1000):
return mean(exp(-beta*x) if x > mu else 0 for x in np.random.normal(mu,sigma,trials))
def test_expectation():
beta = random.random() * 10
mu = random.random() * 10 - 5
sigma = random.random() * 10
return compute_expectation(beta, mu, sigma), approx_expectation(beta, mu, sigma, trials=10000)
def alignment_simulation():
ell = 100
L = 10
N = 50
trials = 100000
seqs = [random_site(ell) for i in range(N)]
def random_alignment():
rs = [random.randrange(ell-L+1) for _ in range(N)]
return [seq[r:r+L] for seq, r in zip(seqs,rs)]
ics = [motif_ic(random_alignment()) for _ in trange(trials)]