-
Notifications
You must be signed in to change notification settings - Fork 0
/
bootstrap.py
140 lines (111 loc) · 3.58 KB
/
bootstrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Find changepoints using binary segmentation and the bootstrap. The main function is find_all_cps.
"""
import numpy as np
import bottleneck as bn
def welch(xs, ys, meanfcn=bn.nanmedian):
    """
    Welch's statistic for equal means.
    http://en.wikipedia.org/wiki/Welch%27s_t_test

    Parameters
    ----------
    xs: np.array
    ys: np.array
    meanfcn: callable
        Location estimator applied to each sample (default: nanmedian).

    Returns
    -------
    float
        |mean(xs) - mean(ys)| / sqrt(var(xs)/n_x + var(ys)/n_y)
    """
    xbar, ybar = map(meanfcn, (xs, ys))
    # BUG FIX: the original lambda ignored its argument and computed
    # bn.nanvar(xs) for BOTH samples, so sy2 was always equal to sx2.
    # np.spacing(1) guards against a zero denominator.
    sx2, sy2 = (bn.nanvar(zs) + np.spacing(1) for zs in (xs, ys))
    return np.abs(xbar - ybar) / np.sqrt(sx2 / len(xs) + sy2 / len(ys))
def check_nobs(N, minnobs):
    """
    Validate that a sample of size N can contain a changepoint with at
    least `minnobs` observations on each side.

    Parameters
    ----------
    N: int
        Sample size.
    minnobs: int
        Minimum observations required on each side of a changepoint.

    Raises
    ------
    ValueError
        If N < 2*minnobs + 1.
    """
    if N < 2 * minnobs + 1:
        # BUG FIX: the original message claimed the requirement was
        # "greater than or equal to minnobs"; the check actually
        # requires N >= 2*minnobs + 1.
        raise ValueError(
            "Sample size must be at least 2*minnobs + 1 "
            "(got N=%d, minnobs=%d)" % (N, minnobs))
def compute_endpoints(N, minnobs):
    """
    Bounds of the candidate split positions for a sample of length N.

    A changepoint must leave at least `minnobs` observations on either
    side, so admissible split indices are the half-open range
    [minnobs, N - minnobs).

    Returns
    -------
    (int, int)
        (start, end) bounds for the candidate split indices.
    """
    start = minnobs
    end = N - minnobs
    return start, end
def _most_likely_cp(xs, minnobs):
    """
    Scan every admissible split of xs and return the split that
    maximizes the Welch statistic.

    Parameters
    ----------
    xs: np.array
    minnobs: int
        Shortest interval to search for a changepoint.

    Returns
    -------
    (int, float)
        (changepoint index into xs, Welch statistic at that index)

    Raises
    ------
    ValueError
        If len(xs) < 2*minnobs + 1 (via check_nobs).
    """
    N = len(xs)
    check_nobs(N, minnobs)
    start, end = compute_endpoints(N, minnobs)
    # BUG FIX: xrange is Python 2 only (NameError on Python 3);
    # range is equivalent here.
    wstats = np.array([welch(xs[:i], xs[i:]) for i in range(start, end)])
    # nanargmax skips splits whose statistic came out NaN.
    cp = bn.nanargmax(wstats)
    stat = wstats[cp]
    return cp + start, stat
def most_likely_cp(xs, minnobs, nsamples):
    """
    Finds the most likely changepoint in xs and the corresponding p-value.

    Parameters
    ----------
    xs: np.array
    minnobs: int
        Shortest interval to search for a changepoint.
    nsamples: int
        Number of bootstrap samples used for the p-value.

    Returns
    -------
    (changepoint index, p-value)
    """
    idx, wstat = _most_likely_cp(xs, minnobs)
    prob = pval(wstat, xs, minnobs=minnobs, nsamples=nsamples)
    return idx, prob
def bootstrap_cps(xs, nsamples, minnobs):
    """
    Computes an array of Welch stats using bootstrapped samples from xs.

    Each entry is the maximal Welch statistic of one resample (drawn
    with replacement, same length as xs).

    Parameters
    ----------
    xs: np.array
    nsamples: int
        Number of bootstrap resamples.
    minnobs: int
        Shortest interval to search for a changepoint.

    Returns
    -------
    np.array of length nsamples
    """
    N = len(xs)
    res = np.zeros(nsamples)
    # BUG FIX: xrange is Python 2 only (NameError on Python 3);
    # range is equivalent here.
    for i in range(nsamples):
        ys = np.random.choice(xs, N)  # resample with replacement
        _, stat = _most_likely_cp(ys, minnobs)
        res[i] = stat
    return res
def pval(wstat, xs, minnobs, nsamples):
    """
    Computes the bootstrapped p-value for a Welch statistic on the
    sample xs: one minus the fraction of bootstrap statistics strictly
    below wstat.
    """
    boot = bootstrap_cps(xs, minnobs=minnobs, nsamples=nsamples)
    below = bn.nanmean(boot < wstat)
    return 1 - below
def _find_all_cps(xs, nsamples, index, minnobs, crit_val):
    # Recursive binary segmentation: locate the best split of xs, keep
    # it if significant, then recurse into both halves. `index` maps
    # local positions back into the original (global) array.
    if xs is None or len(xs) < 2 * minnobs + 1:
        return []  # segment too short to split further
    cp_local, prob = most_likely_cp(xs, minnobs=minnobs, nsamples=nsamples)
    if prob >= crit_val:
        return []  # best split not significant; stop recursing here
    cp_global = cp_local + index
    found = [(cp_global, prob)]
    # Left half keeps the current offset; right half starts just past
    # the changepoint.
    found.extend(_find_all_cps(xs[:cp_local], minnobs=minnobs,
                               nsamples=nsamples, index=index,
                               crit_val=crit_val))
    found.extend(_find_all_cps(xs[(cp_local + 1):], minnobs=minnobs,
                               nsamples=nsamples, index=cp_global + 1,
                               crit_val=crit_val))
    return found
def find_all_cps(xs, minnobs=10, nsamples=50, crit_val=0.1):
    """
    Finds all changepoints for the sample xs where the p-value is below
    crit_val.

    Parameters
    ----------
    xs: np.array
    minnobs: int
        Shortest interval to search for a changepoint.
    nsamples: int
        Number of bootstrap samples for computing p-values.
    crit_val: float
        Threshold for which we'll keep changepoints.

    Returns
    -------
    [(changepoint index, pvalue)]
        Sorted by changepoint index.

    Example
    -------
    >>> xs = np.zeros(300)
    >>> xs[100:] += 1.0
    >>> xs[200:] += 1.0
    >>> cps = [c for c, _ in find_all_cps(xs, crit_val=0.01)]
    >>> assert 100 in cps
    >>> assert 200 in cps
    """
    candidates = _find_all_cps(xs, minnobs=minnobs, index=0,
                               nsamples=nsamples, crit_val=crit_val)
    significant = [pair for pair in candidates if pair[1] < crit_val]
    significant.sort(key=lambda pair: pair[0])
    return significant