-
Notifications
You must be signed in to change notification settings - Fork 1
/
AR_model.py
executable file
·322 lines (273 loc) · 13.5 KB
/
AR_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
"""
created on June 24, 2014
@author: Nikola Jajcay
"""
from src import wavelet_analysis
from src.data_class import DataField, load_station_data
from surrogates.surrogates import SurrogateField
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime
from multiprocessing import Process, Queue
def render(diffs, meanvars, stds = None, subtit = '', percentil = None, fname = None):
    """
    Draw the evolution of the difference of the conditional statistic (left y-axis)
    together with the mean of the conditional statistic (right y-axis) and either
    save the figure or show it.

    Besides its arguments the function reads these module-level names:
    cnt, diff_ax, mean_ax, WINDOW_LENGTH, MEANS, first_mid_year, last_mid_year,
    k and a_coeffs.

    diffs     -- if len(diffs) > 3 it is plotted as one single series; otherwise it is
                 indexed as diffs[0] (dark line, labelled "difference DATA") and
                 diffs[1] (light line, labelled "difference SURROGATE mean")
    meanvars  -- same convention as diffs, drawn on the twin (right) axis
    stds      -- optional; stds[0] is added to / subtracted from diffs[1] and stds[1]
                 to / from meanvars[1] to draw a shaded band around the light lines
    subtit    -- text appended as the second line of the title
    percentil -- optional 2D array; entries equal to True in column 0 get a circle
                 marker on diffs[0], entries equal to True in column 1 on meanvars[0]
    fname     -- if not None the figure is written with plt.savefig(fname),
                 otherwise plt.show() is called
    """
    fig, ax1 = plt.subplots(figsize=(11,8))

    # left axis -- difference of the conditional statistic
    if len(diffs) > 3:
        ax1.plot(diffs, color = '#403A37', linewidth = 2, figure = fig)
    else:
        p2, = ax1.plot(diffs[1], color = '#899591', linewidth = 1.5, figure = fig)
        if stds is not None:
            ax1.plot(diffs[1] + stds[0], color = '#899591', linewidth = 0.7, figure = fig)
            ax1.plot(diffs[1] - stds[0], color = '#899591', linewidth = 0.7, figure = fig)
            ax1.fill_between(np.arange(0,diffs[1].shape[0],1), diffs[1] + stds[0], diffs[1] - stds[0],
                             facecolor = "#899591", alpha = 0.5)
        p1, = ax1.plot(diffs[0], color = '#403A37', linewidth = 2, figure = fig)
        # `is not None`: comparing an array with `!= None` is element-wise and the
        # resulting array cannot be used as an `if` condition
        if percentil is not None:
            for pos in np.where(percentil[:, 0] == True)[0]:
                ax1.plot(pos, diffs[0][pos], 'o', markersize = 8, color = '#403A37')
    ax1.axis([0, cnt-1, diff_ax[0], diff_ax[1]])
    ax1.set_xlabel('middle year of %.2f-year wide window' % (WINDOW_LENGTH / 365.25), size = 14)
    # raw strings keep the LaTeX backslash without relying on an invalid escape sequence
    if MEANS:
        ax1.set_ylabel(r'difference in cond mean in temperature [$^{\circ}$C]', size = 14)
    else:
        ax1.set_ylabel(r'difference in cond variance in temperature [$^{\circ}$C$^2$]', size = 14)
    # one tick every 8 windows, labelled with years in steps of 8
    plt.xticks(np.arange(0, cnt+8, 8), np.arange(first_mid_year, last_mid_year+8, 8), rotation = 30)

    # right axis -- mean of the conditional statistic
    ax2 = ax1.twinx()
    if len(meanvars) > 3:
        ax2.plot(meanvars, color = '#CA4F17', linewidth = 2, figure = fig)
    else:
        p4, = ax2.plot(meanvars[1], color = '#64C4A0', linewidth = 1.5, figure = fig)
        if stds is not None:
            ax2.plot(meanvars[1] + stds[1], color = '#64C4A0', linewidth = 0.7, figure = fig)
            ax2.plot(meanvars[1] - stds[1], color = '#64C4A0', linewidth = 0.7, figure = fig)
            # x positions are taken from diffs[1], exactly as on the left axis
            ax2.fill_between(np.arange(0,diffs[1].shape[0],1), meanvars[1] + stds[1], meanvars[1] - stds[1],
                             facecolor = "#64C4A0", alpha = 0.5)
        p3, = ax2.plot(meanvars[0], color = '#CA4F17', linewidth = 2, figure = fig)
        if percentil is not None:
            for pos in np.where(percentil[:, 1] == True)[0]:
                ax2.plot(pos, meanvars[0][pos], 'o', markersize = 8, color = '#CA4F17')
    if MEANS:
        ax2.set_ylabel(r'mean of cond means in temperature [$^{\circ}$C]', size = 14)
    else:
        ax2.set_ylabel(r'mean of cond variance in temperature [$^{\circ}$C$^2$]', size = 14)
    ax2.axis([0, cnt-1, mean_ax[0], mean_ax[1]])
    for tl in ax2.get_yticklabels():
        tl.set_color('#CA4F17')

    # p1..p4 exist only when the two-series branches above were taken
    if len(diffs) < 3:
        plt.legend([p1, p2, p3, p4], ["difference DATA", "difference SURROGATE mean", "mean DATA", "mean SURROGATE mean"], loc = 2)

    # the former 'SURR: Evolution of ...' title was built and then overwritten without
    # ever being drawn (and needed the removed np.int alias), so only the title that
    # is actually used is constructed
    if MEANS:
        tit = ('Evolution of difference in cond means -- AR(%d) model with coeffs: %s \n' % (k, str(a_coeffs)))
    else:
        tit = ('Evolution of difference in cond variance -- AR(%d) model with coeffs: %s \n' % (k, str(a_coeffs)))
    tit += subtit
    plt.text(0.5, 1.05, tit, horizontalalignment = 'center', size = 16, transform = ax2.transAxes)

    if fname is not None:
        plt.savefig(fname)
    else:
        plt.show()
"""
construct univariate AR(k) model in form of X(t) = SUM_k a_k * X(t-k) + epsilon
"""

np.random.seed() # no seed given, so the generator is re-seeded and runs are not reproducible

k = 2 # model order
TS_LEN = 16384 # length of the time series (replaced below by the length of the loaded data)
SIGMA_NOISE = 1 # standard deviation for noise, which is supposed to be Gaussian with mean 0
RANDOM_COEFFS = False
A_COEFFS = [0.7, 0.25] # list of coefficients, should be length of k

## -----------------
ANOMALISE = True
PERIOD = 8 # years, period of wavelet
WINDOW_LENGTH = 16384 # 13462, 16384
WINDOW_SHIFT = 1 # years, delta in the sliding window analysis
MEANS = True # if True, compute conditional means, if False, compute conditional variance
WORKERS = 4
NUM_SURR = 50 # how many surrs will be used to evaluate
SURR_TYPE = 'MF'
diff_ax = (0, 8) # means -> 0, 2, var -> 1, 8
mean_ax = (-1, 1) # means -> -1, 1.5, var -> 9, 18

# the station file is loaded for its time axis and length; g.data is replaced by the
# synthetic AR series further down
g = load_station_data('TG_STAID000027.txt', date(1834,7,28), date(2014,1,1), ANOMALISE)
g_working = DataField()
g_surrs = DataField()

TS_LEN = g.data.shape[0]

# map coeffs to numpy array
if RANDOM_COEFFS:
    # one uniform draw from [-1, 1) per coefficient
    A_COEFFS = [2*np.random.rand() - 1 for _ in range(k)]
a_coeffs = np.array(A_COEFFS)

# initialize first k time points to noise
ts = np.zeros((TS_LEN,))
for i in range(k):
    # scalar draw: storing a 1-element array into a scalar slot is deprecated in NumPy
    ts[i] = np.random.normal(0, SIGMA_NOISE)

# construct time series as AR(k) model
for i in range(k,TS_LEN):
    for j in range(k):
        ts[i] += a_coeffs[j] * ts[i-j-1]
    ts[i] += np.random.normal(0, SIGMA_NOISE)

# amplitude change: second quarter multiplied by 3, third quarter divided by 2
# (floor division keeps the slice bounds integers on Python 2 and Python 3 alike)
ts[TS_LEN//4:TS_LEN//2] *= 3
ts[TS_LEN//2:3*TS_LEN//4] /= 2

print("**WARNING: USING AR(%d) MODEL INSTEAD OF DATA (coeffs are %s)" % (k, str(a_coeffs)))
g.data = ts

print("[%s] Wavelet analysis in progress with %d year window shifted by %d year(s)..." % (str(datetime.now()), WINDOW_LENGTH, WINDOW_SHIFT))
k0 = 6. # wavenumber of Morlet wavelet used in analysis
y = 365.25 # year in days
fourier_factor = (4 * np.pi) / (k0 + np.sqrt(2 + np.power(k0,2)))
period = PERIOD * y # period of interest in days
s0 = period / fourier_factor # get scale

cond_means = np.zeros((8,)) # one slot per phase bin, reused as a scratch buffer
to_wavelet = 16384 if WINDOW_LENGTH < 16000 else 32768 # number of samples passed to the wavelet transform
def get_equidistant_bins():
    """
    return the 9 edges of 8 equally wide phase bins spanning [-pi, pi]
    """
    n_bins = 8
    # n_bins bins need n_bins + 1 edges; linspace already hands back a new array
    edges = np.linspace(-np.pi, np.pi, n_bins + 1)
    return edges
def _cond_difference_surrogates(sg, g_temp, a, start_cut, jobq, resq):
    """
    worker loop used as a Process target: for every job token taken from jobq one
    surrogate is constructed, wavelet-transformed and binned by phase; the pair
    (max - min of the per-bin statistic, mean of the per-bin statistic) is put on resq.
    The loop ends when None is taken from jobq.

    sg        -- surrogate field; its construct_* / add_seasonality methods are called
                 and its surr_data attribute is read and overwritten
    g_temp    -- field whose get_data_of_precise_length supplies the index range of
                 the WINDOW_LENGTH cut starting at start_cut
    a         -- triple (mean, var, trend) handed on to sg.add_seasonality
    start_cut -- start of the cut, passed to get_data_of_precise_length
    jobq      -- queue of job tokens, None is the stop marker
    resq      -- queue receiving one (difference, mean) tuple per processed token

    Reads the module-level SURR_TYPE, MEANS, WINDOW_LENGTH, s0 and k0 and uses the
    module-level array cond_means as scratch storage.
    """
    seas_mean, seas_var, seas_trend = a
    n_bins = cond_means.shape[0]

    while True:
        if jobq.get() is None:
            break

        # build a fresh surrogate; any other SURR_TYPE leaves sg untouched
        if SURR_TYPE == 'AR':
            sg.construct_surrogates_with_residuals()
            # seasonality is passed without its last entry along the first axis
            sg.add_seasonality(seas_mean[:-1, ...], seas_var[:-1, ...], seas_trend[:-1, ...])
        elif SURR_TYPE in ('MF', 'FT'):
            if SURR_TYPE == 'MF':
                sg.construct_multifractal_surrogates()
            else:
                sg.construct_fourier_surrogates_spatial()
            sg.add_seasonality(seas_mean, seas_var, seas_trend)

        # perform wavelet
        transform = wavelet_analysis.continous_wavelet(sg.surr_data, 1, False, wavelet_analysis.morlet, dj = 0, s0 = s0, j1 = 0, k0 = k0)
        coeffs = transform[0]
        full_phase = np.arctan2(np.imag(coeffs), np.real(coeffs))

        # restrict surrogate and phase to the analysed window
        cut = g_temp.get_data_of_precise_length(WINDOW_LENGTH, start_cut, None, False)[2]
        lo, hi = cut[0], cut[1]
        sg.surr_data = sg.surr_data[lo : hi]
        window_phase = full_phase[0, lo : hi]

        edges = get_equidistant_bins() # equidistant bins
        for b in range(n_bins):
            # both bin edges are inclusive
            in_bin = (window_phase >= edges[b]) & (window_phase <= edges[b + 1])
            selected = sg.surr_data[in_bin]
            cond_means[b] = np.mean(selected) if MEANS else np.var(selected, ddof = 1)

        spread = cond_means.max() - cond_means.min()
        resq.put((spread, np.mean(cond_means)))
# NOTE(review): this driver starts worker processes at module level without an
# `if __name__ == "__main__":` guard -- fine with the fork start method, but it
# would need a guard where processes are spawned; confirm the target platform.

# per-window results for the data
difference_data = []
meanvar_data = []
cnt = 0 # number of windows processed so far

# per-window mean / standard deviation over the surrogates
difference_surr = []
difference_surr_std = []
meanvar_surr = []
meanvar_surr_std = []

# per-window flags: data value larger than more than 95% of the surrogate values
difference_95perc = []
mean_95perc = []

first_day = date.fromordinal(g.time[0])
start_year = first_day.year + 4
sm = first_day.month
sd = first_day.day

start_idx = 0
end_idx = to_wavelet

# 4*y is the float 1461.0 -- it has to be an integer to be used as an index
_, _, idx = g.get_data_of_precise_length(WINDOW_LENGTH, date.fromordinal(g.time[int(4*y)]), None, False)
first_mid_year = date.fromordinal(g.time[idx[0] + WINDOW_LENGTH//2]).year

while end_idx < g.data.shape[0]:

    # data
    g_working.data = g.data[start_idx : end_idx].copy()
    g_working.time = g.time[start_idx : end_idx].copy()
    if not np.any(np.isnan(g_working.data)):
        wave, _, _, _ = wavelet_analysis.continous_wavelet(g_working.data, 1, False, wavelet_analysis.morlet, dj = 0, s0 = s0, j1 = 0, k0 = k0) # perform wavelet
        phase = np.arctan2(np.imag(wave), np.real(wave)) # get phases from oscillatory modes
        start_cut = date(start_year+cnt*WINDOW_SHIFT, sm, sd)
        # presumably the last argument makes the call cut g_working in place and return
        # only the index pair -- verify against DataField.get_data_of_precise_length
        idx = g_working.get_data_of_precise_length(WINDOW_LENGTH, start_cut, None, True) # 16k or 13462
        # %-formatting prints the same text under Python 2 and Python 3
        print("data  %s  -  %s" % (g.get_date_from_ndx(start_idx), g.get_date_from_ndx(end_idx)))
        print("cut from  %s  to  %s" % (start_cut, g_working.get_date_from_ndx(-1)))
        last_mid_year = date.fromordinal(g_working.time[WINDOW_LENGTH//2]).year
        phase = phase[0, idx[0] : idx[1]]
        phase_bins = get_equidistant_bins() # equidistant bins
        for i in range(cond_means.shape[0]): # get conditional means for current phase range
            ndx = ((phase >= phase_bins[i]) & (phase <= phase_bins[i+1]))
            if MEANS:
                cond_means[i] = np.mean(g_working.data[ndx])
            else:
                cond_means[i] = np.var(g_working.data[ndx], ddof = 1)
        difference_data.append(cond_means.max() - cond_means.min()) # append difference to list
        meanvar_data.append(np.mean(cond_means))
    else:
        difference_data.append(np.nan)
        meanvar_data.append(np.nan)

    # surrogates
    if NUM_SURR != 0:
        surr_completed = 0
        diffs = np.zeros((NUM_SURR,))
        mean_vars = np.zeros_like(diffs)
        g_surrs.data = g.data[start_idx : end_idx].copy()
        g_surrs.time = g.time[start_idx : end_idx].copy()
        # same slice as g_working above, so start_cut is defined whenever this branch runs
        if not np.any(np.isnan(g_surrs.data)):
            # construct the job queue: one token per surrogate, one stop marker per worker
            jobQ = Queue()
            resQ = Queue()
            for i in range(NUM_SURR):
                jobQ.put(1)
            for i in range(WORKERS):
                jobQ.put(None)
            a = g_surrs.get_seasonality(DETREND = True)
            sg = SurrogateField()
            sg.copy_field(g_surrs)
            if SURR_TYPE == 'AR':
                sg.prepare_AR_surrogates()
            workers = [Process(target = _cond_difference_surrogates, args = (sg, g_surrs, a, start_cut, jobQ, resQ)) for iota in range(WORKERS)]
            for w in workers:
                w.start()
            # collect the results before joining the workers
            while surr_completed < NUM_SURR:
                # get result
                diff, meanVar = resQ.get()
                diffs[surr_completed] = diff
                mean_vars[surr_completed] = meanVar
                surr_completed += 1
            for w in workers:
                w.join()

            difference_surr.append(np.mean(diffs))
            difference_surr_std.append(np.std(diffs, ddof = 1))
            meanvar_surr.append(np.mean(mean_vars))
            meanvar_surr_std.append(np.std(mean_vars, ddof = 1))

            # flag the window if the data value exceeds more than 95% of the surrogate values
            no_true = np.count_nonzero(difference_data[-1] > diffs)
            difference_95perc.append(bool(no_true > NUM_SURR * 0.95))

            no_true = np.count_nonzero(meanvar_data[-1] > mean_vars)
            mean_95perc.append(bool(no_true > NUM_SURR * 0.95))
            print("%d. time point - data: %.2f, surr mean: %.2f, surr std: %.2f" % (cnt, difference_data[-1], np.mean(diffs), np.std(diffs, ddof = 1)))
        else:
            difference_surr.append(0)
            difference_surr_std.append(0)
            meanvar_surr.append(0)
            meanvar_surr_std.append(0)
            # keep the flag lists aligned with the window index so that the markers
            # drawn by render() stay on the right windows
            difference_95perc.append(False)
            mean_95perc.append(False)

    cnt += 1

    # floor division: the year passed to date() has to be an integer
    if WINDOW_LENGTH > 16000:
        start_idx = g.find_date_ndx(date(start_year - 4 + WINDOW_SHIFT*5*cnt//7, sm, sd))
    else:
        start_idx = g.find_date_ndx(date(start_year - 4 + WINDOW_SHIFT*cnt, sm, sd))
    end_idx = start_idx + to_wavelet

print("[%s] Wavelet analysis on data done." % (str(datetime.now())))

difference_data = np.array(difference_data)
meanvar_data = np.array(meanvar_data)
difference_95perc = np.array(difference_95perc)
mean_95perc = np.array(mean_95perc)

# column 0 -> markers on the difference curve, column 1 -> markers on the mean curve
where_percentil = np.column_stack((difference_95perc, mean_95perc))

fn = ("debug/AR%d_model_coef_%.2fand%.2f_*3from_quater_to_half_div2_after_%s_%d_%ssurr_%s_window.png" % (k, a_coeffs[0],
      a_coeffs[1], 'means' if MEANS else 'var', NUM_SURR, SURR_TYPE, '32to16k' if WINDOW_LENGTH > 16000 else '16to14k'))

render([difference_data, np.array(difference_surr)], [meanvar_data, np.array(meanvar_surr)], [np.array(difference_surr_std), np.array(meanvar_surr_std)],
       subtit = ("95 percentil: difference - %d/%d and mean %d/%d" % (np.count_nonzero(difference_95perc), cnt, np.count_nonzero(mean_95perc), cnt)),
       percentil = where_percentil, fname = fn)