/
CDST_demo.py
763 lines (583 loc) · 26.5 KB
/
CDST_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
#!/usr/bin/env python
#coding:utf-8
# Author: Chris Musselle -- <chris.j.musselle@gmail.com>
#
# Purpose: The Change-point Detecting Subspace Tracker (CD-ST) Algorithm
#
# File defining the CDST object that contains all methods for the algorithm.
# Running this file will define a CDST object and run one experiment as defined
# by the parameters and choice of data set after the if __name__ == '__main__'
# statement.
#
# References and Acknowledgements:
# The FHST_iter function is an implementation of the Fast Row Householder
# Subspace Tracking algorithm which first appeared in the following reference.
#
# Strobach, P. (2009). The fast recursive row-Householder subspace tracking algorithm.
# Signal Process., 89(12), 2514-2528. Amsterdam, The Netherlands, Elsevier North-Holland, Inc.
# Created: 11/23/11
# Run from current path location
import sys
import os
sys.path.append(os.getcwd() + '/datasets')
sys.path.append(os.getcwd() + '/utils')
import numpy as np
from numpy import dot
import scipy as sp
from math import sqrt
import numpy.linalg as npl
import matplotlib.pyplot as plt
from plot_utils import plot_2x1, plot_3x1, plot_4x1, adjust_spines
from utils import fmeasure, clean_zeros, zscore, zscore_win
from load_data import load_data, load_ts_data
from gen_anom_data import gen_a_grad_persist, gen_a_peak_dip, gen_a_step, gen_a_periodic_shift
from SAX import SAX, plot_SAX, bp_lookup
class CDST(object):
    """Change-point Detecting Subspace Tracker (CD-ST).

    Class that holds all methods of CDST.

    ``version`` is a string specifying the combination of modules to use:
        F = FHST version
        A = Anomaly Detection version
        S = SAX version, still a work in progress.
    Format = 'F-xxxxxxx.A-xxxxxxxx.S-xxxxxxxx'
    (the third, '.S-...' part is optional).
    """
def __init__(self, version, p, numStreams = 1):
self.F_version = version.split('.')[0]
self.A_version = version.split('.')[1]
if len(version.split('.')) > 2:
self.S_version = version.split('.')[2]
else:
self.S_version = 'none'
self.numStreams = numStreams
# Calculate threshold. Depends on whether test is one or two tailed.
if '+ve' in self.A_version or '-ve' in self.A_version:
p['t_thresh'] = sp.stats.t.isf(1.0 * p['FP_rate'], p['SRE_sample_N'])
elif 'both' in self.A_version:
p['t_thresh'] = sp.stats.t.isf(0.5 * p['FP_rate'], p['SRE_sample_N'])
self.p = p
self.p['version'] = version
""" Initialise all CD-ST variables """
r = self.p['init_r']
# Q_0
if self.p['fix_init_Q'] != 0: # fix inital Q as identity
q_0 = np.eye(numStreams);
Q = q_0
else: # generate random orthonormal matrix N x r
Q = np.eye(numStreams) # Max size of Q
Q_0, R_0 = npl.qr(np.random.rand(numStreams,r))
Q[:,:r] = Q_0
# S_0
small_value = self.p['small_value']
S = np.eye(numStreams) * small_value # Avoids Singularity
# v-1
v = np.zeros((numStreams,1))
# U(t-1) for eigenvalue estimation
U = np.eye(numStreams)
# Define st dictionary
""" This stores variables from one timestep to the next """
self.st = {'Q' : Q, # Orthogonal dominant subspace vectors
'S' : S, # Energy
'v' : v, # used for S update
'U' : U, # Used for eigen value calculations
'r' : r, # Previous rank of Q and number of hidden variables h
't' : 0, # Timestep, used for ignoreup2
'sumEz' : 0.0, # Exponetial sum of zt Energy
'sumEh': 0.0, # Exponential sum of ht energy
'anomaly': np.array([0]*self.numStreams,dtype = bool)}
# Vars for SAX usage
if 'none' not in self.S_version:
self.st['SAX_trigger_q'] = []
self.st['SAX_snapshots'] = {}
def re_init(self, numStreams):
    """Reset the tracker state for ``numStreams`` input streams.

    Any previously tracked results (``self.res``) are discarded and the
    ``self.st`` dictionary, which carries variables from one time step to
    the next, is rebuilt from the parameters in ``self.p``.
    """
    self.numStreams = numStreams

    # This deletes all tracked values
    if hasattr(self, 'res'):
        delattr(self, 'res')

    rank = self.p['init_r']

    # Q_0: start from the identity (maximum size of Q); unless the initial
    # Q is fixed, overwrite the leading columns with a random orthonormal
    # basis obtained from a QR factorisation.
    basis = np.eye(numStreams)
    if self.p['fix_init_Q'] == 0:
        random_basis = npl.qr(np.random.rand(numStreams, rank))[0]
        basis[:, :rank] = random_basis

    # S_0: scaled identity, avoids singularity
    energy = self.p['small_value'] * np.eye(numStreams)

    state = {}
    state['Q'] = basis                      # Orthogonal dominant subspace vectors
    state['S'] = energy                     # Energy
    state['v'] = np.zeros((numStreams, 1))  # used for S update
    state['U'] = np.eye(numStreams)         # Used for eigen value calculations
    state['r'] = rank                       # Previous rank of Q and number of hidden variables h
    state['t'] = 0                          # Timestep, used for ignoreup2
    state['sumEz'] = 0.0                    # Exponential sum of zt energy
    state['sumEh'] = 0.0                    # Exponential sum of ht energy
    state['anomaly'] = np.zeros(self.numStreams, dtype = bool)
    self.st = state

    # vars for SAX usage
    if 'none' not in self.S_version:
        self.st.update({'SAX_trigger_q': [], 'SAX_snapshots': {}})
def next_input(self, zt):
    """Feed one new observation ``zt`` through the configured modules.

    Runs, in order, the subspace tracker (when 'FHST' is in the F version),
    the anomaly detector (when 'SRE' is in the A version) and the SAX step
    (unless the S version contains 'none').  An unrecognised F or A version
    is reported on stdout and that stage is skipped.

    The messages are printed with the single-argument call form of
    ``print`` so the method is valid on both Python 2 and Python 3.
    """
    # Run Subspace Tracking
    if 'FHST' in self.F_version:
        self.FHST_iter(zt)
    else:
        print('Error: %s method for subspace tracking not recognised' % (self.F_version))

    # Run anomaly Detection
    if 'SRE' in self.A_version:
        self.anomaly_SREstat_fast(zt)
    else:
        print('Error: %s method for detect anomalies not recognised' % (self.A_version))

    # Run Sax Method
    if 'none' not in self.S_version:
        self.SAX_simple(zt)
def FHST_iter(self, zt):
    """Iterable version of the Fast row Householder algorithm.

    Performs one subspace-tracking update for the new observation and
    stores the results back into ``self.st``.

    Parameters
    ----------
    zt : ndarray
        Data at the next time step; the driver passes it as a column
        vector of shape (numStreams, 1).

    Reads from ``self.st``
        'Q' - Orthogonal dominant subspace vectors
        'S' - Energy
        'v' - Used for speed up of calculating the X update
        'r' - Previous rank of Q and number of hidden variables h
        'sumEz' / 'sumEh' - Exponential sums of zt / ht energy

    Writes to ``self.st``
        the leading ``r`` slices of 'Q', 'v' and 'S', plus 'ht' (hidden
        variables padded with NaN up to numStreams), 'Ez', 'Eh', 'sumEz',
        'sumEh' and 'e_ratio'.
    """
    # Load Variables
    st = self.st
    p = self.p
    r = st['r']

    # NOTE algorithm's Q, S and v matrices/vectors are kept at max size N
    # (constant memory).  Create aliases for the current value of r.
    Qt = st['Q'][:, :r]
    vt = st['v'][:r, :]
    St = st['S'][:r, :r]

    # Check S remains non-singular.  St is a view, so this clamps the
    # stored matrix in place.
    for idx in range(r):
        if St[idx, idx] < p['small_value']:
            St[idx, idx] = p['small_value']

    # ---- Begin main algorithm ----
    ht = dot(Qt.T, zt)
    # The difference of the two inner products is a size-1 array; take the
    # Python scalar explicitly because math.sqrt() would otherwise rely on
    # NumPy's deprecated implicit size-1-array -> scalar conversion.
    Z = (dot(zt.T, zt) - dot(ht.T, ht)).item()

    if Z > 0:
        # Strobach version of FHST with use of extra u_vec terms
        u_vec = dot(St, vt)
        X = (p['alpha'] * St) + (2 * p['alpha'] * dot(u_vec, vt.T)) + dot(ht, ht.T)

        # Solve X.T * b_vec = sqrt(Z) * ht to find b_vec
        root_Z = sqrt(Z)
        b_vec = npl.solve(X.T, root_Z * ht)

        beta = 4 * (dot(b_vec.T, b_vec).item() + 1)
        phi_sq = 0.5 + (1.0 / sqrt(beta))
        phi = sqrt(phi_sq)
        gamma = (1.0 - 2 * phi_sq) / (2 * phi)
        delta = phi / root_Z

        vt = gamma * b_vec
        St = X - ((1 / delta) * dot(vt, ht.T))
        w = (delta * ht) - (vt)
        ee = delta * zt - dot(Qt, w)
        Qt = Qt - 2 * dot(ee, vt.T)
    else:
        # Z is not > 0: implies norm of ht is > zt or zt = 0
        St = p['alpha'] * St  # Continue decay of S matrix

    # ---- Store values ----
    st['Q'][:, :r] = Qt
    st['v'][:r, :] = vt
    st['S'][:r, :r] = St

    # Record hidden variables, NaN-padded to a fixed length of numStreams
    ht_vec = np.hstack((ht.T[0, :], np.array([np.nan] * (self.numStreams - r))))
    st['ht'] = ht_vec

    # Energy Ratio Calculations
    st['Ez'] = np.sum(zt ** 2)  # the norm squared of zt
    st['Eh'] = np.sum(ht ** 2)  # the norm squared of ht

    st['sumEz'] = p['alpha'] * st['sumEz'] + st['Ez']  # Energy of Data
    st['sumEh'] = p['alpha'] * st['sumEh'] + st['Eh']  # Energy of Hidden Variables

    if st['sumEz'] == 0:  # Catch NaNs
        st['e_ratio'] = 0.0
    else:
        st['e_ratio'] = st['sumEh'] / st['sumEz']

    self.st = st
def anomaly_SREstat_fast(self, zt):
    """Calculate a test statistic for the residual of the reconstructed zt.

    The squared reconstruction error (SRE) is taken as ``Ez - Eh`` from
    ``self.st`` and pushed into a sliding window.  The newest first
    difference of that window is divided by a scale estimate built from
    older differences (separated from the newest one by
    ``p['dependency_lag']`` steps) and stored as ``st['t_stat']``.

    Once ``st['t']`` exceeds ``p['ignoreUp2']`` the statistic is compared
    with ``p['t_thresh']`` on the tail(s) named in the A version ('+ve',
    '-ve' or 'both').  On a detection ``st['anomaly']`` is replaced by a
    boolean vector marking the streams whose absolute reconstruction error
    is above the mean absolute error.

    Parameters
    ----------
    zt : ndarray
        Current observation; indexed as ``zt[:, 0]``, i.e. a 2-D column.
    """
    st = self.st
    p = self.p
    sample_N = p['SRE_sample_N']
    lag = p['dependency_lag']

    # Fast way: Squared Reconstruction Error (SRE), i.e. the squared norm
    # of the residual error vector, without forming the reconstruction.
    SRE = (st['Ez'] - st['Eh'])

    # Build/Slide the SRE window ('in' instead of the Python-2-only has_key)
    if 'SRE_win' in st:
        st['SRE_win'][:-1] = st['SRE_win'][1:]  # Shift Window
        st['SRE_win'][-1] = SRE
    else:
        st['SRE_win'] = np.zeros(((sample_N + lag), 1))
        st['SRE_win'][-1] = SRE

    # Differenced SRE
    st['SRE_dif_sample'] = np.diff(st['SRE_win'], axis = 0)
    st['SRE_dif_t'] = st['SRE_dif_sample'][-1]

    # Reference sample: everything except the newest `lag` differences.
    # An explicit stop index is used because the slice [..:-lag] is empty
    # when lag == 0, which would zero the scale estimate.
    n_dif = st['SRE_dif_sample'].shape[0]
    reference = st['SRE_dif_sample'][:max(n_dif - lag, 0)]
    SRE_sample_sum = (reference ** 2).sum()

    # Calculate Test Statistic
    st['t_stat'] = st['SRE_dif_t'] / np.sqrt(SRE_sample_sum / (sample_N - 1.0))

    # Three possible versions to test threshold: +ve only, -ve only or both
    calc_anomaly = 0
    if st['t'] > p['ignoreUp2']:
        if '+ve' in self.A_version and st['t_stat'] > p['t_thresh']:
            calc_anomaly = 1
        elif '-ve' in self.A_version and st['t_stat'] < -p['t_thresh']:
            calc_anomaly = 1
        elif 'both' in self.A_version and np.abs(st['t_stat']) > p['t_thresh']:
            calc_anomaly = 1

    if calc_anomaly:
        # Explicitly calculate reconstruction error vector
        recon = np.dot(st['Q'][:, :st['r']], st['ht'][:st['r']])
        error = zt[:, 0] - recon
        # Anomaly is vector of bools showing which streams are responsible
        # (threshold value may need changing depending on application)
        st['anomaly'] = np.abs(error) > np.abs(error).mean()

    self.st = st
def SAX_simple(self, zt):
    """Simplest implementation of SAX in FRAHST.

    Uses a sliding window over recent values - may be able to improve to
    an iterative version later.  Takes a SAX snapshot when the anomalous
    point is halfway down the zt sample.

    Each call slides two windows of length ``p['zt_sample_size']``: one
    over the first hidden variable (``st['ht'][0]``) and one over the raw
    observation.  When an anomaly is flagged, a trigger time half a window
    ahead is queued; when the current time step is found in the queue a
    SAX snapshot of both windows is stored in ``st['SAX_snapshots']`` under
    the string index of the time step half a window back.

    Parameters
    ----------
    zt : ndarray
        Current observation; indexed as ``zt[:, 0]``, i.e. a 2-D column.
    """
    p = self.p
    st = self.st

    if 'none' not in self.S_version:
        window = p['zt_sample_size']

        # Build/Slide ht sample window
        # ('in' replaces dict.has_key, which does not exist on Python 3)
        if 'ht_sample' in st:
            st['ht_sample'][:-1] = st['ht_sample'][1:]  # Shift Window
            st['ht_sample'][-1] = st['ht'][0]
        else:
            st['ht_sample'] = np.zeros(window)
            st['ht_sample'][-1] = st['ht'][0]

        # Build/Slide zt sample window
        if 'zt_sample' in st:
            st['zt_sample'][:-1, :] = st['zt_sample'][1:, :]  # Shift Window
            st['zt_sample'][-1, :] = zt[:, 0]
        else:
            st['zt_sample'] = np.zeros((window, self.numStreams))
            st['zt_sample'][-1, :] = zt[:, 0]

        # If anomaly at current time step, store the trigger time in the queue
        if np.any(st['anomaly']):
            st['SAX_trigger_q'].append(int(st['t'] + np.round(window / 2.)))

        # Once any of these times are reached, take SAX snapshot and
        # remove it from the queue
        if st['SAX_trigger_q'] and st['t'] in st['SAX_trigger_q']:
            # get SAX of hidden Var
            SAX_array_ht, SAX_dic_ht, seg_means_ht = SAX(np.atleast_2d(st['ht_sample']), p['SAX_alphabet_size'],
                                                         p['word_size'], minstd = 0.0001, pre_normed = False)
            # Get SAX of data
            SAX_array_zt, SAX_dic_zt, seg_means_zt = SAX(st['zt_sample'], p['SAX_alphabet_size'],
                                                         p['word_size'], minstd = 0.1, pre_normed = False)

            # Remove from queue
            st['SAX_trigger_q'].remove(st['t'])

            # store for data
            index = str(int(st['t'] - np.round(window / 2.)))
            st['SAX_snapshots'][index] = {'zt_SAX_a' : SAX_array_zt,
                                          'zt_SAX_d' : SAX_dic_zt,
                                          'zt_seg_m' : seg_means_zt}
            # store for hidden variable
            st['SAX_snapshots'][index].update({'ht_SAX_a' : SAX_array_ht,
                                               'ht_SAX_d' : SAX_dic_ht,
                                               'ht_seg_m' : seg_means_ht})

    self.st = st
def track_var(self, values = (), print_anom = 0):
    """Track the specified state variables over time.

    At the very least the time step of each anomaly and the anomalous
    streams flagged are recorded.  Must be called once per time step: it
    also increments ``st['t']``.

    Parameters
    ----------
    values : iterable of str
        Keys of ``self.st`` to record.  On the first call each value is
        stored as-is; on later calls the new value is stacked underneath
        with ``np.vstack``.
    print_anom : int
        When 1, print a message for every time step flagged as anomalous.

    Results are kept in ``self.res``; 'anomalies' holds the flagged time
    steps and 'anomalous_streams' the matching boolean stream masks.
    """
    if not hasattr(self, 'res'):
        # initialise res
        self.res = {}
        for k in values:
            current = self.st[k]
            # Copy arrays so a later in-place update of the live state
            # cannot rewrite the first recorded row.
            if isinstance(current, np.ndarray):
                current = current.copy()
            self.res[k] = current
        self.res['anomalies'] = []
        self.res['anomalous_streams'] = []
    else:
        # stack values for all keys
        for k in values:
            self.res[k] = np.vstack((self.res[k], self.st[k]))

    # If anomaly is present, print if specified.
    if np.any(self.st['anomaly']):
        if print_anom == 1:
            print('Found Anomaly at t = {0}'.format(self.st['t']))

        self.res['anomalies'].append(self.st['t'])
        # Store a snapshot, not the live array: the driver loop clears
        # st['anomaly'] in place on the next iteration, which would
        # otherwise wipe the recorded mask.
        self.res['anomalous_streams'].append(np.array(self.st['anomaly'], copy=True))

    # Increment time
    self.st['t'] += 1
def plot_res(self, var, xname = 'Time Steps', ynames = None, title = None, hline= 1, anom = 1):
    """Plot each of the elements given in var.

    Parameters
    ----------
    var : list
        Variables to plot, maximum 4.  A string entry is looked up in the
        ``self.res`` structure; anything else is plotted as given.  The
        list passed in is not modified.
    xname : str
        Label for the x axis.
    ynames : list of str, optional
        Labels for the y axes; defaults to four empty strings.
    title : str, optional
        Figure title; defaults to the version string in ``self.p``.
    hline : int
        Whether to plot threshold values on the final plot.  Skipped when
        no threshold is available (the A version is not an 'SRE' one or
        ``p['t_thresh']`` is None).
    anom : int
        Whether to plot anomalous time ticks.
    """
    if ynames is None:
        ynames = [''] * 4

    if title is None:
        title = (self.p['version'])

    # Only SRE-based detectors define a threshold to draw.
    thresh = self.p['t_thresh'] if 'SRE' in self.A_version else None

    # Resolve names into tracked results without touching the caller's list.
    series = [self.res[v] if isinstance(v, str) else v for v in var]
    num_plots = len(series)

    if num_plots == 1:
        plt.figure()
        plt.plot(series[0])
        plt.title(title)
        if anom == 1:
            for x in self.res['anomalies']:
                plt.axvline(x, ymin=0.9, color='r')

    elif num_plots in (2, 3, 4):
        if num_plots == 2:
            plot_2x1(series[0], series[1], ynames[:2], xname, main_title = title)
        elif num_plots == 3:
            plot_3x1(series[0], series[1], series[2], ynames[:3], xname, main_title = title)
        else:
            plot_4x1(series[0], series[1], series[2], series[3], ynames[:4], xname, main_title = title)
            plt.title(title)

        if hline == 1 and thresh is not None:
            # Span the threshold lines over the tracked history; fall back
            # to the length of the last plotted series if 'ht' is untracked.
            if 'ht' in self.res:
                x_max = self.res['ht'].shape[0]
            else:
                x_max = len(series[-1])
            plt.hlines(-thresh, 0, x_max, linestyles = 'dashed')
            plt.hlines(+thresh, 0, x_max, linestyles = 'dashed')
            plt.ylim(-3 * thresh, 3 * thresh)

        if anom == 1:
            # Mark anomalies on every axis except the last one
            f = plt.gcf()
            for ax in f.axes[:-1]:
                for x in self.res['anomalies']:
                    ax.axvline(x, ymin=0.9, color='r')

    plt.draw()
def batch_analysis(self, gt_list, anomalies_list, epsilon = 0, accumulative = 1, keep_sets = 1):
    """Calculate all anomaly detection metrics.

    Parameters
    ----------
    gt_list : list
        One entry per initial condition; ``gt_list[k]['gt']`` is the ground
        truth table, each entry of which has
        dtype = [('start','i4'),('loc','i4'),('len','i4'),('mag','i4'),('type','a10')].
        Only the 'start' and 'len' fields are read here.
    anomalies_list : list
        One sequence of detected time steps per initial condition.
    epsilon : int
        Used to allow for lagged detections: a detection inside
        [start, start + len + epsilon] is considered a TP.
    accumulative : int
        Whether multiple calls act accumulatively on the metric values.
        When 0, the metrics are reset once at the start of this call.
    keep_sets : int
        Whether to store the TP / FP / FN sets and the per-segment
        detection counts (``self.detection_sets``, ``self.anom_detect_tab``).

    Results are written to ``self.metric``.  Ratios whose denominator is
    zero (e.g. recall with no ground-truth anomalies) are reported as 0.0
    instead of raising ZeroDivisionError.
    """
    # Initialise metrics once per call (not once per initial condition,
    # which would discard all but the last condition when accumulative == 0).
    if not hasattr(self, 'metric') or accumulative == 0:
        self.metric = { 'TP' : 0.0 ,
                        'FP' : 0.0 ,
                        'FN' : 0.0 ,
                        'TN' : 0.0,
                        'precision' : 0.0 ,
                        'recall' : 0.0 ,
                        'F1' : 0.0,
                        'F2' : 0.0,
                        'F05' : 0.0,
                        'FPR' : 0.0,
                        'FDR' : 0.0,
                        'ACC' : 0.0}
        self.detection_sets = []
        self.anom_detect_tab = []

    metric = self.metric

    # For each initial condition
    for k in range(len(anomalies_list)):
        gt_table = gt_list[k]['gt']
        starts = gt_table['start']
        lengths = gt_table['len']

        # Detections, ignoring the start-up period
        detections = np.array(anomalies_list[k])
        D = set(detections[detections > self.p['ignoreUp2']])

        # set of point anomalies detected as true
        anom_TP = set()
        # Set of anomalous segments detected
        anom_segments_detected_set = set()
        # Table to record frequency of anomalous segment detections
        anomalies_detected_tab = np.zeros((len(starts), 2))
        anomalies_detected_tab[:, 0] = starts

        # TRUE POSITIVES: a segment counts once however many detections hit it
        for i in range(len(starts)):
            seg_start = starts[i]
            seg_end = seg_start + lengths[i] + epsilon
            hits = [d for d in D if seg_start <= d <= seg_end]
            if hits:
                if seg_start not in anom_segments_detected_set:
                    anom_segments_detected_set.add(seg_start)
                    metric['TP'] += 1
                anom_TP.update(hits)
            anomalies_detected_tab[i, 1] = len(hits)

        # FALSE Pos
        anom_FP = D - anom_TP
        metric['FP'] += len(anom_FP)
        # FALSE Neg
        anom_FN = set(starts) - anom_segments_detected_set
        metric['FN'] += len(anom_FN)
        # True Negatives
        metric['TN'] += (self.st['t'] - self.p['ignoreUp2'] - len(anom_FN) - len(anom_FP) - len(anom_TP))

        tp, fp = metric['TP'], metric['FP']
        fn, tn = metric['FN'], metric['TN']

        # With no detections at all, precision and FDR keep their
        # previous values.
        if tp + fp != 0:
            metric['precision'] = tp / (tp + fp)
            metric['FDR'] = fp / (fp + tp)

        metric['recall'] = tp / (tp + fn) if (tp + fn) != 0 else 0.0
        metric['FPR'] = fp / (tn + fp) if (tn + fp) != 0 else 0.0

        total = tp + fn + tn + fp
        metric['ACC'] = (tp + tn) / total if total != 0 else 0.0

        if tp + fn + fp != 0:
            metric['F1'] = self.fmeasure(1, tp, fn, fp)
            metric['F2'] = self.fmeasure(2, tp, fn, fp)
            metric['F05'] = self.fmeasure(0.5, tp, fn, fp)
        else:
            # F-measure is undefined with no hits, misses or false alarms
            metric['F1'] = metric['F2'] = metric['F05'] = 0.0

        if keep_sets == 1:
            sets = {'TP' : anom_TP,
                    'anom_seg_detected' : anom_segments_detected_set,
                    'FN' : anom_FN,
                    'FP' : anom_FP}
            self.detection_sets.append(sets)
            self.anom_detect_tab.append(anomalies_detected_tab)
def fmeasure(self, B, hits, misses, falses):
    """General formula for the F measure.

    Uses TP (hits), FN (misses) and FP (falses):

        F_B = (1 + B^2) * TP / ((1 + B^2) * TP + B^2 * FN + FP)

    Parameters
    ----------
    B : number
        Weighting of recall relative to precision (1, 2, 0.5, ...).
    hits, misses, falses : number
        Counts of true positives, false negatives and false positives.

    Returns
    -------
    float
        The F measure, or 0.0 when all three counts are zero and the
        measure is undefined.
    """
    # float() keeps integer inputs from floor-dividing under Python 2.
    weight = float(B) ** 2
    numerator = (1.0 + weight) * hits
    denominator = numerator + weight * misses + falses
    if denominator == 0:
        return 0.0
    return numerator / denominator
def plot_SAX(self, anomaly):
    """Plot the SAX snapshot stored for a given detected anomaly.

    ``anomaly`` is converted with ``str`` and used as the key into
    ``st['SAX_snapshots']``; the segment means of the data window
    ('zt_seg_m') are handed to the plotting routine together with the
    alphabet size and compression ratio from ``self.p``.
    """
    # Need to add input checks
    snapshot = self.st['SAX_snapshots'][str(anomaly)]
    segment_means = snapshot['zt_seg_m']
    # NOTE: inside the class this bare name resolves to the module-level
    # plot_SAX function, not to this method.
    plot_SAX(segment_means, self.p['SAX_alphabet_size'], self.p['comp_ratio'])
def plot_dat(self, anomaly, data, standardise = 1):
    """Plot the data around a given detected anomaly.

    Parameters
    ----------
    anomaly : int
        Time step of the detected anomaly.
    data : ndarray
        2-D data array, indexed as ``data[time, stream]``.
    standardise : int
        When true, plot the z-scored window on bare axes with the SAX
        breakpoints drawn across it; otherwise plot the raw window.

    The window covers half of ``p['zt_sample_size']`` time steps on each
    side of ``anomaly`` and is clipped at the start of the data.
    """
    # Need to add input checks
    # np.round is used so the half-window matches the one SAX_simple uses
    # for its snapshot index.
    sample_half = int(np.round(self.p['zt_sample_size'] / 2.))

    # Clip at zero: a negative start would silently index from the end.
    first = max(anomaly - sample_half, 0)
    dat = data[first:anomaly + sample_half, :]

    bpList = bp_lookup(self.p['SAX_alphabet_size'])

    if standardise:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(zscore(dat))
        for bp in bpList:
            # axhline's xmin/xmax are axes fractions (0..1); the defaults
            # already span the full width.
            ax.axhline(y=bp, ls = '--', color = 'k')
        adjust_spines(ax, ['bottom'])

        ax.set_yticklabels([])
        ax.yaxis.set_ticks([])

        # Pin one tick per sample so each label is the true time step of
        # the point it sits under.
        positions = list(range(dat.shape[0]))
        ax.set_xticks(positions)
        ax.set_xticklabels([first + offset for offset in positions])
        ax.tick_params(axis='x', labelsize=18)
    else:
        plt.figure()
        plt.plot(dat)
        plt.hlines(bpList, xmin=0, xmax=dat.shape[0]-1, linestyles = 'dashed', color = 'k')
if __name__=='__main__':
    # Demo run: configure the parameters, load and standardise one data set,
    # stream it through a CDST instance and plot the tracked results.

    # ``import scipy as sp`` alone does not guarantee that the ``stats``
    # submodule is loaded, so import it explicitly before using it.
    import scipy.stats

    # ---- Experimental Run Parameters ----
    p = {'alpha': 0.98,                 # The exponential decay factor
         'init_r' : 2,                  # the initial number of hidden variables
         'fix_init_Q' : 0,              # whether to fix initial Q as Identity or make random
         'small_value' : 0.0001,        # Used to avoid singularities
         'ignoreUp2' : 50,              # Starting time steps ignored for anomalies
         # Statistical Anomaly Detection
         'SRE_sample_N' : 20,           # Size of SRE sample
         'dependency_lag' : 5,          # lag between current and sampled SRE
         't_thresh' : None,             # Threshold value for test statistic
         'FP_rate' : 10**-5,            # Significance level of statistical test
         # SAX Parameters
         'word_size' : 10,              # No. of symbols in each word
         'zt_sample_size' : 10,         # No. of data points in sample
         'SAX_alphabet_size' : 6,       # No. of characters in word
         'comp_ratio' : None}           # Compression Ratio

    # Calculate threshold (CDST.__init__ recomputes it for the chosen tail)
    p['t_thresh'] = scipy.stats.t.isf(1.0 * p['FP_rate'], p['SRE_sample_N'])
    p['comp_ratio'] = float(p['zt_sample_size']) / float(p['word_size'])

    # Anomalous Data Parameters for using synthetic data set
    a = { 'N' : 50,                      # No. of streams
          'T' : 1000,                    # Total time steps
          'periods' : [15, 50, 70, 90],  # periods of sinusoids
          'L' : 10,                      # Length of anomaly (start)
          'L2' : 200,                    # Length of anomaly (hold)
          'M' : 5,                       # Magnitude of anomaly
          'pA' : 0.1,                    # Percentage of streams that are anomalous
          'noise_sig' : 0.0,             # noise added
          'seed' : 234}                  # Random seed

    anomaly_type = 'grad_persist'  # choice of peak_dip, grad_persist and step

    gen_funcs = dict(peak_dip = gen_a_peak_dip,
                     grad_persist = gen_a_grad_persist,
                     step = gen_a_step)

    # ---- Choice of data sets. Comment/Uncomment to choose. ----

    # Synthetic Data sets
    #data_name = 'synth'
    #D = gen_funcs[anomaly_type](**a)
    #raw_data = D['data']
    #data = raw_data.copy()

    # ISP data sets
    data_name = 'isp_routers'
    raw_data = load_ts_data(data_name, 'full')
    data = raw_data.copy()

    # Sensor Motes data sets
    #data_name = 'motes_l'
    #raw_data = load_data(data_name)
    #data = clean_zeros(raw_data, cpy=1)

    # ---- Data Preprocessing ----
    # Data is loaded into memory, mean centered and standardised,
    # then converted to an iterable to be read by the CD-ST each iteration.
    #data = zscore_win(data, 100)  # Sliding window implementation
    data = zscore(data)            # Batch method implementation
    data = np.nan_to_num(data)
    z_iter = iter(data)
    numStreams = data.shape[1]

    # ---- Initialise CDST Algorithm ----
    CDST_alg = CDST('F-FHST.A-SREboth', p, numStreams)

    # Variables recorded at every time step (loop-invariant, so built once)
    tracked_values = ['ht','e_ratio','r', 't_stat', 'SRE_dif_t', 'Ez', 'Eh', 'recon']

    # ---- Main Loop ----
    for zt in z_iter:
        zt = zt.reshape(zt.shape[0],1)  # Convert to a column Vector

        # Reset anomaly flag if last iteration flagged anomaly
        if np.any(CDST_alg.st['anomaly']):
            CDST_alg.st['anomaly'][:] = False

        # Next Input Method
        CDST_alg.next_input(zt)

        # Store data
        # Calculate reconstructed data if needed for plotting visualisations
        st = CDST_alg.st
        CDST_alg.st['recon'] = np.dot(st['Q'][:,:st['r']],st['ht'][:st['r']])

        CDST_alg.track_var(tracked_values, print_anom = 1)
        #CDST_alg.track_var()

    # ---- Plot Results ----
    #CDST_alg.plot_res([data, 'ht', 't_stat'])
    #CDST_alg.plot_res([data, 'recon', 't_stat'])
    CDST_alg.plot_res([data, 'ht', 't_stat'], ynames =['Standardised Data', 'Hidden Variables', 'Test Statistic'])
    CDST_alg.plot_res([data, 'SRE_dif_t', 't_stat'], ynames =['Standardised Data', 'SRE', 'Test Statistic'])
    CDST_alg.plot_res([raw_data, data, 't_stat'], ynames =['Raw Data', 'Standardised Data', 'Test Statistic'])