from matplotlib import pyplot

# SAX here is assumed to be the saxpy-style class used throughout these
# snippets (SAX(wordSize, alphabetSize) with to_letter_rep / sliding_window);
# import it from wherever it lives in your project.


def discretize(dataset, filter='L_T1', readings_per_letter=1, alphabet_size=3,
               sliding=0, move=1, plot=0):
    column = dataset[filter]
    # one SAX letter per `readings_per_letter` raw readings
    s = SAX(len(column) // readings_per_letter, alphabet_size)
    # get the letter representation of the data
    if sliding:
        # get letter rep for sliding windows
        (xString, xIndices) = s.sliding_window(column, sliding, move)
    else:
        (xString, xIndices) = s.to_letter_rep(column[1:(1 + len(column))])
    # construct a column with letter representations: repeat each letter
    # readings_per_letter times so it lines up with the raw readings
    sax = []
    for i in range(0, len(xString)):
        for x in range(0, readings_per_letter):
            sax.append(int(xString[i]))
    if plot:
        # plot the original and the letter-represented values
        fig, ax1 = pyplot.subplots()
        ax1.plot(column[1:(1 + len(column))])
        ax2 = ax1.twinx()
        ax2.plot(sax, 'r.')
        pyplot.show()
    return xString
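# Illustrative call to discretize() above -- an addition, not part of the
# original code. It assumes a pandas DataFrame with an 'L_T1' column (the
# BATADAL training CSV used elsewhere in these snippets) and that the SAX
# variant in use emits digit symbols, since the function applies int() to
# each letter.
import pandas as pd

if __name__ == '__main__':
    df = pd.read_csv('BATADAL_dataset03.csv', index_col='DATETIME')
    # one letter per 4 readings, 5-symbol alphabet, overlay plot enabled
    letters = discretize(df, filter='L_T1', readings_per_letter=4,
                         alphabet_size=5, plot=1)
    print(letters)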
def dist(self, p, q):
    # distance between the subsequences self.timeseries[p[0]:p[1]] and
    # self.timeseries[q[0]:q[1]]: PAA-reduce the longer one so both have the
    # same length, z-normalize each, then take the length-normalized
    # Euclidean distance
    self.distCalls += 1
    s = SAX(wordSize=min(len(self.timeseries[p[0]:p[1]]),
                         len(self.timeseries[q[0]:q[1]])))
    start = time.time()
    if len(self.timeseries[p[0]:p[1]]) > len(self.timeseries[q[0]:q[1]]):
        normp = s.normalize(s.to_PAA(self.timeseries[p[0]:p[1]])[0])
        normq = s.normalize(self.timeseries[q[0]:q[1]])
    elif len(self.timeseries[q[0]:q[1]]) > len(self.timeseries[p[0]:p[1]]):
        normq = s.normalize(s.to_PAA(self.timeseries[q[0]:q[1]])[0])
        normp = s.normalize(self.timeseries[p[0]:p[1]])
    else:
        normp = s.normalize(self.timeseries[p[0]:p[1]])
        normq = s.normalize(self.timeseries[q[0]:q[1]])
    sqval = 0.0
    for a, b in zip(normp, normq):
        sqval += (a - b) ** 2
    sqval = math.sqrt(sqval)
    dist = sqval / float(len(normp))
    end = time.time()
    self.totalDistTime += (end - start)
    return dist
# AnomalyDetector, TimeseriesBitmap and Anomaly are provided by the
# surrounding project.


class TSBitmaps(AnomalyDetector):
    def __init__(self, lag_window=8, lead_window=4, anomaly_threshold=0.5,
                 N=1600, n=400, alphabet="abcd", bitmap_level=2):
        super(TSBitmaps, self).__init__()
        self.alphabet = alphabet
        self.lagging_tsb = TimeseriesBitmap(self.alphabet, bitmap_level, lag_window)
        self.leading_tsb = TimeseriesBitmap(self.alphabet, bitmap_level, lead_window)
        self.sax = SAX()
        self.N = N  # size of the feature window
        # self.n = n  # size of the symbol section
        self.symbols_per_word = self.N // n
        self.data_buffer = []
        self.word_buffer = []
        self.anomaly_threshold = anomaly_threshold

    def apply(self, x):
        self.data_buffer.append(x)
        if len(self.data_buffer) == self.N:
            # convert the current feature window to a SAX word, then slide
            self.word_buffer.append(self.sax.convert(self.data_buffer,
                                                     self.alphabet,
                                                     self.symbols_per_word))
            del self.data_buffer[0]
            if len(self.word_buffer) == self.leading_tsb.window_size:
                self.lagging_tsb.update(self.word_buffer[0])
                self.leading_tsb.update(self.word_buffer[-1])
                del self.word_buffer[0]
                return self.lagging_tsb.getAnomalyScore(self.leading_tsb)
        return 0

    def detect(self, x, spirit_weights=None):
        anomaly_level = self.apply(x)
        if anomaly_level > self.anomaly_threshold:
            return Anomaly(anomaly_level, spirit_weights)
        return None
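# Illustrative streaming use of the TSBitmaps detector above -- an addition,
# not part of the original code. It assumes the project's SAX, TimeseriesBitmap
# and Anomaly classes are importable; the shrunken window sizes are only there
# so this toy run produces scores quickly.
import math
import random

if __name__ == '__main__':
    detector = TSBitmaps(lag_window=4, lead_window=2, anomaly_threshold=0.3,
                         N=64, n=16)
    for t in range(4000):
        x = math.sin(t / 20.0) + random.gauss(0, 0.05)
        if t == 3000:
            x += 5.0  # injected spike
        anomaly = detector.detect(x)
        if anomaly is not None:
            print('anomaly reported at t={}'.format(t))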
import numpy as np
from scipy.spatial import distance


def HOTSAX(T, n, w, a, num_discords=1):
    # HOT SAX discord discovery. T: time series, n: discord length,
    # w: SAX word length, a: alphabet size. SAX() is assumed to map each
    # subsequence start index to its SAX word.
    ED_counter = 0
    T_sax = SAX(T, n, w, a)
    words, counts = np.unique(T_sax, return_counts=True)
    best_so_far_dist = 0
    best_so_far_loc = None
    l = len(T)
    # outer loop: visit subsequences whose SAX word is rarest first
    outer = np.concatenate(
        [np.where(T_sax == words[i])[0] for i in np.argsort(counts)])
    for p in outer:
        nearest_neighbour_dist = np.inf
        # inner loop: same-word subsequences first, then a random order
        inner = np.concatenate((np.where(T_sax == T_sax[p])[0],
                                np.random.choice(l - n, size=l - n, replace=False)))
        for q in inner:
            if np.abs(p - q) >= n:  # skip overlapping (self-match) windows
                ED_counter += 1
                dist = distance.euclidean(T[p:p + n], T[q:q + n])
                nearest_neighbour_dist = min(dist, nearest_neighbour_dist)
                if nearest_neighbour_dist < best_so_far_dist:
                    # early abandon: this subsequence cannot be the top discord
                    break
        if nearest_neighbour_dist > best_so_far_dist:
            best_so_far_dist = nearest_neighbour_dist
            best_so_far_loc = p
    return (best_so_far_dist, best_so_far_loc, ED_counter)
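# Illustrative run of HOTSAX() above on a synthetic signal -- an addition, not
# part of the original code. The n/w/a values (discord length, word length,
# alphabet size) are arbitrary choices, and the SAX() helper that maps each
# subsequence start index to its word is assumed to be defined alongside the
# function.
if __name__ == '__main__':
    t = np.linspace(0, 40 * np.pi, 4000)
    T = np.sin(t)
    T[2000:2060] += 3.0  # injected discord
    top_dist, top_loc, calls = HOTSAX(T, n=100, w=4, a=3)
    print('top discord near index {} (distance {:.3f}, {} distance calls)'
          .format(top_loc, top_dist, calls))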
def main(args):
    if len(args) < 2:
        print "Usage: python %s {filename}" % (args[0])
        exit()
    filename = args[1]

    print "Reading..."
    data = []
    with open(filename) as f:
        for line in f:
            data.append(float(line.split()[1]))
    print "Read %d lines." % (len(data))

    print "Converting to SAX..."
    sax = SAX()
    N = 1600          # size of the sliding window
    n = 400           # size of a symbol
    lag_window = 8
    lead_window = 4
    bitmap_level = 2
    alphabet = "abcd"
    chunks = slidingWindow(data, N, step=n)
    words = []
    for chunk in chunks:
        word = sax.convert(chunk, alphabet, n)
        words.append(word)

    print "Anomaly detection..."
    with open("bitmap_anomaly.txt", "w") as out:
        bitmap1 = TimeseriesBitmap(alphabet, bitmap_level, lag_window)
        bitmap2 = TimeseriesBitmap(alphabet, bitmap_level, lead_window)
        for i in xrange(len(words) - (lag_window + lead_window)):
            bitmap1.update(words[i])
            bitmap2.update(words[i + lag_window])
            if i >= lag_window:
                score = bitmap1.getAnomalyScore(bitmap2)
            else:
                score = 0.0
            for _ in xrange(n):
                out.write("%s\n" % score)
import random
import unittest

import numpy as np

# SAX comes from the module under test.


class testSax(unittest.TestCase):
    def setUp(self):
        self.sax = SAX()
        self.delta = 1.0e-10

    def testEuclideanDistance(self):
        sig1 = [i for i in xrange(100)]
        sig2 = [i + 0.5 for i in xrange(100)]
        lse = self.sax.euclidean_dist(sig1, sig2)
        assert lse == 5.0

    def testNormalizeOnRandom(self):
        orig_sig = [random.uniform(0, 1) for _ in xrange(1000)]
        sig = self.sax.normalize(orig_sig)
        # a properly Z-normalized signal should have a mean very close to 0
        # and a standard deviation very close to 1.0
        assert abs(np.mean(sig)) < self.delta
        assert abs(np.std(sig) - 1.0) < self.delta

    def testPAA(self):
        siglen = 100
        M = 10
        orig_sig = [random.uniform(0, 1) for _ in xrange(siglen)]
        paa_sig = self.sax.to_PAA(orig_sig, M)
        self.assertEquals(len(paa_sig), M)
        self.assertEquals(np.mean(orig_sig[:M]), paa_sig[0])

    def testPAAexample(self):
        orig_sig = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00,
                    5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
        M = 9
        paa_sig = self.sax.to_PAA(orig_sig, M)
        res_sig = [-0.9327168, -0.3699053, 1.383673, 1.391248, 0.6299752,
                   0.01641218, -0.05933634, -0.8387886, -1.220561]
        assert len(paa_sig) == len(res_sig)
        M = 5
        paa_sig = self.sax.to_PAA(orig_sig, M)
        res_sig2 = [-0.9379922, -0.0857173, 0.4738943, 1.444949, -0.8951336]
        assert len(paa_sig) == len(res_sig2)

    def testPAAZero(self):
        orig_sig = [0] * 20
        paa_sig = self.sax.to_PAA(orig_sig, 5)
        self.assertTrue(not any(paa_sig))
        self.assertEqual(self.sax.convert(orig_sig, "abcd"), "aaaaaaaa")
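# Standard unittest entry point for the test case above (added; not part of
# the original snippet).
if __name__ == '__main__':
    unittest.main()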
class HOTSAXDetectorBruteForce(AnomalyDetector):
    def __init__(self, N=800, symbols_per_word=4, alphabet="abcd"):
        AnomalyDetector.__init__(self)
        self.N = N
        self.symbols_per_word = symbols_per_word
        self.data_buffer = []
        self.apply_count = N
        self.sax = SAX()
        self.alphabet = alphabet

    def apply(self, x):
        self.data_buffer.append(x)
        if len(self.data_buffer) == self.N:
            word = self.sax.convert(self.data_buffer, self.alphabet,
                                    self.symbols_per_word)
            del self.data_buffer[0]
import numpy as np
import pandas as pd

# SAX, n_gram_model, n_gram_predict and utils are provided by the surrounding
# project (BATADAL attack-detection code).


def main():
    # dataset03 is the attack-free training set, dataset04 the labelled
    # validation set (ATT_FLAG marks attack intervals)
    train1_df = pd.read_csv('BATADAL_dataset03.csv', index_col='DATETIME')
    train2_df = pd.read_csv('BATADAL_dataset04.csv', index_col=0)
    # test_df = pd.read_csv('BATADAL_test_dataset.csv', index_col=0)
    train1_df.index = pd.to_datetime(train1_df.index, dayfirst=True)
    train2_df['n_gram'] = np.zeros(len(train2_df))

    # other candidate columns: 'L_T4', 'L_T7', 'S_PU10', 'S_PU11', 'F_PU10',
    # 'F_PU2', 'F_PU6', 'F_PU7', 'S_PU2'
    for col in ['L_T1', 'F_PU11', 'S_PU6']:
        window_size = 10
        word_size = 3
        alphabet_size = 3
        stride = 1
        sax = SAX(wordSize=word_size, alphabetSize=alphabet_size)

        # SAX string representations of the training and validation signals
        train_string_rep, train_window_indices = sax.sliding_window(
            train1_df[col].values, cover=window_size, stride=stride)
        train_string_rep2, train_window_indices2 = sax.sliding_window(
            train2_df[col].values, cover=window_size, stride=stride)

        threshold = 1e-6
        model = n_gram_model(train_string_rep)
        anomalies, probabilities = n_gram_predict(model, train_string_rep2,
                                                  train_window_indices2,
                                                  threshold, window_size)
        print('window: {}, word: {}, alphabet: {}, threshold: {}'.format(
            window_size, word_size, alphabet_size, threshold))

        train2_df['ATT_FLAG_anom'] = np.where(train2_df['ATT_FLAG'] == 1, 100, 0)
        train2_df['n_gram'] += probabilities

        # alternative scoring via language-model perplexity:
        # model = NgramModel(3, train_string_rep)
        # perplexity = model.perplexity(train_string_rep2)

        train2_df['n_gram'] = np.where(train2_df['n_gram'] > 0, 1, 0)
        train2_df['diff'] = train2_df['ATT_FLAG_anom'] - train2_df['n_gram']

        # diff values: 99 = true positive, -1 = false positive,
        # 0 = true negative, 100 = false negative
        arr = train2_df['diff'].value_counts()
        print(arr)
        TTD = utils.TDD_metric(train2_df, probabilities)
        TP = arr[99]
        FP = arr[-1]
        TN = arr[0]
        FN = arr[100]
        S_CM = utils.S_cm(TP, FP, TN, FN)
        accuracy = (TP + TN) / len(train2_df)
        precision = TP / (TP + FP)
        print('accuracy: {}, precision: {}'.format(accuracy, precision))
        print('TDD: {}'.format(TTD))
        print('S_cm: {}'.format(S_CM))
        print('Ranked: {}'.format(0.5 * TTD + 0.5 * S_CM))
import numpy as np
from sklearn.decomposition import PCA

# AUTOE, SVD, CP, DCT, DWT, IPLA, PAA and SAX appear to be the project's own
# dimensionality-reduction wrappers.


def reduce_dimension_function(option, X_train, new_dim):
    if option == 'pca':
        n_batches = 10
        pca = PCA(n_components=new_dim)
        pca.fit(X_train)
        X_reduced = pca.transform(X_train)
        print(np.shape(X_reduced))
        return X_reduced
    elif option == 'autoencoder':
        autoe = AUTOE()
        autoe.set_data(X_train)
        autoe.shuffle_data()
        # autoe.normalize(-1.0, 1.0)
        autoe.divide_data(0.8)
        autoe.create_autoencoder(new_dim)
        # autoe.normalize()  # best results of clustering for interval [0, 1]
        # autoe.standardize()
        autoe.train_autoencoder()
        # autoe.test_autoencoder()
        autoe.sort_activations()
        # autoe.plot_reconstruction(i + 1)
        # autoe.save_activations('caract_autoe.csv')
        return autoe.get_activations()
    elif option == 'svd':
        svd = SVD()
        svd.set_data(X_train)
        # svd.load_data('dataset.csv')
        svd.shuffle_data()
        # svd.normalize(-1.0, 1.0)
        # svd.standardize()
        svd.run_svd(new_dim)
        svd.sort_coefficients()
        # svd.save_activations('caract_' + svd.__class__.__name__.lower() + '60.csv')
        return svd.get_coefficients()
    elif option == 'cp':
        cp = CP()
        cp.set_data(X_train)
        # cp.load_data('dataset.csv')
        cp.shuffle_data()
        # cp.normalize(-1.0, 1.0)
        # cp.standardize()
        cp.execute_cp(new_dim)
        cp.sort_coefficients()
        # cp.save_activations('caract_cp.csv')
        return cp.get_coefficients()
    elif option == 'dct':
        dcost = DCT()
        dcost.set_data(X_train)
        dcost.shuffle_data()
        # dcost.normalize(-1.0, 1.0)
        dcost.execute_dct(new_dim)
        dcost.sort_coefficients()
        # dcost.save_activations('caract_dct.csv')
        return dcost.get_coefficients()
    elif option == 'dwt':
        dwt = DWT()
        dwt.set_data(X_train)
        dwt.shuffle_data()
        # dwt.normalize(-1, 1)
        # dwt.standardize()
        dwt.execute_dwt(new_dim)
        dwt.sort_coefficients()
        return dwt.get_coefficients()
    elif option == 'ipla':
        paa = IPLA()
        paa.set_data(X_train)
        # paa.load_data('dataset.csv')
        paa.shuffle_data()
        # paa.normalize()
        # paa.standardize()
        paa.execute_ipla(new_dim)
        paa.sort_coefficients()
        return paa.get_coefficients()
    elif option == 'paa':
        paa = PAA()
        paa.set_data(X_train)
        # paa.load_data('dataset.csv')
        paa.shuffle_data()
        # paa.normalize(-1, 1)
        # paa.standardize()
        paa.execute_paa(new_dim)
        paa.sort_coefficients()
        return paa.get_coefficients()
    elif option == 'sax':
        sax = SAX()
        sax.set_data(X_train)
        sax.shuffle_data()
        # sax.normalize()
        # sax.standardize()
        sax.execute_sax(new_dim)
        sax.sort_coefficients()
        return sax.get_coefficients()
    else:
        return 'Invalid option'
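# Illustrative call to reduce_dimension_function() above through its 'pca'
# branch, which only needs scikit-learn -- an addition, not part of the
# original code. The other branches require the project's own AUTOE/SVD/CP/...
# wrappers and are not exercised here.
if __name__ == '__main__':
    X_train = np.random.rand(200, 64)  # 200 series of length 64 (toy data)
    X_reduced = reduce_dimension_function('pca', X_train, new_dim=8)
    print(X_reduced.shape)  # expected: (200, 8)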
# `raw_data` (the raw text of the input file) and `window` (the subsequence
# length) are defined earlier in the original script.
data = raw_data.strip().split('\n')
data = np.array([float(x) for x in data])
size = data.shape[0]
r = 1.55
distance_cnt = 0


def distance(d1, d2):
    global distance_cnt
    distance_cnt += 1
    return np.linalg.norm(d1 - d2)


# build the SAX words for every subsequence and index them in a trie
start = timeit.default_timer()
wordSize = 5
alphabetSize = 4
mysax = SAX(wordSize, alphabetSize)
mytrie = Trie(wordSize=wordSize, alphabetSize=alphabetSize)
symbol_list = []
for i in range(0, size - window + 1):
    s, idx = mysax.to_letter_rep(data[i:window + i])
    symbol_list.append(s)
    mytrie.add(s, i)
print 'construct trie: ', timeit.default_timer() - start


def in_different_windoe(i):
    def pred(t):
        return abs(i - t) >= window
    return pred
def buildMotifs(self):
    # SAX-discretize the series with a sliding window, then run grammar
    # induction over the resulting terms and keep the extracted rules
    s = SAX(self.wordSize, self.alphabetSize)
    self.saxterms = s.sliding_window(self.timeseries, self.windowSize)
    self.grammar = Grammar()
    self.grammar.train_string(self.saxterms)
    self.myrules = self.grammar.getRules()