def visualize_features(self, data, file_name, method='TSNE', show=False):
    """Project the feature table to 2-D and save it as a scatter plot.

    Parameters
    ----------
    data : DataFrame
        Feature table. Must contain the columns ``is_anomaly`` and
        ``window_label``; every other column is used as a feature.
    file_name : str
        Base name for the image. Any ``.csv`` in it is removed and
        ``_<method>-features.png`` is appended.
    method : str
        ``'TSNE'`` or ``'UMAP'``.
    show : bool
        Also display the figure after saving it.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'TSNE'`` nor ``'UMAP'``.
    """
    # Fail before any figure is created; previously an unsupported method
    # only surfaced later as a NameError on ``embedded``.
    if method not in ('TSNE', 'UMAP'):
        raise ValueError(
            'Unknown method {}. Only TSNE and UMAP are supported.'.format(
                method))

    fig = plt.figure()
    tc.yellow('Visualize features using {}...'.format(method))

    features = data.drop(['is_anomaly', 'window_label'], axis=1)
    if method == 'TSNE':
        embedded = TSNE(n_components=2).fit_transform(features)
    else:
        embedded = umap.UMAP().fit_transform(features)

    # Boolean masks select rows by position, so this also works when the
    # index of ``data`` is not 0..n-1 (index labels are not row positions).
    labels = data.is_anomaly.values
    normal_mask = labels == 0
    anomaly_mask = labels == 1

    normal = plt.scatter(embedded[normal_mask, 0], embedded[normal_mask, 1],
                         c='blue', s=2)
    anomaly = plt.scatter(embedded[anomaly_mask, 0],
                          embedded[anomaly_mask, 1], c='red', s=2)

    # Add time window labels to feature plot
    # for i in anomaly_mask.nonzero()[0]:
    #     wl = data.iloc[i].window_label
    #     plt.annotate(
    #         '{} ({})'.format(i, wl),
    #         (embedded[i, 0], embedded[i, 1])
    #     )

    plt.legend((normal, anomaly), ('Normal', 'Anomaly'), loc='lower right')
    plt.title('{} projection of the features\n'.format(method))
    fig.tight_layout()

    file_name = file_name.replace('.csv', '')
    file_path = '{}_{}-features.png'.format(file_name, method)
    fig.savefig(file_path)
    if show:
        plt.show()
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)

    # Arguments were previously passed in swapped order (method, file_path).
    tc.green('Saved {} visualized features using {}'.format(
        file_path, method))
def generate_features(self, timeseries, anomaly_labels, window_size,
                      file_name, method='ARMA', order=(2, 2), stride=0):
    """Process the complete timeseries.

    Create windows first and then encode each window to reduce the
    dimensionality: every window is passed to the selected fitting method
    and the fitted parameters become one feature row.

    Parameters
    ----------
    timeseries : sliceable sequence
        The values to split into windows.
    anomaly_labels : sliceable sequence
        Per-sample labels; a window is anomalous when the labels inside it
        sum to at least 1.
    window_size : int
        Number of samples per window.
    file_name : str
        CSV file the features are written to.
    method : str
        ``'ARMA'`` (uses ``self.get_arma_params``) or ``'ARIMA'``
        (uses ``self.get_arima_params``).
    order : tuple
        Passed through to the fitting method.
    stride : int
        Distance between window starts. ``0`` means half a window.

    Returns
    -------
    features: DataFrame
        List of features with anomaly labels, read back from ``file_name``.

    Raises
    ------
    ValueError
        If ``method`` is unsupported or ``timeseries`` is empty.
    """
    if method == 'ARMA':
        get_parameters = self.get_arma_params
    elif method == 'ARIMA':
        get_parameters = self.get_arima_params
    else:
        raise ValueError(
            'Unknown method {}. Only ARMA and ARIMA are supported.'.format(
                method))

    if len(timeseries) == 0:
        raise ValueError('Cannot generate features from an empty timeseries.')

    if stride == 0:
        # Integer division: a float step combined with dtype=int makes
        # np.arange produce wrong window starts for odd window sizes.
        stride = max(1, int(window_size // 2))

    window_starts = np.arange(0, len(timeseries), step=stride, dtype=int)

    tc.yellow("Generating features...")
    feature_columns = None
    rows = []
    for start in tqdm(window_starts):
        # ``end`` is the inclusive index of the last sample (it is what the
        # window label shows), so the slice has to stop one past it.
        end = int(start + window_size - 1)
        stop = end + 1
        window_data = timeseries[start:stop]
        window_is_anomaly = int(min(1, sum(anomaly_labels[start:stop])))

        fitted = get_parameters(window_data, order)
        if feature_columns is None:
            feature_columns = (list(fitted.data.param_names)
                               + ['is_anomaly', 'window_label'])

        window_label = '{}-{}'.format(start, end)
        # TODO: add fitted.sigma2
        rows.append(list(fitted.params) + [window_is_anomaly, window_label])

    # Build the frame once instead of growing it row by row with .loc.
    features = pd.DataFrame(rows, columns=feature_columns)

    features.to_csv(file_name, index=False)  # Save features to file
    tc.green('Saved features in {}'.format(file_name))
    return pd.read_csv(file_name)
def detect_anomalies(train_features, test_features, test_labels):
    """Run the encoder once per configured regularization strength.

    For every strength a result file named
    ``anomaly_scores_regularization_<strength>.csv`` (dots replaced by
    underscores) is requested from ``encoder.run`` inside ``folder``.

    NOTE(review): ``folder``, ``encoder`` and ``tc`` are not defined in this
    function; presumably they are module-level names - confirm.
    """
    # Candidate values for a full sweep:
    #   [0.0, 0.00001, 0.0001, 0.001, 0.01, 0.1]
    # That list used to be assigned and immediately overwritten, so only the
    # single value below was ever used.
    regularization_strengths = [0.0001]

    for regularization_strength in regularization_strengths:
        tc.yellow('Running with regularization_strength {}...'.format(
            regularization_strength))
        result_file_name = '{}/anomaly_scores_regularization_{}.csv'.format(
            folder, str(regularization_strength).replace('.', '_'))
        encoder.run(train_features, test_features, test_labels,
                    regularization_strength, result_file_name)
def detect_anomalies(train_features, test_features, test_labels, out_folder):
    """Sweep a fixed grid of regularization strengths through the encoder.

    Each strength is run for 100 epochs via ``encoder.run``; the target file
    for a run lives in ``out_folder`` and encodes the strength (dots replaced
    by underscores) and the epoch count in its name.
    """
    epochs = 100
    for strength in (0.0, 0.00001, 0.0001, 0.001, 0.01, 0.1):
        tc.yellow('Running with regularization_strength {}...'.format(strength))

        # Dots in the strength would read like file extensions.
        strength_tag = str(strength).replace('.', '_')
        target = '{}/anomaly_scores_regularization_{}_epochs_{}.csv'.format(
            out_folder, strength_tag, epochs)

        encoder.run(train_features, test_features, test_labels,
                    strength, target, epochs)
def detect_anomalies(self, X, show=False):
    """Run every configured detector on ``X`` and plot the results side by side.

    Iterates over ``self.anomaly_algorithms`` (pairs of display name and
    estimator), fits each one on ``X``, and hands the predictions to
    ``self.print_anomalies`` and ``self.plot_anomalies``. One subplot is drawn
    per algorithm and the figure is saved to ``self.file``.

    Parameters
    ----------
    X : array-like
        Samples passed unchanged to every estimator.
    show : bool
        Also display the figure after saving it.
    """
    n_algorithms = len(self.anomaly_algorithms)
    fig = plt.figure(figsize=(n_algorithms * 2 + 3, 6))

    for plot_num, (name, algorithm) in enumerate(self.anomaly_algorithms,
                                                 start=1):
        tc.yellow('Detecting anomalies using {}...'.format(name))
        plt.subplot(1, n_algorithms, plot_num)
        plt.title(name, size=18)

        # Fit the data and tag outliers. Each estimator is fitted exactly
        # once here; an additional bare fit(X) used to precede this and
        # doubled the training cost.
        if name == "Local Outlier Factor":
            y_pred = algorithm.fit_predict(X)
        else:
            y_pred = algorithm.fit(X).predict(X)

        # Print and plot
        self.print_anomalies(name, y_pred)
        self.plot_anomalies(name, X, y_pred, plt)

    plt.tight_layout()
    plt.savefig(self.file)
    tc.green('Saved anomaly plot to {}'.format(self.file))
    if show:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def generate_timeseries(self, show=False, seed=12345):
    """Stitch together two time series with different ARMA parameters to
    generate one timeseries which contains anomalies.

    Two series of ``self.nsample`` samples are generated. For every entry
    ``a`` of ``self.anomalies`` the rows
    ``[a * self.window_size, (a + 1) * self.window_size)`` of the normal
    series are replaced by the corresponding rows of the anomaly series.
    The result is passed to ``self.create_data_plot`` and ``self.save_data``.

    Parameters
    ----------
    show : bool
        Show generated data as plots.
    seed : int
        Seed for NumPy's global random number generator, set before the
        series are generated.

    Returns
    -------
    stitched_data: DataFrame
        Data containing anomalies, with the columns ``value`` and
        ``is_anomaly`` (0 for normal rows, 1 for injected rows).
    """
    # The seed call has to come after the docstring; placed before it, the
    # string above is an ordinary expression and __doc__ stays None.
    np.random.seed(seed)

    # Generate the two timeseries (with different ARMA parameters)
    tc.yellow('Generating normal timeseries...')
    ar, ma = self.arma_generate_params([.75, -.25], [.65, .35])
    default_series = arima.arma_generate_sample(ar, ma, self.nsample)
    default_series = pd.DataFrame(default_series, columns=['value'])
    default_series['is_anomaly'] = 0

    tc.yellow('Generating anomaly timeseries...')
    ar, ma = self.arma_generate_params([.75, -.25], [-.65, .35])
    anomaly_series = arima.arma_generate_sample(ar, ma, self.nsample)
    anomaly_series = pd.DataFrame(anomaly_series, columns=['value'])
    anomaly_series['is_anomaly'] = 1

    # Plot the two timeseries
    if show:
        self.show_raw_data(default_series, anomaly_series)

    tc.yellow(
        'Combining the two timeseries to get one time series '
        'containing anomalies...'
    )
    # Work on a copy so the normal series is not modified in place.
    stitched_data = default_series.copy()
    for anomaly in self.anomalies:
        start = anomaly * self.window_size
        end = (anomaly + 1) * self.window_size
        # Inject anomalies
        stitched_data[start:end] = anomaly_series[start:end]

    self.create_data_plot(stitched_data, show)
    self.save_data(stitched_data)
    return pd.DataFrame(stitched_data)
def _encoder_layer_sizes(data_dim, encoding_dim):
    """Return encoder widths: data_dim // 2, then halved while >= encoding_dim."""
    sizes = [data_dim // 2]
    while sizes[-1] // 2 >= encoding_dim:
        sizes.append(sizes[-1] // 2)
    return sizes


def run(training_data, test_data, test_labels, regularization_strength,
        file_name, epochs=100):
    """Train an autoencoder and write one anomaly score per test sample.

    The network is trained to reproduce ``training_data``. The anomaly score
    of a test sample is the RMSE between the sample and its reconstruction.
    Scores and labels are written to ``file_name`` and then passed to
    ``visualize_and_save``. A diagram of the network is saved as
    ``auto_encoder_model.png`` in the directory of ``file_name``.

    Parameters
    ----------
    training_data : 2-D array
        Samples used to fit the autoencoder.
    test_data : 2-D array
        Samples to score; must have as many columns as ``training_data``.
    test_labels : sequence
        Stored next to the scores in the ``is_anomaly`` column.
    regularization_strength : float
        Argument of the ``l2`` activity regularizer on every hidden layer.
    file_name : str
        CSV file the scores are written to.
    epochs : int
        Number of training epochs.

    Raises
    ------
    AssertionError
        If training and test data differ in their number of columns.
    ValueError
        If the data has fewer than 2 columns.
    """
    assert training_data.shape[1] == test_data.shape[1], (
        'training_data and test_data must have the same number of columns')

    encoding_dim = 2
    data_dim = test_data.shape[1]
    if data_dim < 2:
        # data_dim // 2 would be 0, i.e. a hidden layer without units.
        raise ValueError(
            'The autoencoder needs at least 2 feature columns, got {}.'.format(
                data_dim))

    encoder_sizes = _encoder_layer_sizes(data_dim, encoding_dim)
    # Mirror the encoder without its smallest (bottleneck) layer and widen
    # back up to the input width.
    decoder_sizes = sorted([data_dim] + encoder_sizes[:-1])

    # Train autoencoder network
    model = Sequential()
    for position, size in enumerate(encoder_sizes):
        # Only the very first layer declares the input width.
        input_spec = {'input_dim': data_dim} if position == 0 else {}
        model.add(
            Dense(size, activation='relu',
                  activity_regularizer=l2(regularization_strength),
                  name='encoding_{}'.format(size), **input_spec))
    for size in decoder_sizes:
        model.add(
            Dense(size, activation='relu',
                  activity_regularizer=l2(regularization_strength),
                  name='decoding_{}'.format(size)))
    # Output layer
    model.add(Dense(data_dim, name='output'))  # Multiple output neurons

    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(training_data, training_data, verbose=1, epochs=epochs)

    # Save network structure to png. os.path.join keeps the image next to
    # the CSV even when file_name has no directory part ('{}/...' would then
    # point at the filesystem root).
    fn = os.path.join(os.path.dirname(file_name), 'auto_encoder_model.png')
    plot_model(model, to_file=fn, show_shapes=True)
    tc.green('Saved model image as {}'.format(fn))

    train_pred = model.predict(training_data)
    score = np.sqrt(metrics.mean_squared_error(train_pred, training_data))
    tc.yellow("Training Normal Score (RMSE): {}".format(score))

    test_pred = model.predict(test_data)
    score = np.sqrt(metrics.mean_squared_error(test_pred, test_data))
    tc.yellow("Test Normal Score (RMSE): {}".format(score))

    # Predict / create anomaly scores: row-wise RMSE of the reconstruction.
    # The batch prediction above is reused instead of calling
    # model.predict once per sample.
    tc.yellow('Generating anomaly scores...')
    residuals = np.asarray(test_pred) - np.asarray(test_data)
    scores = np.sqrt(np.mean(np.square(residuals), axis=1)).tolist()

    # Save scores (anomaly scores)
    df = pd.DataFrame({'anomaly_score': scores, 'is_anomaly': test_labels})
    df.to_csv(file_name, index=False)
    tc.green('Saved file {}'.format(file_name))

    visualize_and_save(scores, test_labels, file_name,
                       regularization_strength)