def visualize_features(self, data, file_name, method='TSNE', show=False):
    """Embed the feature table in two dimensions and save a scatter plot.

    Parameters
    ----------
    data: DataFrame
        Feature table. The columns ``is_anomaly`` and ``window_label`` are
        dropped before embedding; ``is_anomaly`` (0 or 1) selects the colour
        of each point.
    file_name: str
        Base name for the image; a ``.csv`` part is stripped and
        ``_<method>-features.png`` is appended.
    method: str
        ``'TSNE'`` or ``'UMAP'``.
    show: bool
        Display the figure after it has been saved.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'TSNE'`` nor ``'UMAP'``.
    """
    # Validate before any figure is created: an unsupported method used to
    # leave `embedded` undefined and fail later with a NameError.
    if method not in ('TSNE', 'UMAP'):
        raise ValueError('Unknown method {}. '.format(method) +
                         'Only TSNE and UMAP are supported.')

    fig = plt.figure()
    tc.yellow('Visualize features using {}...'.format(method))
    features = data.drop(['is_anomaly', 'window_label'], axis=1)
    if method == 'TSNE':
        embedded = TSNE(n_components=2).fit_transform(features)
    else:
        embedded = umap.UMAP().fit_transform(features)

    # Boolean masks are positional, so rows of `embedded` are selected
    # correctly even when `data` does not carry a default 0..n-1 index.
    anomaly_mask = (data.is_anomaly == 1).values
    normal_mask = (data.is_anomaly == 0).values
    normal = plt.scatter(embedded[normal_mask, 0], embedded[normal_mask, 1],
                         c='blue', s=2)
    anomaly = plt.scatter(embedded[anomaly_mask, 0], embedded[anomaly_mask, 1],
                          c='red', s=2)
    # TODO: optionally annotate anomalous points with their window_label.

    plt.legend((normal, anomaly), ('Normal', 'Anomaly'), loc='lower right')
    plt.title('{} projection of the features\n'.format(method))
    fig.tight_layout()

    file_name = file_name.replace('.csv', '')
    file_path = '{}_{}-features.png'.format(file_name, method)
    fig.savefig(file_path)
    if show:
        plt.show()
    plt.close(fig)
    tc.green('Saved {} visualized features using {}'.format(
        method, file_path))
def visualize_labelled_series(anomaly_windows, show=True):
    """Plot the taxi test series and draw every anomaly window in red.

    The test data are all files after the first 30 returned by
    ``get_sorted_file_names()``. Each entry of ``anomaly_windows`` is a
    ``'<start>-<end>'`` string of positions in the loaded series. The image
    is written to ``<result_dir>/plot_test_taxi_files_anomalies.png`` and
    shown afterwards when ``show`` is true.
    """
    files_test = get_sorted_file_names()[30:]
    file_count = len(files_test)
    fn = '{}/plot_test_taxi_files_anomalies.png'.format(result_dir)
    data_test = load_files(files_test, file_count)

    fig, ax = plt.subplots()
    plt.plot(data_test.index, data_test.values, color='blue')
    plt.title(
        '{} files nyc taxi (custom) dataset (test data)'.format(file_count))

    # Highlight anomalies
    for window in anomaly_windows:
        first, last = window.split('-')
        first = int(first)
        # Clamp windows that run past the end of the series.
        last = min(int(last), len(data_test))
        plt.plot(data_test.index[first:last], data_test.values[first:last],
                 color='red')

    ax.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    plt.tight_layout()
    fig.savefig(fn)
    tc.green('Saved plot in {}'.format(fn))
    if show:
        plt.show()
def visualize_labelled_series(anomaly_windows, show=True):
    """Plot the ECG test series and draw every anomaly window in red.

    Reads ``<result_dir>/sel102.csv`` and plots column ``V5`` of the rows
    selected by ``iloc[:-split]``, where ``split`` is half the row count
    (rounded down). Each entry of ``anomaly_windows`` is a
    ``'<start>-<end>'`` string of positions in that series. The image is
    saved under the csv path with ``.csv`` replaced by ``.png``.
    """
    csv_path = '{}/sel102.csv'.format(result_dir)
    df = pd.read_csv(csv_path, header=0)
    split = int(len(df.index) / 2)
    # NOTE(review): with fewer than two rows `split` is 0 and the slice
    # below is empty.
    data_test = df.iloc[:-split].V5

    fig, ax = plt.subplots()
    plt.plot(data_test.index, data_test.values, color='blue')
    plt.title('ECG dataset (test data)')

    # Highlight anomalies
    for window in anomaly_windows:
        first, last = window.split('-')
        first = int(first)
        # Clamp windows that run past the end of the series.
        last = min(int(last), len(data_test))
        plt.plot(data_test.index[first:last], data_test.values[first:last],
                 color='red')

    ax.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    plt.tight_layout()
    png_path = csv_path.replace('.csv', '.png')
    fig.savefig(png_path)
    tc.green('Saved plot in {}'.format(png_path))
    if show:
        plt.show()
def generate_features(self, timeseries, anomaly_labels, window_size, file_name, method='ARMA', order=(2, 2), stride=0):
    """Process the complete timeseries.

    Create windows first and then encode each window to reduce the
    dimensionality. Every window is fitted with the selected model and the
    fitted parameters become one feature row.

    Parameters
    ----------
    timeseries: sequence
        Values to split into windows (sliced as ``timeseries[start:end]``).
    anomaly_labels: sequence
        Per-sample labels; a window is flagged as anomalous when the labels
        inside it sum to at least 1.
    window_size: int
        Nominal window length.
    file_name: str
        CSV file the features are written to.
    method: str
        ``'ARMA'`` or ``'ARIMA'``.
    order: tuple
        Model order passed through to the parameter fitter.
    stride: int
        Distance between window starts; 0 selects half the window size.

    Returns
    -------
    features: DataFrame
        List of features with anomaly labels, re-read from ``file_name``.

    Raises
    ------
    ValueError
        If ``method`` is unsupported or the timeseries yields no window.
    """
    if stride == 0:
        # Keep the step integral: a fractional step combined with dtype=int
        # makes np.arange produce unreliable start positions.
        stride = max(1, window_size // 2)

    if method == 'ARMA':
        get_parameters = self.get_arma_params
    elif method == 'ARIMA':
        get_parameters = self.get_arima_params
    else:
        raise ValueError('Unknown method {}. '.format(method) +
                         'Only ARMA and ARIMA are supported.')

    window_starts = np.arange(0, len(timeseries), step=stride, dtype=int)
    if len(window_starts) == 0:
        # Without any window there are no feature columns to label or save.
        raise ValueError('Cannot generate features from an empty timeseries.')

    features = pd.DataFrame()
    tc.yellow("Generating features...")
    for i, start in enumerate(tqdm(window_starts)):
        # NOTE(review): the slice below excludes `end`, so each window holds
        # window_size - 1 samples; the same bounds go into window_label.
        end = int(start + window_size - 1)
        window_data = timeseries[start:end]
        window_is_anomaly = min(1, sum(anomaly_labels[start:end]))
        fitted = get_parameters(window_data, order)
        if i == 0:
            # Column names are only known once the first model is fitted.
            feature_columns = np.append(fitted.data.param_names,
                                        ('is_anomaly', 'window_label'))
            features = pd.DataFrame(columns=feature_columns)
        window_label = '{}-{}'.format(start, end)
        # TODO: add fitted.sigma2
        newRow = np.append(fitted.params, (window_is_anomaly, window_label))
        features.loc[i] = newRow

    features.is_anomaly = features.is_anomaly.astype(int)
    features.to_csv(file_name, index=False)  # Save features to file
    tc.green('Saved features in {}'.format(file_name))
    return pd.read_csv(file_name)
def visualize(df, show=True):
    """Plot column ``V5`` of the record, persist the data and save the plot.

    Parameters
    ----------
    df: DataFrame
        Record data; column ``V5`` is plotted and the whole frame is handed
        to ``save_data``.
    show: bool
        Display the figure after it has been written to
        ``<result_dir>/ecg.png``.
    """
    fig = plt.figure()
    plt.title('Record sel102 from Physionet')
    df.V5.plot()
    # df.V2.plot()
    save_data(df)

    fig.tight_layout()
    file_path = '{}/ecg.png'.format(result_dir)
    # Save before showing: once an interactive window has been closed the
    # figure may already be torn down, which results in a blank image.
    fig.savefig(file_path)
    tc.green('Saved {}.'.format(file_path))
    if show:
        plt.show()
    plt.close(fig)
def create_data_plot(self, data, show):
    """Plot the generated (stitched) data and mark the anomalies in red.

    The full ``value`` column is drawn in blue; the same column with all
    rows where ``is_anomaly`` equals 0 masked out is drawn on top in red.
    The image is saved next to ``self.file_name`` with ``.csv`` replaced by
    ``.png`` and shown afterwards when ``show`` is true.
    """
    fig = plt.figure()
    plt.title(self.get_title())

    plt.plot(data['value'], color='blue')
    # Masking the normal rows leaves only the anomalous samples visible.
    anomalies_only = data.mask(data['is_anomaly'] == 0)
    plt.plot(anomalies_only['value'], color='red')

    plt.tight_layout()  # avoid overlapping plot titles
    image_file_name = self.file_name.replace('.csv', '.png')
    fig.savefig(image_file_name)
    tc.green('Saved data plot in {}'.format(image_file_name))
    if show:
        plt.show()
    plt.close()
def load_and_label_data(features_fn, threshold, scores_fn):
    """Flag rows whose anomaly score exceeds ``threshold`` and save copies.

    Both csv files are loaded, ``is_anomaly`` is set to 1 on every row
    where ``anomaly_score`` (from the scores file) is above the threshold,
    and each table is written next to its source with the suffix
    ``_labelled_<threshold>`` (dots in the threshold become underscores).

    Returns
    -------
    anomaly_windows: Series
        The ``window_label`` values of the flagged feature rows.
    """
    features = pd.read_csv(features_fn)
    anomaly_data = pd.read_csv(scores_fn)
    mask = anomaly_data.anomaly_score > threshold
    suffix = '_labelled_{}.csv'.format(str(threshold).replace('.', '_'))

    # Scores first, then features; rows at or below the threshold keep the
    # label they were loaded with.
    for table, source_fn in ((anomaly_data, scores_fn),
                             (features, features_fn)):
        table.loc[mask, 'is_anomaly'] = 1
        labelled_fn = source_fn.replace('.csv', suffix)
        table.to_csv(labelled_fn, index=False)
        tc.green('Saved file {}'.format(labelled_fn))

    # Return anomaly windows
    return features.loc[mask].window_label
def visualize(data, file_count, file_name, show=False):
    """Plot a line graph with red markers for all anomalies.

    Parameters
    ----------
    data: DataFrame
        Must provide the columns ``value`` and ``is_anomaly``.
    file_count: int
        Number of source files, used only for the title.
    file_name: str
        Path the figure is saved to.
    show: bool
        Display the figure after it has been saved.
    """
    fig = plt.figure()
    data.value.plot.line(color='blue')

    if file_count == 1:
        title = 'First file of yahoo S5 dataset'
    else:
        title = 'First {} files of yahoo S5 dataset'.format(file_count)
    plt.title(title)

    # Add anomaly markers with a single scatter call, not one per row.
    anomalies = data.loc[data.is_anomaly == 1]
    if not anomalies.empty:
        plt.scatter(anomalies.index, anomalies['value'],
                    marker='x', color='red')

    plt.tight_layout()
    fig.savefig(file_name)
    tc.green('Saved plot in {}'.format(file_name))
    if show:
        plt.show()
def visualize_and_save(data, labels, file_name, regularization_strength, show=False):
    """Plot ``data`` on a fixed 0..1 y-axis and save it as a png.

    The image path is ``file_name`` with ``.csv`` replaced by ``.png``; the
    regularization strength is shown in the title. ``labels`` is currently
    not used by the plot. The current figure is cleared at the end.
    """
    # Idea kept from the original: draw a green vertical line (zorder=-1)
    # at every index whose label is 1.
    plt.gca().set_ylim([0, 1])
    plt.plot(data)  # plotting by columns
    plt.title('regularization_strength: {}'.format(regularization_strength))

    image_file = file_name.replace('.csv', '.png')
    plt.savefig(image_file)
    tc.green('Saved image {}'.format(image_file))
    if show:
        plt.show()
    plt.clf()
def visualize(data, file_count, file_name, show=False):
    """Plot the taxi data as a blue line graph and save it.

    Parameters
    ----------
    data: DataFrame or Series
        Data to plot via its pandas ``plot`` method.
    file_count: int
        Number of source files, used only for the title.
    file_name: str
        Path the figure is saved to.
    show: bool
        Display the figure after it has been saved.
    """
    fig, ax = plt.subplots()
    # Plot on explicit axes: without `ax`, DataFrame.plot opens a figure of
    # its own, so the figure saved below would stay empty.
    data.plot(ax=ax, color='blue')
    title = '{} files of nyc taxi (custom) dataset'.format(file_count)
    ax.set_title(title)

    # TODO: Research anomalies (such as Marathon, Blizzard etc.) and add an
    # 'is_anomaly' column to the csv files. Once it exists, mark every row
    # with is_anomaly == 1 using a red 'x' at its 'value'.

    fig.tight_layout()
    fig.savefig(file_name)
    tc.green('Saved plot in {}'.format(file_name))
    if show:
        plt.show()
def visualize(data, anomalies, title):
    """Plot the generated (stitched) data containing the anomalies.

    ``data`` is drawn in blue against its positions ``0..data.size - 1``;
    every index in ``anomalies`` gets a red ``x`` marker on top. The figure
    (number 1, 12x3 inches) is saved as ``<folder>/taxi_data.png``.
    """
    fig = plt.figure(1, figsize=(12, 3))
    axis = fig.add_subplot(111)

    # Generate title to show window count and anomaly window
    axis.title.set_text(title)
    positions = np.arange(data.size)
    axis.plot(positions, data, color='blue', zorder=1)

    # Add anomaly markers above the line (higher zorder)
    for position in anomalies:
        axis.scatter(position, data[position],
                     marker='x', color='red', zorder=2)

    plt.tight_layout()  # avoid overlapping plot titles
    out_path = '{}/taxi_data.png'.format(folder)
    fig.savefig(out_path)
    tc.green('Created {}'.format(out_path))
def detect_anomalies(self, X, show=False):
    """Run every configured algorithm on ``X`` and plot the results.

    Each ``(name, algorithm)`` pair in ``self.anomaly_algorithms`` gets its
    own subplot in a single row. Predictions are passed to
    ``self.print_anomalies`` and ``self.plot_anomalies``; the combined
    figure is saved to ``self.file``.

    Parameters
    ----------
    X:
        Data the algorithms are fitted on and predict for.
    show: bool
        Display the figure after it has been saved.
    """
    algorithm_count = len(self.anomaly_algorithms)
    plt.figure(figsize=(algorithm_count * 2 + 3, 6))

    for plot_num, (name, algorithm) in enumerate(self.anomaly_algorithms,
                                                 start=1):
        tc.yellow('Detecting anomalies using {}...'.format(name))
        plt.subplot(1, algorithm_count, plot_num)
        plt.title(name, size=18)

        # Fit the data and tag outliers. Each model is fitted exactly once
        # here; a separate fit beforehand would only repeat the work.
        # NOTE(review): "Local Outlier Factor" goes through fit_predict,
        # presumably because that estimator offers no separate predict().
        if name == "Local Outlier Factor":
            y_pred = algorithm.fit_predict(X)
        else:
            y_pred = algorithm.fit(X).predict(X)

        # Print and plot
        self.print_anomalies(name, y_pred)
        self.plot_anomalies(name, X, y_pred, plt)

    plt.tight_layout()
    plt.savefig(self.file)
    tc.green('Saved anomaly plot to {}'.format(self.file))
    if show:
        plt.show()
def run(training_data, test_data, test_labels, regularization_strength, file_name, epochs=100):
    """Train an autoencoder and write per-sample anomaly scores to csv.

    The network is a stack of ReLU ``Dense`` layers with L2 activity
    regularization: the width is halved starting from half the input width
    for as long as it stays at or above 2, then grows back through the same
    widths up to the input width, followed by a plain output layer. The
    model is trained to reproduce ``training_data``, a picture of the model
    is saved next to ``file_name``, and the RMSE on the training and test
    data is printed. Finally every item yielded by iterating ``test_data``
    is scored with its own reconstruction RMSE; scores and ``test_labels``
    are saved to ``file_name`` and handed to ``visualize_and_save``.
    """
    assert training_data.shape[1] == test_data.shape[1]

    # Train autoencoder network
    encoding_dim = 2
    model = Sequential()
    data_dim = test_data.shape[1]

    def regularized_dense(units, name_template, **extra):
        # All hidden layers share activation and regularizer settings.
        return Dense(units,
                     activation='relu',
                     activity_regularizer=l2(regularization_strength),
                     name=name_template.format(units),
                     **extra)

    # Encoder widths: halve repeatedly while at least `encoding_dim` wide.
    encoder_sizes = [int(data_dim / 2)]
    width = int(encoder_sizes[0] / 2)
    while encoding_dim <= width:
        encoder_sizes.append(width)
        width = int(width / 2)

    # Input layer and first encoding layer, then the remaining encoders
    model.add(regularized_dense(encoder_sizes[0], 'encoding_{}',
                                input_dim=data_dim))
    for width in encoder_sizes[1:]:
        model.add(regularized_dense(width, 'encoding_{}'))

    # Decoder widths grow again; the last encoder width is not repeated.
    for width in sorted([data_dim] + encoder_sizes[:-1]):
        model.add(regularized_dense(width, 'decoding_{}'))

    # Output layer
    model.add(Dense(data_dim, name='output'))  # Multiple output neurons
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(training_data, training_data, verbose=1, epochs=epochs)

    # Save network structure to png
    model_image = '{}/auto_encoder_model.png'.format(
        os.path.dirname(file_name))
    plot_model(model, to_file=model_image, show_shapes=True)
    tc.green('Saved model image as {}'.format(model_image))

    train_rmse = np.sqrt(metrics.mean_squared_error(
        model.predict(training_data), training_data))
    tc.yellow("Training Normal Score (RMSE): {}".format(train_rmse))
    test_rmse = np.sqrt(metrics.mean_squared_error(
        model.predict(test_data), test_data))
    tc.yellow("Test Normal Score (RMSE): {}".format(test_rmse))

    # Predict / create anomaly scores
    scores = []
    tc.yellow('Generating anomaly scores...')
    # NOTE(review): assumes iterating test_data yields one sample per step
    # (e.g. rows of a NumPy array) — confirm against the caller.
    for feature in tqdm(test_data):
        sample = np.array([feature])
        reconstruction = model.predict(sample)
        scores.append(
            np.sqrt(metrics.mean_squared_error(reconstruction, sample)))

    # Save scores (anomaly scores)
    score_table = pd.DataFrame({'anomaly_score': scores,
                                'is_anomaly': test_labels})
    score_table.to_csv(file_name, index=False)
    tc.green('Saved file {}'.format(file_name))
    visualize_and_save(scores, test_labels, file_name,
                       regularization_strength)
def save_data(self, data):
    """Write ``data`` as csv (without the index) to ``self.file_name``."""
    pd.DataFrame(data).to_csv(self.file_name, index=False)
    tc.green('Saved data in {}.'.format(self.file_name))
def save_data(df):
    """Write ``df`` as csv (without the index) to ``<result_dir>/sel102.csv``."""
    target = '{}/sel102.csv'.format(result_dir)
    df.to_csv(target, index=False)
    tc.green('Saved data in {}.'.format(target))