def isolation_forest(df, df_class, df_train, df_train_class, maximize_score=F1_SCORE, n_jobs=-1, bootstrap=True,
                     n_estimators=40, train_size=0.5, random_seed=10):
    """Score ``df`` with an Isolation Forest and flag rows below a tuned threshold.

    The forest is fitted on one part of the training data; the other part is
    used to pick, out of 200 evenly spaced candidates in [0, 1], the threshold
    that maximizes ``maximize_score``. That threshold is then applied to the
    normalized decision-function values of ``df``.

    Args:
        df: Feature frame to run detection on.
        df_class: Labels for ``df``; passed through ``df_anomaly_instances``.
        df_train: Feature frame used for fitting and threshold selection.
        df_train_class: Labels for ``df_train``; passed through
            ``df_anomaly_instances`` and joined onto ``df_train``.
        maximize_score: Score type forwarded to
            ``get_max_score_index_for_score_type``.
        n_jobs: Forwarded to ``IsolationForest``.
        bootstrap: Forwarded to ``IsolationForest``.
        n_estimators: Forwarded to ``IsolationForest``.
        train_size: Forwarded to ``get_train_valid_sets``.
        random_seed: Seed for both the split and the forest.

    Returns:
        Tuple ``(scores, y_hat_results, df_common_class, info)``: the rounded
        normalized scores of ``df``, the 0/1 predictions (1 where the score is
        strictly below the selected threshold), the frame returned by
        ``df_anomaly_instances(df_class)``, and the detection metadata
        extended with the threshold grid and both threshold-score tables.
    """
    # The joined frame must expose a 'class' column (it is dropped/read below);
    # presumably df_anomaly_instances provides it -- confirm against the helper.
    train_labels = df_anomaly_instances(df_train_class)
    labelled_train = df_train.join(train_labels)
    df_common_class = df_anomaly_instances(df_class)
    fit_part, valid_part = get_train_valid_sets(labelled_train, train_size=train_size, random_seed=random_seed)

    # Every tree samples as many rows as the fitting partition contains.
    forest = IsolationForest(
        n_estimators=n_estimators,
        max_samples=fit_part.shape[0],
        bootstrap=bootstrap,
        n_jobs=n_jobs,
        random_state=random_seed,
    )
    forest.fit(fit_part.drop('class', axis=1).values)

    # Thresholds and scores are both rounded to 7 decimals before comparison.
    candidate_thresholds = np.round(np.linspace(0, 1, 200), 7)

    # Threshold selection on the held-out part of the training data.
    valid_features = valid_part.drop('class', axis=1).values
    valid_scores = np.round(min_max_normalization(forest.decision_function(valid_features)), 7)
    training_threshold_scores = get_threshold_scores(candidate_thresholds, valid_scores, valid_part['class'])
    best_index = get_max_score_index_for_score_type(training_threshold_scores, maximize_score)
    selected_threshold = candidate_thresholds[best_index]

    # Detection on the target dataset.
    scores = np.round(min_max_normalization(forest.decision_function(df.values)), 7)
    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(candidate_thresholds, scores, df_common_class['class'])

    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)
    info['thresholds'] = candidate_thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()
    return scores, y_hat_results, df_common_class, info
def cluster_gaussian_mixture(df, df_class, df_train, df_train_class, maximize_score=F1_SCORE, n_components=3,
                             n_init=3, train_size=0.5, random_seed=10):
    """Score ``df`` with a Gaussian mixture model and flag rows below a tuned threshold.

    A ``GaussianMixture`` is fitted on one part of the training data. Its
    normalized ``score_samples`` output on the other part is used to choose,
    from 200 evenly spaced candidates in [0, 1], the threshold that maximizes
    ``maximize_score``; the same threshold is then applied to ``df``.

    Args:
        df: Feature frame to run detection on.
        df_class: Labels for ``df``; passed through ``df_anomaly_instances``.
        df_train: Feature frame used for fitting and threshold selection.
        df_train_class: Labels for ``df_train``; passed through
            ``df_anomaly_instances`` and joined onto ``df_train``.
        maximize_score: Score type forwarded to
            ``get_max_score_index_for_score_type``.
        n_components: Forwarded to ``GaussianMixture``.
        n_init: Forwarded to ``GaussianMixture``.
        train_size: Forwarded to ``get_train_valid_sets``.
        random_seed: Seed for both the split and the mixture model.

    Returns:
        Tuple ``(scores, y_hat_results, df_common_class, info)``: the rounded
        normalized scores of ``df``, the 0/1 predictions (1 where the score is
        strictly below the selected threshold), the frame returned by
        ``df_anomaly_instances(df_class)``, and the detection metadata
        extended with the threshold grid and both threshold-score tables.
    """
    # The joined frame must expose a 'class' column (it is dropped/read below);
    # presumably df_anomaly_instances provides it -- confirm against the helper.
    train_labels = df_anomaly_instances(df_train_class)
    labelled_train = df_train.join(train_labels)
    df_common_class = df_anomaly_instances(df_class)
    fit_part, valid_part = get_train_valid_sets(labelled_train, train_size=train_size, random_seed=random_seed)

    mixture = GaussianMixture(n_components=n_components, n_init=n_init, random_state=random_seed)
    mixture.fit(fit_part.drop('class', axis=1).values)

    # Thresholds and scores are both rounded to 7 decimals before comparison.
    candidate_thresholds = np.round(np.linspace(0, 1, 200), 7)

    # Threshold selection on the held-out part of the training data.
    valid_features = valid_part.drop('class', axis=1).values
    valid_scores = np.round(min_max_normalization(mixture.score_samples(valid_features)), 7)
    training_threshold_scores = get_threshold_scores(candidate_thresholds, valid_scores, valid_part['class'])
    best_index = get_max_score_index_for_score_type(training_threshold_scores, maximize_score)
    selected_threshold = candidate_thresholds[best_index]

    # Detection on the target dataset.
    scores = np.round(min_max_normalization(mixture.score_samples(df.values)), 7)
    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(candidate_thresholds, scores, df_common_class['class'])

    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)
    info['thresholds'] = candidate_thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()
    return scores, y_hat_results, df_common_class, info
def lisa_geo(df, df_class, time_series_id, maximize_score=F1_SCORE):
    """Run LISA detection for one time series using geo-distance based correlations.

    There is no train/validation split here: the threshold is selected on the
    same scores and labels that are reported as detection results.

    Args:
        df: Frame of time series values; passed to ``get_df_corr_geo_distance``
            and ``df_copy_with_mean``.
        df_class: Label frame; its ``time_series_id`` column is used as the
            ground truth.
        time_series_id: Column identifying the series under inspection.
        maximize_score: Score type forwarded to
            ``get_max_score_index_for_score_type``.

    Returns:
        Tuple ``(scores, y_hat_results, info)``: the rounded normalized LISA
        scores, the 0/1 predictions (1 where the score is strictly below the
        selected threshold), and the detection metadata extended with the
        threshold grid and the threshold-score table.
    """
    geo_correlation = get_df_corr_geo_distance(df)

    # Work on a copy so the caller's label frame keeps its column names.
    labels = df_class.copy().rename(columns={time_series_id: 'class'})

    # Frame with the mean of each row appended (done by df_copy_with_mean).
    values_with_mean = df_copy_with_mean(df)

    # LISA time series
    lisa_frame = df_lisa_time_series(time_series_id, values_with_mean, geo_correlation, global_correlation=True)

    # Thresholds and scores are both rounded to 7 decimals before comparison.
    candidate_thresholds = np.round(np.linspace(0, 1, 200), 7)
    scores = np.round(min_max_normalization(lisa_frame[time_series_id].values), 7)

    threshold_scores = get_threshold_scores(candidate_thresholds, scores, labels['class'])
    best_index = get_max_score_index_for_score_type(threshold_scores, maximize_score)
    selected_threshold = candidate_thresholds[best_index]

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = labels['class'].values.astype(int)

    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)
    info['thresholds'] = candidate_thresholds.tolist()
    info['detection_threshold_scores'] = threshold_scores.tolist()
    return scores, y_hat_results, info
def histogram(df, df_class, df_train, df_train_class, maximize_score=F1_SCORE, train_size=0.5, random_seed=10):
    """Score ``df`` with a histogram model and flag rows below a tuned threshold.

    ``hist_model`` is fitted on one part of the training data with a bin count
    equal to the truncated square root of the number of fitting rows. Its
    normalized predictions on the other part are used to choose, from 200
    evenly spaced candidates in [0, 1], the threshold that maximizes
    ``maximize_score``; the same threshold is then applied to ``df``.

    Args:
        df: Feature frame to run detection on.
        df_class: Labels for ``df``; passed through ``df_anomaly_instances``.
        df_train: Feature frame used for fitting and threshold selection.
        df_train_class: Labels for ``df_train``; passed through
            ``df_anomaly_instances`` and joined onto ``df_train``.
        maximize_score: Score type forwarded to
            ``get_max_score_index_for_score_type``.
        train_size: Forwarded to ``get_train_valid_sets``.
        random_seed: Forwarded to ``get_train_valid_sets``.

    Returns:
        Tuple ``(scores, y_hat_results, df_common_class, info)``: the rounded
        normalized scores of ``df``, the 0/1 predictions (1 where the score is
        strictly below the selected threshold), the frame returned by
        ``df_anomaly_instances(df_class)``, and the detection metadata
        extended with the threshold grid and both threshold-score tables.
    """
    # The joined frame must expose a 'class' column (it is dropped/read below);
    # presumably df_anomaly_instances provides it -- confirm against the helper.
    train_labels = df_anomaly_instances(df_train_class)
    labelled_train = df_train.join(train_labels)
    df_common_class = df_anomaly_instances(df_class)
    fit_part, valid_part = get_train_valid_sets(labelled_train, train_size=train_size, random_seed=random_seed)

    # Square-root rule: sqrt(number of fitting rows), truncated to an integer.
    num_bins = np.sqrt(fit_part.shape[0]).astype(int)
    logging.debug('Number of bins %d' % num_bins)

    # Create and train the model.
    detector = hist_model(bins=num_bins)
    detector.fit(fit_part.drop('class', axis=1).values)

    # Thresholds and scores are both rounded to 7 decimals before comparison.
    candidate_thresholds = np.round(np.linspace(0, 1, 200), 7)

    # Threshold selection on the held-out part of the training data.
    valid_features = valid_part.drop('class', axis=1).values
    valid_scores = np.round(min_max_normalization(detector.predict(valid_features)), 7)
    training_threshold_scores = get_threshold_scores(candidate_thresholds, valid_scores, valid_part['class'])
    best_index = get_max_score_index_for_score_type(training_threshold_scores, maximize_score)
    selected_threshold = candidate_thresholds[best_index]

    # Detection on the target dataset.
    scores = np.round(min_max_normalization(detector.predict(df.values)), 7)
    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(candidate_thresholds, scores, df_common_class['class'])

    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)
    info['thresholds'] = candidate_thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()
    return scores, y_hat_results, df_common_class, info
def svm(df, df_class, df_train, df_train_class, maximize_score=F1_SCORE, nu=0.5, kernel='rbf', train_size=0.5,
        random_seed=10):
    """Score ``df`` with a One-Class SVM and flag rows below a tuned threshold.

    A ``OneClassSVM`` (``gamma='scale'``) is fitted on one part of the training
    data. Its normalized decision-function values on the other part are used
    to choose, from 200 evenly spaced candidates in [0, 1], the threshold that
    maximizes ``maximize_score``; the same threshold is then applied to ``df``.

    Args:
        df: Feature frame to run detection on.
        df_class: Labels for ``df``; passed through ``df_anomaly_instances``.
        df_train: Feature frame used for fitting and threshold selection.
        df_train_class: Labels for ``df_train``; passed through
            ``df_anomaly_instances`` and joined onto ``df_train``.
        maximize_score: Score type forwarded to
            ``get_max_score_index_for_score_type``.
        nu: Forwarded to ``OneClassSVM``.
        kernel: Forwarded to ``OneClassSVM``.
        train_size: Forwarded to ``get_train_valid_sets``.
        random_seed: Forwarded to ``get_train_valid_sets`` only; the SVM
            itself receives no seed.

    Returns:
        Tuple ``(scores, y_hat_results, df_common_class, info)``: the rounded
        normalized scores of ``df``, the 0/1 predictions (1 where the score is
        strictly below the selected threshold), the frame returned by
        ``df_anomaly_instances(df_class)``, and the detection metadata
        extended with the threshold grid and both threshold-score tables.
    """
    # The joined frame must expose a 'class' column (it is dropped/read below);
    # presumably df_anomaly_instances provides it -- confirm against the helper.
    train_labels = df_anomaly_instances(df_train_class)
    labelled_train = df_train.join(train_labels)
    df_common_class = df_anomaly_instances(df_class)
    fit_part, valid_part = get_train_valid_sets(labelled_train, train_size=train_size, random_seed=random_seed)

    detector = OneClassSVM(kernel=kernel, nu=nu, gamma='scale')
    detector.fit(fit_part.drop('class', axis=1).values)

    # Thresholds and scores are both rounded to 7 decimals before comparison.
    candidate_thresholds = np.round(np.linspace(0, 1, 200), 7)

    # Threshold selection on the held-out part of the training data.
    valid_features = valid_part.drop('class', axis=1).values
    valid_scores = np.round(min_max_normalization(detector.decision_function(valid_features)), 7)
    training_threshold_scores = get_threshold_scores(candidate_thresholds, valid_scores, valid_part['class'])
    best_index = get_max_score_index_for_score_type(training_threshold_scores, maximize_score)
    selected_threshold = candidate_thresholds[best_index]

    # Detection on the target dataset.
    scores = np.round(min_max_normalization(detector.decision_function(df.values)), 7)
    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(candidate_thresholds, scores, df_common_class['class'])

    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)
    info['thresholds'] = candidate_thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()
    return scores, y_hat_results, df_common_class, info
def lisa_dtw(df, df_class, time_series_id, maximize_score=F1_SCORE, window_size=10, distance_function=EUCLIDEAN):
    """Run LISA detection for one time series using DTW/Pearson based correlations.

    There is no train/validation split here: the threshold is selected on the
    same scores and labels that are reported as detection results.

    Args:
        df: Frame of time series values.
        df_class: Label frame; its ``time_series_id`` column is used as the
            ground truth.
        time_series_id: Column identifying the series under inspection.
        maximize_score: Score type forwarded to
            ``get_max_score_index_for_score_type``.
        window_size: Window size forwarded to ``dtw_pearson``; the first
            ``window_size - 1`` rows are discarded afterwards.
        distance_function: Distance function forwarded to ``dtw_pearson``.

    Returns:
        Tuple ``(scores, y_hat_results, info, df, df_class)``: the rounded
        normalized LISA scores, the 0/1 predictions (1 where the score is
        strictly below the selected threshold), the detection metadata
        extended with the threshold grid and the threshold-score table, and
        the value and label frames with the leading ``window_size - 1`` rows
        removed.
    """
    correlation = dtw_pearson(df, time_series_id, distance_function, window_size=window_size)

    # The head of the frame has no computable correlation (the window is not
    # full yet), so drop the same leading rows from every frame.
    offset = window_size - 1
    correlation = correlation.iloc[offset:]
    trimmed_df = df.iloc[offset:]
    trimmed_class = df_class.iloc[offset:]

    # Frame with the mean of each row appended (done by df_copy_with_mean).
    values_with_mean = df_copy_with_mean(trimmed_df)

    # Rename on a copy so the returned label frame keeps its column names.
    labels = trimmed_class.copy().rename(columns={time_series_id: 'class'})

    # LISA time series
    lisa_frame = df_lisa_time_series(time_series_id, values_with_mean, correlation)

    # Thresholds and scores are both rounded to 7 decimals before comparison.
    candidate_thresholds = np.round(np.linspace(0, 1, 200), 7)
    scores = np.round(min_max_normalization(lisa_frame[time_series_id].values), 7)

    threshold_scores = get_threshold_scores(candidate_thresholds, scores, labels['class'])
    best_index = get_max_score_index_for_score_type(threshold_scores, maximize_score)
    selected_threshold = candidate_thresholds[best_index]

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = labels['class'].values.astype(int)

    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)
    info['thresholds'] = candidate_thresholds.tolist()
    info['detection_threshold_scores'] = threshold_scores.tolist()
    return scores, y_hat_results, info, trimmed_df, trimmed_class
def robust_pca_huber_loss(df, df_class, df_train, df_train_class, delta=1, n_components=2, maximize_score=F1_SCORE,
                          train_size=0.5, random_seed=10):
    """Score ``df`` by Robust-PCA reconstruction error and flag rows above a tuned threshold.

    An ``MRobustPCA`` model with a Huber loss is fitted on one part of the
    training data. The other part is projected, reconstructed and scored with
    ``normalized_anomaly_scores``; those scores are used to choose, from 200
    evenly spaced candidates in [0, 1], the threshold that maximizes
    ``maximize_score``. Unlike the other detectors in this file, a row is
    flagged when its score is strictly *above* the threshold
    (``upper_boundary=True``).

    Args:
        df: Feature frame to run detection on.
        df_class: Labels for ``df``; passed through ``df_anomaly_instances``.
        df_train: Feature frame used for fitting and threshold selection.
        df_train_class: Labels for ``df_train``; passed through
            ``df_anomaly_instances`` and joined onto ``df_train``.
        delta: Forwarded to ``loss.HuberLoss``.
        n_components: Number of components forwarded to ``MRobustPCA``.
        maximize_score: Score type forwarded to
            ``get_max_score_index_for_score_type``.
        train_size: Forwarded to ``get_train_valid_sets``.
        random_seed: Forwarded to ``get_train_valid_sets``.

    Returns:
        Tuple ``(scores, y_hat_results, df_common_class, info)``: the rounded
        anomaly scores of ``df``, the 0/1 predictions, the frame returned by
        ``df_anomaly_instances(df_class)``, and the detection metadata
        extended with the threshold grid and both threshold-score tables.
    """
    df_train_common_class = df_anomaly_instances(df_train_class)
    df_train_with_common_class = df_train.join(df_train_common_class)
    df_common_class = df_anomaly_instances(df_class)

    # NOTE: for a supervised variant, a stratified sklearn train_test_split on
    # (df_train, df_train_common_class) could replace this split, with its
    # y_test taking the place of valid['class'] below.
    train, valid = get_train_valid_sets(df_train_with_common_class, train_size=train_size, random_seed=random_seed)
    X_train = train.drop('class', axis=1)
    X_test = valid.drop('class', axis=1)

    # Dimensionality reduction with Robust PCA and a Huber loss function.
    huber_loss = loss.HuberLoss(delta=delta)
    M_rpca = MRobustPCA(n_components, huber_loss)
    M_rpca.fit(X_train)

    # Project the validation rows, reconstruct them, and score the difference.
    X_test_reduced = pd.DataFrame(data=M_rpca.transform(X_test), index=X_test.index)
    X_test_reconstructed = pd.DataFrame(data=M_rpca.inverse_transform(X_test_reduced), index=X_test.index)
    y_test_scores = np.round(normalized_anomaly_scores(X_test, X_test_reconstructed), 7)

    # The original author notes the scores lie in [0, 1] after normalization,
    # hence the fixed threshold grid. Both sides are rounded to 7 decimals.
    thresholds = np.round(np.linspace(0, 1, 200), 7)
    training_threshold_scores = get_threshold_scores(thresholds, y_test_scores, valid['class'],
                                                     upper_boundary=True)
    selected_index = get_max_score_index_for_score_type(training_threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    # Same projection / reconstruction / scoring on the target dataset.
    X_df_reduced = pd.DataFrame(data=M_rpca.transform(df), index=df.index)
    X_df_reconstructed = pd.DataFrame(data=M_rpca.inverse_transform(X_df_reduced), index=df.index)
    scores = np.round(normalized_anomaly_scores(df, X_df_reconstructed), 7)

    y_hat_results = (scores > selected_threshold).astype(int)
    # Take the 'class' column only, so the ground truth is one value per row,
    # matching the other detectors and the get_threshold_scores call below
    # (the whole frame's .values would be 2-D).
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(thresholds, scores, df_common_class['class'],
                                                      upper_boundary=True)

    info = get_detection_meta(selected_threshold, y_hat_results, y_truth, upper_boundary=True)
    info['thresholds'] = thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()
    return scores, y_hat_results, df_common_class, info