def outliers_detect(self, columns, outliers_fraction=0.05):
    """Label the rows of ``self.data`` with a CBLOF detector.

    The selected columns are expanded with ``pd.get_dummies``, a CBLOF
    model is fitted on the result, and the labels predicted for those same
    rows are written to ``self.data['outlier']``.

    Parameters
    ----------
    columns : column label(s) used to index ``self.data``.
    outliers_fraction : value forwarded to CBLOF as ``contamination``.

    Returns
    -------
    None. The only result is the ``'outlier'`` column set on ``self.data``.
    """
    X = pd.get_dummies(self.data[columns])

    clf = CBLOF(contamination=outliers_fraction, check_estimator=False,
                random_state=0)
    clf.fit(X)

    # Only the predicted labels are consumed. The negated decision scores
    # and the inlier/outlier counts that used to be computed here were never
    # read, so that extra scoring pass has been dropped.
    self.data['outlier'] = clf.predict(X).tolist()
def cblof(self, X_train, contamination=None, random_state=None):
    """
    Train CBLOF model from PYOD and score the training data

    Parameters
    __________
    X_train: scaled training data
    contamination: expected proportion of anomalies in the data. When left
        as None the argument is not forwarded at all, so CBLOF uses its
        own default instead of receiving an unusable None.
    random_state: random number seed

    Returns
    ________
    Tuple (cblof_anomaly_scores, labels):
        cblof_anomaly_scores: decision_function output for X_train after
            being passed through self.min_max_scaler
        labels: predict output for X_train (outlier labels, 0 or 1)
    """
    # Build the keyword arguments conditionally: passing contamination=None
    # explicitly would override the detector's built-in default.
    detector_kwargs = {'random_state': random_state}
    if contamination is not None:
        detector_kwargs['contamination'] = contamination

    model = CBLOF(**detector_kwargs)
    model.fit(X_train)

    labels = model.predict(X_train)  # outlier labels (0 or 1)
    raw_scores = model.decision_function(X_train)  # outlier scores
    cblof_anomaly_scores = self.min_max_scaler(raw_scores)

    return cblof_anomaly_scores, labels
def get_CBOLF_scores(dataframe, cols, outliers_fraction=0.01):
    '''Label the rows of a dataframe as inliers/outliers with CBLOF.

    Takes df, a list of selected column names, outliers_fraction = 0.01 default

    The selected columns are min-max scaled to [0, 1] IN PLACE on the
    dataframe that is passed in, a CBLOF model is fitted on them, and the
    predicted labels are stored in an 'outlier' column. The same dataframe
    object is also published as CheckOutliers.df1.

    Returns: df with the 'outlier' column added (the predicted labels;
    despite the function name no raw scores are stored)
    '''
    # standardize selected variables
    minmax = MinMaxScaler(feature_range=(0, 1))
    dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # One column vector per selected column, joined side by side into the
    # 2-D array handed to the detector.
    X = np.concatenate(
        [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

    # fit
    clf = CBLOF(contamination=outliers_fraction, check_estimator=False,
                random_state=0)
    clf.fit(X)

    # prediction of a datapoint category outlier or inlier
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df1 = dataframe
    CheckOutliers.df1['outlier'] = y_pred.tolist()
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with CBLOF')

    # The docstring always promised the dataframe back; previously the
    # function fell off the end and returned None.
    return CheckOutliers.df1
n_features=2, contamination=contamination, random_state=42) # train CBLOF detector clf_name = 'CBLOF' clf = CBLOF(random_state=42) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred,
# Fit the detector (default contamination 0.1).
clf1 = CBLOF(random_state=42)
clf1.fit(X_train1)

# Derive the cut-off from the contamination parameter: rank the training
# scores from highest to lowest, keep the top `a` of them, and use the
# smallest kept score as the threshold.
dec_scores = clf1.decision_scores_
dec_scores_sorted = sorted(dec_scores, reverse=True)
a = round(len(X_train1) * clf1.contamination)
print(a)
anomalies = dec_scores_sorted[:a]
threshold = anomalies[-1]
print(threshold)

# Score the validation data.
y_valid_scores = pd.Series(clf1.decision_function(X_valid1))
# NOTE(review): allow_pickle=True unpickles whatever the file contains;
# confirm this .npy file comes from a trusted source.
valid_SrcIP = np.load('preprocessing1_valid_srcIP.npy', allow_pickle=True)

# A score strictly above the threshold marks an outlier (1.0) and is recorded
# together with its source IP; everything else is an inlier (0.0).
valid_outliers = []
y_pred_valid = []
for score in range(len(y_valid_scores)):
    current = y_valid_scores[score]
    if not current > threshold:
        y_pred_valid.append(0.0)
        continue
    reg = (valid_SrcIP[score], current)
    valid_outliers.append(reg)
    y_pred_valid.append(1.0)
class TestCBLOF(unittest.TestCase):
    """Checks of the fitted attributes and prediction APIs of CBLOF."""

    def setUp(self):
        """Create data with ``generate_data`` and fit a CBLOF detector."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _check_within_unit_interval(self, values):
        """Assert that every entry of ``values`` lies in [0, 1]."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_parameters(self):
        """Every attribute expected after ``fit`` exists and is not None."""
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'clustering_estimator_',
            'cluster_labels_',
            'cluster_sizes_',
            'cluster_centers_',
            '_clustering_threshold',
            'small_cluster_labels_',
            'large_cluster_labels_',
            '_large_cluster_centers',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        roc = roc_auc_score(self.y_test, pred_scores)
        assert (roc >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true labels."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default ``predict_proba`` output stays within [0, 1]."""
        self._check_within_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """``predict_proba(method='linear')`` stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        self._check_within_unit_interval(pred_proba)

    def test_prediction_proba_unify(self):
        """``predict_proba(method='unify')`` stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        self._check_within_unit_interval(pred_proba)

    def test_prediction_proba_parameter(self):
        """An unknown ``method`` raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        """``predict(..., return_confidence=True)`` returns two arrays."""
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        self._check_within_unit_interval(confidence)

    def test_prediction_proba_linear_confidence(self):
        """Linear ``predict_proba`` also returns a confidence array."""
        pred_proba, confidence = self.clf.predict_proba(
            self.X_test, method='linear', return_confidence=True)
        self._check_within_unit_interval(pred_proba)

        assert_equal(confidence.shape, self.y_test.shape)
        self._check_within_unit_interval(confidence)

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scorers run; an unknown one raises NotImplementedError."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scorer in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scorer)

        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks keep the ordering of the scores and stay in bounds."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks keep the score ordering and stay below 1.01."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        """``clone`` accepts the fitted detector without raising."""
        clone(self.clf)

    def tearDown(self):
        """Nothing to clean up."""
        pass
class TestLOF(unittest.TestCase):
    """Checks of the fitted attributes and prediction APIs of CBLOF.

    NOTE(review): the class is named ``TestLOF`` but the detector under test
    is CBLOF; the name is kept so existing test selectors keep working.

    Scalar checks use the ``unittest.TestCase`` assertion methods rather than
    free-standing nose-style helpers (``assert_true``, ``assert_greater``,
    ...), so the class needs nothing beyond ``unittest`` for them and reports
    the compared values on failure. Array checks still use
    ``assert_allclose`` / ``assert_array_less``.
    """

    def setUp(self):
        """Create data with ``generate_data`` and fit a CBLOF detector."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        """Assert that every entry of ``values`` lies in [0, 1]."""
        self.assertGreaterEqual(values.min(), 0)
        self.assertLessEqual(values.max(), 1)

    def test_sklearn_estimator(self):
        # TODO: sklearn examples are too small to form valid
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        """Every attribute expected after ``fit`` exists and is not None."""
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'clustering_estimator_',
            'cluster_labels_',
            'cluster_sizes_',
            'cluster_centers_',
            '_clustering_threshold',
            'small_cluster_labels_',
            'large_cluster_labels_',
            '_large_cluster_centers',
        )
        for attribute in fitted_attributes:
            self.assertTrue(hasattr(self.clf, attribute) and
                            getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        self.assertEqual(len(self.clf.decision_scores_),
                         self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and beat the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        self.assertEqual(pred_scores.shape[0], self.X_test.shape[0])

        # check performance (strictly greater than the floor)
        self.assertGreater(roc_auc_score(self.y_test, pred_scores),
                           self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true labels."""
        pred_labels = self.clf.predict(self.X_test)
        self.assertEqual(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default ``predict_proba`` output stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        self._assert_within_unit_interval(pred_proba)

    def test_prediction_proba_linear(self):
        """``predict_proba(method='linear')`` stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        self._assert_within_unit_interval(pred_proba)

    def test_prediction_proba_unify(self):
        """``predict_proba(method='unify')`` stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        self._assert_within_unit_interval(pred_proba)

    def test_prediction_proba_parameter(self):
        """An unknown ``method`` raises ValueError."""
        with self.assertRaises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        self.assertEqual(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scorers run; an unknown one raises NotImplementedError."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with self.assertRaises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks keep the ordering of the scores and stay in bounds."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks keep the score ordering and stay below 1.01."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass
# Rescale the two features with the existing scaler and preview the result.
feature_cols = ['Sales', 'Profit']
df[feature_cols] = minmax.fit_transform(df[feature_cols])
print(df[feature_cols].head())

# One column vector per feature, stacked side by side into a two-column array.
X1 = df['Sales'].values.reshape(-1, 1)
X2 = df['Profit'].values.reshape(-1, 1)
X = np.hstack((X1, X2))

outliers_fraction = 0.01
# 100 x 100 grid over the unit square.
grid_axis = np.linspace(0, 1, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)

clf = CBLOF(contamination=outliers_fraction,
            check_estimator=False,
            random_state=0)
clf.fit(X)

# Raw anomaly scores, sign-flipped.
scores_pred = -1 * clf.decision_function(X)

# Per-row category (outlier or inlier) and the size of each group.
y_pred = clf.predict(X)
n_outliers = np.count_nonzero(y_pred == 1)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)

plt.figure(figsize=(8, 8))

# df1 is another name for df (not a copy), so the new column lands on df too.
df1 = df
df1['outlier'] = y_pred.tolist()

# sales - inlier feature 1, profit - inlier feature 2
inlier_mask = df1['outlier'] == 0
inliers_sales = np.array(df1['Sales'][inlier_mask]).reshape(-1, 1)
inliers_profit = np.array(df1['Profit'][inlier_mask]).reshape(-1, 1)
def get_outliers(dataframe, cols, outliers_fraction, row_id, n, cbolf=True,
                 hbos=True, iforest=True, knn=True):
    '''Flag rows that several outlier detectors agree on.

    The selected columns are min-max scaled to [0, 1] IN PLACE on the
    dataframe that is passed in. Every enabled detector is then fitted on
    them; the rows it labels as outliers are stored on the matching
    ``Outliers.dfN`` attribute. A row counts as a common outlier when it was
    flagged at least ``n`` times across the enabled detectors.

    Side effects: ``dataframe`` gains an 'outlier' column (holding the labels
    of the last detector that ran), and ``Outliers.df1`` .. ``Outliers.df4``
    and ``Outliers.outlier_indices`` are set.

    Params:
        dataframe: dataframe to analyse (modified in place, see above)
        cols (list): numerical columns fed to the detectors
        outliers_fraction: value forwarded to every detector as
            ``contamination``
        row_id ('str'): unique row identifier on the dataframe
        n (int): Minimum number of times an observation should be flagged
            as an outlier to be considered one
        cbolf, hbos, iforest, knn (bool): switch the CBLOF, HBOS, IForest and
            KNN detectors on or off. Disabled detectors are skipped and do
            not take part in the vote.

    Returns:
        List of index labels for rows in the dataframe that are flagged as
        outliers (empty when no detector is enabled)
    '''
    # standardize selected numerical variables
    minmax = MinMaxScaler(feature_range=(0, 1))
    dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # One column vector per selected column, joined side by side into the
    # 2-D array handed to the detectors.
    X = np.concatenate(
        [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

    # (enabled?, attribute on Outliers, name used in the report, factory).
    # Factories keep construction lazy, so a disabled detector is never built.
    detectors = (
        (cbolf, 'df1', 'CBLOF',
         lambda: CBLOF(contamination=outliers_fraction,
                       check_estimator=False, random_state=0)),
        (hbos, 'df2', 'HBOS',
         lambda: HBOS(contamination=outliers_fraction)),
        (iforest, 'df3', 'IForest',
         lambda: IForest(contamination=outliers_fraction, random_state=0)),
        (knn, 'df4', 'KNN',
         lambda: KNN(contamination=outliers_fraction)),
    )

    # Only detectors that ran in THIS call contribute. Previously all four
    # Outliers.dfN attributes were merged unconditionally, which raised
    # AttributeError (or reused stale results from an earlier call) as soon
    # as one detector was switched off.
    flagged_frames = []
    for enabled, attr_name, label, make_detector in detectors:
        if not enabled:
            continue

        clf = make_detector()
        clf.fit(X)

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        # Hold results to dataframe and print findings
        dataframe['outlier'] = y_pred.tolist()
        flagged = dataframe.loc[dataframe['outlier'] == 1]
        setattr(Outliers, attr_name, flagged)
        flagged_frames.append(flagged)
        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
              f'found with {label}')

    if flagged_frames:
        # Merge dataframes
        merged_df = pd.concat(flagged_frames)
        # Count number of times an observation is identified as an outlier
        merged_df['count'] = merged_df.groupby(row_id)[row_id].transform(
            'count')
        # Keep observations flagged at least n times, one row each
        common = merged_df.loc[merged_df['count'] >= n]
        common = common.drop_duplicates(keep='last')
        # index labels to be removed on the main dataframe
        Outliers.outlier_indices = list(common.index)
    else:
        # pd.concat rejects an empty list; with no detector enabled there is
        # simply nothing to report.
        Outliers.outlier_indices = []

    print(
        f' \n{len(Outliers.outlier_indices)} outliers commonly found by all algorithms\n'
    )
    print(f'The row index labels are:\n {Outliers.outlier_indices}')
    return Outliers.outlier_indices
# Plot the 'Speed diff' column against 'Time' for data118457.
x = data118457['Time']
y = data118457['Speed diff']
plt.figure(figsize=(10, 4))
plt.plot(x, y, label='Car 118457')
plt.xlabel('Time')
plt.ylabel('Speed diff')
plt.show()

# In[15]:

# Fit CBLOF on the single 'Speed diff' feature, then score an evenly spaced
# sweep between its minimum and maximum (one point per row of df).
speed_diff = df['Speed diff']
cblof = CBLOF()
cblof.fit(speed_diff.values.reshape(-1, 1))

xx = np.linspace(speed_diff.min(), speed_diff.max(), len(df)).reshape(-1, 1)
anomaly_score = cblof.decision_function(xx)
outlier = cblof.predict(xx)

plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.ylabel('anomaly score')
plt.xlabel('Speed diff')
plt.show()

# In[16]:

# Rescale the listed columns to [0, 1] in place and preview the first rows.
scaled_cols = ['CarId', 'Speed diff', 'Heading diff', 'Position diff']
minmax = MinMaxScaler(feature_range=(0, 1))
df[scaled_cols] = minmax.fit_transform(df[scaled_cols])
df[scaled_cols].head()
# Build the train/test data used below.
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42)

# Fit a CBLOF detector with its default settings.
clf_name = 'CBLOF'
clf = CBLOF()
clf.fit(X_train)

# Training-set results are read from the fitted attributes.
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# Test-set results come from the prediction methods.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# Report on both splits, training first.
for heading, y_true, y_scores in (
        ("\nOn Training Data:", y_train, y_train_scores),
        ("\nOn Test Data:", y_test, y_test_scores)):
    print(heading)
    evaluate_print(clf_name, y_true, y_scores)

# Show (but do not save) the figure.
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)
# clf_lscp = LSCP([clf_knn, clf_pca, clf_mcd ]) # clf_ae = AutoEncoder(epochs=50) clf_mcd.fit(encodings_train) clf_pca.fit(encodings_train) clf_knn.fit(encodings_train) clf_lof.fit(encodings_train) clf_cblof.fit(encodings_train) # clf_lscp.fit(encodings_train) # clf_ae.fit(encodings_train) anomaly_scores_mcd = clf_mcd.decision_function(encodings_train) anomaly_scores_pca = clf_pca.decision_function(encodings_train) anomaly_scores_knn = clf_knn.decision_function(encodings_train) anomaly_scores_lof = clf_lof.decision_function(encodings_train) anomaly_scores_cblof = clf_cblof.decision_function(encodings_train) # anomaly_scores_lscp = clf_lscp.decision_function(encodings_train) # anomaly_scores_ae = clf_ae.predict_proba(encodings_train) # y_test_scores = [] # for x,_ in test_loader: # encodings_test = encoder(torch.Tensor(x).to(device)) # probs = clf.predict_proba(encodings_test.detach().cpu().numpy()) # y_test_scores.extend(probs[:,0]) # y_test_scores = np.array(y_test_scores) y_ind_1 = np.argwhere(y_window.reshape(-1, ) == 1) y_ind_3 = np.argwhere(y_window.reshape(-1, ) == 3) for i, anomaly_scores in enumerate([ anomaly_scores_knn, anomaly_scores_lof, anomaly_scores_cblof,