def test_hbos(self):
    """Fit HBOS on the training split and sanity-check its outputs.

    Verifies that there is one training score per training sample, that
    test scores and predicted labels have one entry per test sample, and
    that the test ROC AUC is above 0.5.
    """
    n_train_samples = self.X_train.shape[0]
    n_test_samples = self.X_test.shape[0]

    detector = HBOS(contamination=0.05)
    detector.fit(self.X_train)

    # One outlier score is stored for every training sample.
    assert_equal(len(detector.decision_scores), n_train_samples)

    # Scores and labels on unseen data are one-per-sample as well.
    test_scores = detector.decision_function(self.X_test)
    assert_equal(test_scores.shape[0], n_test_samples)

    test_labels = detector.predict(self.X_test)
    assert_equal(test_labels.shape[0], n_test_samples)

    # The ranking has to beat a 0.5 ROC AUC.
    assert_greater(roc_auc_score(self.y_test, test_scores), 0.5)
def detect_anomaly(df):
    """Flag anomalies in quote-asset volume with HBOS and add change columns.

    Missing values are replaced by 0 via ``df.fillna(0)``; the result of that
    call is what gets annotated and returned.

    Args:
        df: DataFrame with ``total_traded_quote_asset_volume`` and
            ``last_price`` columns.

    Returns:
        The filled DataFrame with four added columns:
        ``label_qav`` (HBOS ``predict`` output), ``score_qav`` (HBOS
        ``decision_function`` output), ``change_qav`` and ``change_price``
        (one-period percentage change of volume and price, times 100).
    """
    df = df.fillna(0)

    # HBOS expects a 2-D array, so the single column is shaped to (n, 1).
    volume = df.total_traded_quote_asset_volume.values.reshape(-1, 1)

    clf = HBOS()
    clf.fit(volume)
    df["label_qav"] = clf.predict(volume)
    df["score_qav"] = clf.decision_function(volume)

    df['change_qav'] = df.total_traded_quote_asset_volume.pct_change(periods=1) * 100
    df['change_price'] = df.last_price.pct_change(periods=1) * 100
    return df
def train():
    """Train the predictor on the data collected.

    Reads ``deviceUUID`` and ``datetime`` from the request arguments, loads
    the matching data file and the pickled awake features, builds the
    feature matrix, fits an HBOS model and pickles both the model and a
    baseline dict (``features`` and ``hboss_base``) to their own files.

    Returns:
        A JSON response. ``status`` is 1 (with a ``message``) when the data
        file has fewer than ``config.min_train_data_size`` rows; otherwise
        ``status`` is 0 and ``time`` holds the elapsed seconds.
    """
    start_time = time.time()

    device_uuid = request.args.get('deviceUUID')
    date_time = request.args.get('datetime')

    data_filename = get_data_filename(device_uuid, date_time)
    with open(data_filename, 'r') as data_file:
        rows = data_file.readlines()
    with open(config.awake_filename, 'rb') as awake_file:
        awake_features = pickle.load(awake_file)

    n_rows = len(rows)
    if n_rows < config.min_train_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough training data! %d" % n_rows
        })

    # Each row is a comma-separated triple of integers.
    raw = np.zeros((n_rows, 3))
    for index, line in enumerate(rows):
        raw[index] = [int(field) for field in line.strip().split(',')]

    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(
        norm, step=config.step_size, x_len=config.sample_size)
    baseline_features = features.get_baseline_features(temp_features)
    norm_features = features.get_calibrated_features(temp_features,
                                                     baseline_features)

    X = np.concatenate((awake_features, norm_features), axis=0)
    # The second feature column is overwritten with small positive noise.
    X[:, 1] = np.abs(np.random.normal(0, 0.01, len(X)))

    app.logger.info(
        'Training classifier using %d feature sets, each containing %d features'
        % (X.shape[0], X.shape[1]))

    clf = HBOS(contamination=0.05)
    clf.fit(X)

    model_filename = get_model_filename(device_uuid, date_time)
    with open(model_filename, 'wb') as model_file:
        pickle.dump(clf, model_file)

    # The lowest training score is stored alongside the baseline features.
    pred = clf.decision_function(X)
    baseline = {'features': baseline_features, 'hboss_base': np.min(pred)}

    baseline_filename = get_baseline_filename(device_uuid, date_time)
    with open(baseline_filename, 'wb') as baseline_file:
        pickle.dump(baseline, baseline_file)

    return jsonify({"status": 0, "time": (time.time() - start_time)})
def get_HBOS_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    '''Label outliers in the selected columns using HBOS.

    Args:
        dataframe: DataFrame holding the data. It is modified in place:
            when ``standardize`` is true the selected columns are
            overwritten with their min-max scaled values, and an
            ``'outlier'`` column is always added.
        cols: list of selected column names.
        outliers_fraction: passed to HBOS as ``contamination``
            (default 0.01).
        standardize: when true, scale the selected columns to [0, 1] with
            ``MinMaxScaler`` before fitting.

    Returns:
        None. The labelled frame is published as ``CheckOutliers.df2``
        (the same object as ``dataframe``) and the outlier/inlier counts
        are printed.
    '''
    if standardize:
        # Standardize the selected variables.
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # Stack the selected columns side by side into an (n_rows, n_cols) array.
    X = np.concatenate(
        [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

    clf = HBOS(contamination=outliers_fraction)
    clf.fit(X)

    # Category of each data point as returned by predict; the counts below
    # treat 1 as outlier and 0 as inlier.
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df2 = dataframe
    CheckOutliers.df2['outlier'] = y_pred.tolist()
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with HBOS')
class TestHBOS(unittest.TestCase):
    """Unit tests for an HBOS detector fitted on generated data."""

    def setUp(self):
        # Fixture sizes, contamination rate and minimum acceptable ROC AUC.
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        data = generate_data(n_train=self.n_train,
                             n_test=self.n_test,
                             contamination=self.contamination,
                             random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = data

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _assert_in_unit_interval(self, values):
        # Shared check for the predict_proba tests: all values lie in [0, 1].
        assert values.min() >= 0
        assert values.max() <= 1

    def test_parameters(self):
        # Every fitted attribute must exist and must not be None.
        fitted_attributes = ('decision_scores_', 'labels_', 'threshold_',
                             '_mu', '_sigma', 'hist_', 'bin_edges_')
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        n_samples = self.X_train.shape[0]
        assert_equal(len(self.clf.decision_scores_), n_samples)

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # check performance
        auc = roc_auc_score(self.y_test, scores)
        assert auc >= self.roc_floor

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_in_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_in_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_in_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # An unknown method name must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each named scoring option.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring_name in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring_name)

        # An unknown scoring name must be rejected.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    # def test_score(self):
    #     self.clf.score(self.X_test, self.y_test)
    #     self.clf.score(self.X_test, self.y_test, scoring='roc_auc_score')
    #     self.clf.score(self.X_test, self.y_test, scoring='prc_n_score')
    #     with assert_raises(NotImplementedError):
    #         self.clf.score(self.X_test, self.y_test, scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved (within a tolerance of 2 positions)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved (within a tolerance of 2 positions)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def test_model_clone(self):
        # Only checks that cloning the fitted estimator does not raise.
        clone(self.clf)

    def tearDown(self):
        pass
# Number of test samples requested from the data generator.
n_test = 500

# Generate train/test data; n_train and contamination are set earlier in
# the script (outside this excerpt).
X_train, y_train, c_train, X_test, y_test, c_test = generate_data(
    n_train=n_train, n_test=n_test, contamination=contamination)

# train a HBOS detector (default version)
clf = HBOS()
clf.fit(X_train)

# get the prediction on the training data (attributes of the fitted model)
y_train_pred = clf.y_pred
y_train_score = clf.decision_scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)
y_test_score = clf.decision_function(X_test)

# report ROC and precision@n on the training scores
print('Train ROC:{roc}, precision@n:{prn}'.format(
    roc=roc_auc_score(y_train, y_train_score),
    prn=precision_n_scores(y_train, y_train_score)))

# report ROC and precision@n on the test scores
print('Test ROC:{roc}, precision@n:{prn}'.format(
    roc=roc_auc_score(y_test, y_test_score),
    prn=precision_n_scores(y_test, y_test_score)))

#######################################################################
# Visualizations
# create the 'example_figs' directory if it does not exist
pathlib.Path('example_figs').mkdir(parents=True, exist_ok=True)

# plot the results
class TestHBOS(unittest.TestCase):
    """Unit tests for an HBOS detector fitted on generated data."""

    def setUp(self):
        # Fixture sizes, contamination rate and minimum acceptable ROC AUC.
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        data = generate_data(n_train=self.n_train,
                             n_test=self.n_test,
                             contamination=self.contamination,
                             random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = data

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _assert_in_unit_interval(self, values):
        # Shared check for the predict_proba tests: all values lie in [0, 1].
        assert_greater_equal(values.min(), 0)
        assert_less_equal(values.max(), 1)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and must not be None.
        fitted_attributes = ('decision_scores_', 'labels_', 'threshold_',
                             '_mu', '_sigma', 'hist_', 'bin_edges_')
        for attribute in fitted_attributes:
            assert_true(hasattr(self.clf, attribute) and
                        getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        n_samples = self.X_train.shape[0]
        assert_equal(len(self.clf.decision_scores_), n_samples)

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # check performance
        auc = roc_auc_score(self.y_test, scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_in_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_in_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_in_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # An unknown method name must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each named scoring option.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring_name in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring_name)

        # An unknown scoring name must be rejected.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    # def test_score(self):
    #     self.clf.score(self.X_test, self.y_test)
    #     self.clf.score(self.X_test, self.y_test, scoring='roc_auc_score')
    #     self.clf.score(self.X_test, self.y_test, scoring='prc_n_score')
    #     with assert_raises(NotImplementedError):
    #         self.clf.score(self.X_test, self.y_test, scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved (within a tolerance of 2 positions)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved (within a tolerance of 2 positions)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        pass
n_features=2, contamination=contamination, random_state=42) # train HBOS detector clf_name = 'HBOS' clf = HBOS() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred,
def detect_anomaly(df, type):
    """Score the ``close`` and ``volume`` columns of ``df`` with HBOS.

    For each of the two columns the detector is re-fitted on that column
    alone, and its ``predict`` and ``decision_function`` outputs are stored
    as ``label_<column>`` and ``score_<column>``.

    Args:
        df: DataFrame with ``close`` and ``volume`` columns; it is modified
            in place.
        type: not used by the current implementation; kept so existing
            callers keep working.

    Returns:
        The same DataFrame with ``label_close``, ``score_close``,
        ``label_volume`` and ``score_volume`` added.
    """
    # NOTE: earlier revisions also carried commented-out KNN, PCA and IForest
    # variants of the same per-column scoring; only HBOS is active.
    clf = HBOS()

    for column in ("close", "volume"):
        # The detector expects a 2-D array, so shape the column to (n, 1).
        values = df[column].values.reshape(-1, 1)
        clf.fit(values)
        df["label_" + column] = clf.predict(values)
        df["score_" + column] = clf.decision_function(values)

    return df
class TestHBOS(unittest.TestCase):
    """Unit tests for an HBOS detector fitted on generated data."""

    def setUp(self):
        # Fixture sizes, contamination rate and minimum acceptable ROC AUC.
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and must not be None.
        # assertTrue with a message reports a missing attribute as a normal
        # test failure; the previous assertRaises(AttributeError, '<message>')
        # passed a string where a callable is expected and would have errored
        # with TypeError instead.
        fitted_attributes = ('decision_scores_', 'labels_', 'threshold_',
                             '_mu', '_sigma', 'hist_', 'bin_edges_')
        for attribute in fitted_attributes:
            self.assertTrue(
                hasattr(self.clf, attribute) and
                getattr(self.clf, attribute) is not None,
                '%s is not set' % attribute)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Values returned by predict_proba must lie in [0, 1].
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown method name must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        # Only checks that the call completes without raising.
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
# Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train HBOS detector clf_name = 'HBOS' clf = HBOS() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=False)