def test_hbos(self):
    """Fit HBOS on the training split and sanity-check its outputs.

    Checks that there is one training score per training sample, one test
    score and one predicted label per test sample, and that the test scores
    beat a random ranking (ROC AUC above 0.5).
    """
    detector = HBOS(contamination=0.05)
    detector.fit(self.X_train)

    # Training scores are stored on the fitted detector.
    assert_equal(len(detector.decision_scores), self.X_train.shape[0])

    # Scores and labels for unseen data must line up with the test set.
    test_scores = detector.decision_function(self.X_test)
    n_test = self.X_test.shape[0]
    assert_equal(test_scores.shape[0], n_test)
    assert_equal(detector.predict(self.X_test).shape[0], n_test)

    # Minimal quality bar: better than chance.
    assert_greater(roc_auc_score(self.y_test, test_scores), 0.5)
def S2(self):
    """Run the second-stage (statistical) anomaly screen on the water data.

    Runs ``self.S1()`` first, then works only on rows whose ``S1`` flag is 0:

    1. Standardises columns 1..11 and fits four detectors (IForest, KNN,
       HBOS, PCA). A row is flagged in ``S2`` only when all four detectors
       flag it (the labels are multiplied together).
    2. For every flagged-clean row, estimates a per-feature Gaussian KDE
       density, ranks the densities per feature, and records the feature
       with the lowest rank in ``S2_names``.
    3. Writes both columns into ``self.result`` and ``self.water_data`` and
       dumps the scaler and the IForest model to ``./water_model``.
    """
    self.S1()
    water_data = self.water_data
    result = self.result

    # --- Data preprocessing and model training ---
    clean_data = water_data[water_data['S1'] == 0]
    Y = pd.DataFrame(index=clean_data.index, columns=['S2'])
    feature_frame = clean_data.iloc[:, 1:12]
    X_train = np.array(feature_frame)
    name = list(feature_frame.columns.values)  # feature names; not used below
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
    clf2 = KNN(contamination=0.05, n_neighbors=100)
    clf3 = HBOS(contamination=0.05, n_bins=10)
    clf4 = PCA(contamination=0.05)
    detectors = (clf1, clf2, clf3, clf4)
    for detector in detectors:
        detector.fit(X_train)

    # Unanimous vote: the product of 0/1 labels is 1 only if every
    # detector flagged the row.
    consensus = detectors[0].labels_
    for detector in detectors[1:]:
        consensus = consensus * detector.labels_
    Y['S2'] = consensus

    water_data = pd.concat([water_data, Y], axis=1)
    # water_data.loc[water_data['S2'].isna(),['S2']]=0  -- would mark rows
    # that were already anomalous in S1 as 0 in S2.
    result['统计异常'] = water_data['S2'].values

    # --- Find the anomalous dimension ---
    from sklearn.neighbors import KernelDensity
    clean_data = water_data[water_data['S1'] == 0]
    feature_names = [
        'temperature', 'pH', 'EC', 'ORP', 'DO', 'turbidity', 'transparency',
        'COD', 'P', 'NH3N', 'flux'
    ]
    dens = pd.DataFrame(index=clean_data.index, columns=feature_names)
    for feature in dens.columns:
        samples = clean_data[feature].values.reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(samples)
        # score_samples returns log-density; exponentiate to get density.
        dens[feature] = np.exp(kde.score_samples(samples))
    dens = dens.iloc[:, 0:11].rank()
    dens['S2_names'] = dens.idxmin(axis=1)
    water_data = pd.concat([water_data, dens['S2_names']], axis=1)
    self.water_data = water_data
    result['统计异常维度'] = water_data['S2_names'].values

    # --- Persist the models ---
    joblib.dump(scaler, "./water_model/S2_scaler")
    joblib.dump(clf1, "./water_model/S2_Iforest")
def detect_anomaly(df):
    """Score quote-asset volume with HBOS and add percentage-change columns.

    Missing values are replaced by 0 on a copy of ``df`` (``fillna`` returns
    a new frame), so the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``total_traded_quote_asset_volume`` and
        ``last_price``.

    Returns
    -------
    pandas.DataFrame
        The filled frame with four added columns: ``label_qav`` (HBOS
        label), ``score_qav`` (HBOS outlier score), ``change_qav`` and
        ``change_price`` (row-over-row percentage change).
    """
    df = df.fillna(0)
    clf = HBOS()

    # HBOS expects a 2-D (n_samples, n_features) array; use one feature.
    volume = df.total_traded_quote_asset_volume.values.reshape(-1, 1)
    clf.fit(volume)

    df["label_qav"] = clf.predict(volume)
    df["score_qav"] = clf.decision_function(volume)

    df['change_qav'] = df.total_traded_quote_asset_volume.pct_change(periods=1) * 100
    df['change_price'] = df.last_price.pct_change(periods=1) * 100
    return df
def getOutlierHBOS(dataset):
    '''
    @brief Fit an HBOS detector with default settings on the dataset and
    return the labels it assigned to the fitted instances
    @param dataset Dataset on which to run the algorithm
    @return The labels stored on the fitted detector
    (0 means inlier, 1 means outlier)
    '''
    detector = HBOS()
    detector.fit(dataset)
    return detector.labels_
def train():
    """
    Train the predictor on the data collected.

    Reads the recorded rows for the device/datetime given in the request
    arguments, extracts calibrated features, fits an HBOS model on them
    together with the stored "awake" features, and pickles both the model
    and a baseline dict. Responds with status 1 when there are fewer rows
    than ``config.min_train_data_size``, otherwise with status 0 and the
    elapsed time.
    """
    started_at = time.time()
    device_uuid = request.args.get('deviceUUID')
    date_time = request.args.get('datetime')

    with open(get_data_filename(device_uuid, date_time), 'r') as data_file:
        rows = data_file.readlines()
    with open(config.awake_filename, 'rb') as awake_file:
        awake_features = pickle.load(awake_file)

    n_rows = len(rows)
    if n_rows < config.min_train_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough training data! %d" % n_rows
        })

    # Each row is three comma-separated integers.
    raw = np.zeros((n_rows, 3))
    for row_idx, line in enumerate(rows):
        raw[row_idx] = [int(val) for val in line.strip().split(',')]

    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(norm,
                                                    step=config.step_size,
                                                    x_len=config.sample_size)
    baseline_features = features.get_baseline_features(temp_features)
    norm_features = features.get_calibrated_features(temp_features,
                                                     baseline_features)

    X = np.concatenate((awake_features, norm_features), axis=0)
    # Column 1 is overwritten with small positive random noise.
    X[:, 1] = np.abs(np.random.normal(0, 0.01, len(X)))
    app.logger.info(
        'Training classifier using %d feature sets, each containing %d features'
        % (X.shape[0], X.shape[1]))

    clf = HBOS(contamination=0.05)
    clf.fit(X)
    with open(get_model_filename(device_uuid, date_time), 'wb') as model_file:
        pickle.dump(clf, model_file)

    # Store the lowest training score next to the baseline features.
    pred = clf.decision_function(X)
    baseline = {'features': baseline_features, 'hboss_base': np.min(pred)}
    with open(get_baseline_filename(device_uuid, date_time), 'wb') as base_file:
        pickle.dump(baseline, base_file)

    return jsonify({"status": 0, "time": (time.time() - started_at)})
def extract_is_outlier(df: pd.DataFrame,
                       col: str,
                       pbar=None,
                       verbose: bool = True,
                       model=None,
                       outliers_fraction: float = 0.05,
                       replace_with=None) -> pd.DataFrame:
    """
    Create a ``<col>_isoutlier`` column flagging outliers in ``col``.

    The input frame is deep-copied; the caller's frame is never modified.
    Only finite values (not NaN, not +/-inf) are passed to the model; all
    other rows keep the flag 0. When no finite value exists the model is
    not fitted and every flag stays 0.

    :param df: the data
    :param col: the column name to analyse
    :param pbar: progress bar exposing ``set_description``; when ``None``
        messages go through ``print_c`` instead
    :param verbose: forwarded to ``print_c`` when no progress bar is given
    :param model: detector exposing ``fit`` and ``predict``; defaults to
        ``HBOS(contamination=outliers_fraction)``
    :param outliers_fraction: contamination used for the default model
    :param replace_with: if not ``None``, value written into ``col`` for
        every row flagged as an outlier
    :return: a copy of ``df`` with the extra ``<col>_isoutlier`` column
    """

    def _announce(msg):
        # Route status messages to the progress bar when one is supplied.
        if pbar is None:
            print_c(verbose, msg)
        else:
            pbar.set_description(msg)

    df = df.copy(deep=True)
    flag_col = col + '_' + 'isoutlier'

    _announce("Trying to find outliers in " + str(col))
    if model is None:
        model = HBOS(contamination=outliers_fraction)

    X = df[col].astype(np.float32)
    # isfinite is False for NaN, +inf and -inf alike.
    mask = np.isfinite(X)

    df[flag_col] = 0
    if mask.any():
        finite_values = X[mask].to_frame()
        model.fit(finite_values)
        df.loc[mask, flag_col] = model.predict(finite_values)

    if replace_with is not None:
        _announce("Replacing outliers in " + str(col) + " with " +
                  str(replace_with))
        df.loc[df[flag_col] == 1, col] = replace_with
    return df
def get_HBOS_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    '''Label the rows of ``dataframe`` as inliers/outliers with HBOS.

    Side effects (the function returns nothing):

    * when ``standardize`` is true, the selected columns of ``dataframe``
      are overwritten in place with their min-max scaled values;
    * ``CheckOutliers.df2`` is set to ``dataframe`` and gains an
      ``outlier`` column holding the HBOS labels;
    * a one-line summary of the outlier/inlier counts is printed.

    Args:
        dataframe: frame holding the data.
        cols: list of selected column names used as features.
        outliers_fraction: contamination passed to HBOS (default 0.01).
        standardize: min-max scale the selected columns to [0, 1] first.
    '''
    if standardize:
        # Scale the selected variables to a common [0, 1] range.
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # Stack the selected columns side by side into an (n_rows, n_cols) array.
    X = np.column_stack([dataframe[name].values for name in cols])

    clf = HBOS(contamination=outliers_fraction)
    clf.fit(X)

    # Category of each data point: nonzero labels are outliers.
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df2 = dataframe
    CheckOutliers.df2['outlier'] = y_pred.tolist()
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with HBOS')
def _get_outlier_labels(eigs: ndarray, tol: float) -> List[str]:
    """Label every entry of ``eigs`` as "outlier" or "inlier" using HBOS.

    HBOS is fitted on (eigenvalue, position) pairs. Flags are then pruned so
    that outliers survive only as a run touching one end of the array; if
    neither end is flagged, nothing is reported as an outlier.
    """
    detector = HBOS(tol=tol)
    positions = np.arange(0, len(eigs))
    X = np.vstack([eigs, positions]).T  # one (eig, position) row per sample
    is_outlier = np.array(detector.fit(X).labels_, dtype=bool)  # 1 -> True
    n_labels = len(is_outlier)

    # The eigenvalues are sorted, so HBOS usually flags values at one of the
    # two ends, which is the desired outcome. That is not guaranteed, so
    # flags that do not belong to an end run are cleared below.
    if is_outlier[0]:
        # presumably the index of the first unflagged entry -- verify helper
        first_clear = find_first(is_outlier, False)
        for idx in range(first_clear, n_labels):
            is_outlier[idx] = False
    if is_outlier[-1]:
        # presumably the index of the last unflagged entry -- verify helper
        last_clear = find_last(is_outlier, False)
        for idx in range(last_clear):
            is_outlier[idx] = False
    if not (is_outlier[0] or is_outlier[-1]):
        # Neither end is flagged: drop all flags to force a break later.
        is_outlier = np.zeros(is_outlier.shape, dtype=bool)

    labels = []
    for flagged in is_outlier:
        labels.append("outlier" if flagged else "inlier")
    return labels
class TestHBOS(unittest.TestCase):
    """Tests for the HBOS detector fitted on generated data."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        splits = generate_data(n_train=self.n_train,
                               n_test=self.n_test,
                               contamination=self.contamination,
                               random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _check_proba_range(self, pred_proba):
        # Probabilities must stay inside [0, 1].
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        fitted_attrs = ('decision_scores_', 'labels_', 'threshold_', '_mu',
                        '_sigma', 'hist_', 'bin_edges_')
        for attr in fitted_attrs:
            assert_true(hasattr(self.clf, attr) and
                        getattr(self.clf, attr) is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # ranking quality must clear the configured floor
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._check_proba_range(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._check_proba_range(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._check_proba_range(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # An unknown scoring name must be rejected.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    # def test_score(self):
    #     self.clf.score(self.X_test, self.y_test)
    #     self.clf.score(self.X_test, self.y_test, scoring='roc_auc_score')
    #     self.clf.score(self.X_test, self.y_test, scoring='prc_n_score')
    #     with assert_raises(NotImplementedError):
    #         self.clf.score(self.X_test, self.y_test, scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # the ordering of scores must be preserved by the ranks
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of scores must be preserved by the ranks
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
class TestHBOS(unittest.TestCase):
    """Tests for the HBOS detector fitted on generated data."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Each fitted attribute must exist and be populated. A plain
        # assertion reports a missing attribute as a test failure with a
        # readable message; the previous assertRaises(AttributeError, '<text>')
        # form tried to call the message string and errored with TypeError.
        fitted_attrs = ('decision_scores_', 'labels_', 'threshold_', '_mu',
                        '_sigma', 'hist_', 'bin_edges_')
        for attr in fitted_attrs:
            self.assertIsNotNone(getattr(self.clf, attr, None),
                                 attr + ' is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
def detect_anomaly(df, type):
    """Add HBOS outlier labels and scores for the close and volume columns.

    The same HBOS instance is re-fitted on each column in turn, each time
    as a single-feature dataset. ``df`` is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``close`` and ``volume``.
    type : str
        Currently ignored; HBOS is always used. The parameter is kept so
        existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns ``label_close``, ``score_close``,
        ``label_volume`` and ``score_volume``.
    """
    clf = HBOS()

    for column in ("close", "volume"):
        # HBOS expects a 2-D (n_samples, n_features) array.
        values = df[column].values.reshape(-1, 1)
        clf.fit(values)
        df["label_" + column] = clf.predict(values)
        df["score_" + column] = clf.decision_function(values)

    return df
def do_pyod(model, colnames, arr_baseline, arr_highlight):
    """Fit a PyOD model per column group and score the highlight window.

    Parameters
    ----------
    model : dict
        Model configuration. Reads ``n_lags`` (default 0), ``model_level``
        (default ``'dim'``) and ``type`` (default ``'hbos'``).
    colnames : list
        Column names, formatted as ``chart|dimension``.
    arr_baseline : numpy array
        Baseline data used for fitting; columns align with ``colnames``.
    arr_highlight : numpy array
        Highlight data that is scored; columns align with ``colnames``.

    Returns
    -------
    dict
        Maps each chart to a list of ``{dimension: {'score': score}}``
        entries, where ``score`` is the average of the mean outlier
        probability and the mean predicted label over the highlight rows.
    """
    # init some counters
    n_charts, n_dims, n_bad_data, fit_success, fit_default, fit_fail = init_counters(
        colnames)

    # dict to collect results into
    results = {}

    n_lags = model.get('n_lags', 0)
    model_level = model.get('model_level', 'dim')
    model_type = model.get('type', 'hbos')

    # model init
    # NOTE(review): clf is created once and carried across iterations, so a
    # fallback to the default model in one iteration is also what the next
    # iteration starts from -- confirm this is intended.
    clf = pyod_init(model_type)

    # get map of cols to loop over
    col_map = get_col_map(colnames, model_level)

    # build each model
    for colname in col_map:

        chart = colname.split('|')[0]
        dimension = colname.split('|')[1] if '|' in colname else '*'
        arr_baseline_dim = arr_baseline[:, col_map[colname]]
        arr_highlight_dim = arr_highlight[:, col_map[colname]]

        # check for bad data (no check is implemented yet)
        bad_data = False

        # skip if bad data
        if bad_data:
            n_bad_data += 1
            log.info(f'... skipping {colname} due to bad data')
            continue

        if n_lags > 0:
            arr_baseline_dim = add_lags(arr_baseline_dim, n_lags=n_lags)
            arr_highlight_dim = add_lags(arr_highlight_dim, n_lags=n_lags)

        # remove any nan rows
        arr_baseline_dim = arr_baseline_dim[~np.isnan(arr_baseline_dim).any(axis=1)]
        arr_highlight_dim = arr_highlight_dim[~np.isnan(arr_highlight_dim).any(axis=1)]

        log.debug(f'... chart = {chart}')
        log.debug(f'... dimension = {dimension}')
        log.debug(f'... arr_baseline_dim.shape = {arr_baseline_dim.shape}')
        log.debug(f'... arr_highlight_dim.shape = {arr_highlight_dim.shape}')
        log.debug(f'... arr_baseline_dim = {arr_baseline_dim}')
        log.debug(f'... arr_highlight_dim = {arr_highlight_dim}')

        # The auto encoder must be rebuilt for the actual feature count.
        # The type is normally a plain string; the list form is still
        # accepted because that is what the previous comparison matched.
        if model_type == 'auto_encoder' or model_type == ['auto_encoder']:
            clf = pyod_init(model_type, n_features=arr_baseline_dim.shape[1])

        clf, result = try_fit(clf, colname, arr_baseline_dim, PyODDefaultModel)
        fit_success += 1 if result == 'success' else 0
        fit_default += 1 if result == 'default' else 0

        # try predictions and if they fail use default model
        try:
            preds = clf.predict(arr_highlight_dim)
            probs = clf.predict_proba(arr_highlight_dim)[:, 1]
        except Exception:
            # Move this column from "success" to "default" without
            # double-counting a fit that already fell back to the default.
            if result == 'success':
                fit_success -= 1
            if result != 'default':
                fit_default += 1
            clf = PyODDefaultModel()
            clf.fit(arr_baseline_dim)
            preds = clf.predict(arr_highlight_dim)
            probs = clf.predict_proba(arr_highlight_dim)[:, 1]

        log.debug(f'... preds.shape = {preds.shape}')
        log.debug(f'... preds = {preds}')
        log.debug(f'... probs.shape = {probs.shape}')
        log.debug(f'... probs = {probs}')

        # save results
        score = (np.mean(probs) + np.mean(preds)) / 2
        results.setdefault(chart, []).append({dimension: {'score': score}})

    # log some summary stats
    log.info(
        summary_info(n_charts, n_dims, n_bad_data, fit_success, fit_fail,
                     fit_default, model_level))

    return results
# Load one plate, keep only the selected drugs and columns.
data = pd.read_csv(path, index_col=0)
data['plate'] = f
data = data[data['Metadata_broad_sample'].isin(drugs)]
data = data[data.columns.intersection(selected_cols)]

# Set the identifying metadata aside so only feature columns reach HBOS.
b = data.pop('Metadata_broad_sample')
w = data.pop('Metadata_Well')
p = data.pop('plate')

outliers_fraction = 0.01
clf = HBOS(contamination=outliers_fraction)
clf.fit(data)
y_pred = clf.predict(data)

# Build the output positionally. The metadata Series still carry the index
# of the filtered frame, so assigning them into a fresh frame would align on
# index labels and misplace (or blank out) the metadata values.
X = pd.DataFrame({
    'outlier': y_pred.tolist(),
    'Metadata_broad_sample': b.to_numpy(),
    'Metadata_Well': w.to_numpy(),
    'plate': p.to_numpy(),
})
X.to_csv('outlier_without_regress/' + f)

#target = y_pred.tolist()
#tsne = TSNE(n_components= 2, verbose=1, perplexity=40, n_iter=2000)
#tsne_results = tsne.fit_transform(data)
#fig = plt.figure()
#ax = fig.add_subplot(111, projection='3d')
#ax.scatter(tsne_results[:,0], tsne_results[:,1],tsne_results[:,2], cmap = "coolwarm", edgecolor = "None" , c = target)
def fit_hbos_transformer(input_data: pd.DataFrame):
    """Fit an HBOS detector with default settings on ``input_data``.

    Returns the fitted detector.
    """
    detector = HBOS()
    detector.fit(input_data)
    return detector
def hbos_transformer(self):
    """Fit a default HBOS detector on ``self.train_transformed_data``.

    Returns the fitted detector.
    """
    detector = HBOS()
    detector.fit(self.train_transformed_data)
    return detector
df = pd.read_csv(file)

# Encode the ground truth as 1 (anomaly) / 0 (nominal) and make sure the
# label vector is a real integer array rather than an object column.
df.loc[df['ground.truth'] == 'anomaly', 'ground.truth'] = 1
df.loc[df['ground.truth'] == 'nominal', 'ground.truth'] = 0
y = df['ground.truth'].astype(int).values.reshape(-1)

feature_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7']
df[feature_cols] = scaler.fit_transform(df[feature_cols])
x = df[feature_cols].values

hbos = HBOS(contamination=outliers_fraction)
hbos.fit(x)
y_pred = hbos.predict(x)
# A ROC curve needs continuous scores; binary labels collapse it to a
# single operating point.
y_score = hbos.decision_function(x)

fpr, tpr, threshold = roc_curve(y, y_score)  # true/false positive rates
roc_auc = auc(fpr, tpr)  # area under the ROC curve

lw = 2
ax = fig.add_subplot(3, 3, i)
# False positive rate on the x axis, true positive rate on the y axis.
plt.plot(fpr, tpr, color='darkorange', lw=lw,
         label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
from pyod.data.load_data import generate_data from pyod.utils.utility import precision_n_scores from pyod.models.hbos import HBOS if __name__ == "__main__": contamination = 0.1 # percentage of outliers n_train = 1000 n_test = 500 X_train, y_train, c_train, X_test, y_test, c_test = generate_data( n_train=n_train, n_test=n_test, contamination=contamination) # train a HBOS detector (default version) clf = HBOS() clf.fit(X_train) # get the prediction on the training data y_train_pred = clf.y_pred y_train_score = clf.decision_scores # get the prediction on the test data y_test_pred = clf.predict(X_test) y_test_score = clf.decision_function(X_test) print('Train ROC:{roc}, precision@n:{prn}'.format( roc=roc_auc_score(y_train, y_train_score), prn=precision_n_scores(y_train, y_train_score))) print('Test ROC:{roc}, precision@n:{prn}'.format( roc=roc_auc_score(y_test, y_test_score),
def generate_meta_features(X):
    """Compute the meta-features of a dataset X.

    The features are appended in a fixed order: global sample statistics,
    normality tests and higher moments, per-feature skewness/kurtosis,
    correlation/covariance, sparsity, ANOVA p-values, PCA and entropy
    summaries, followed by "landmarker" features taken from fitted HBOS,
    IForest, PCA and LODA models.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        Input array

    Returns
    -------
    meta_vec : list
        The meta-feature values.

    meta_vec_names : list of str
        The name of each entry appended to ``meta_vec``.
    """
    # outliers_fraction = np.count_nonzero(y) / len(y)
    # outliers_percentage = round(outliers_fraction * 100, ndigits=4)
    X = check_array(X)

    meta_vec = []
    meta_vec_names = []

    # on the sample level
    n_samples, n_features = X.shape[0], X.shape[1]

    meta_vec.append(n_samples)
    meta_vec.append(n_features)

    meta_vec_names.append('n_samples')
    meta_vec_names.append('n_features')

    # Global statistics computed over all entries of X at once.
    sample_mean = np.mean(X)
    sample_median = np.median(X)
    sample_var = np.var(X)
    sample_min = np.min(X)
    sample_max = np.max(X)
    sample_std = np.std(X)

    # NOTE(review): np.percentile takes q on a 0-100 scale, so these request
    # the 0.01th/0.25th/0.75th/0.99th percentiles, which does not match the
    # names q1/q25/q75/q99. Confirm before changing: anything trained on
    # these features depends on the current values.
    q1, q25, q75, q99 = np.percentile(X, [0.01, 0.25, 0.75, 0.99])
    iqr = q75 - q25

    normalized_mean = sample_mean / sample_max
    normalized_median = sample_median / sample_max
    sample_range = sample_max - sample_min
    sample_gini = gini(X)
    med_abs_dev = np.median(np.absolute(X - sample_median))
    avg_abs_dev = np.mean(np.absolute(X - sample_mean))
    quant_coeff_disp = (q75 - q25) / (q75 + q25)
    coeff_var = sample_var / sample_mean

    # Element-wise outlier masks under four rules.
    outliers_15iqr = np.logical_or(X < (q25 - 1.5 * iqr),
                                   X > (q75 + 1.5 * iqr))
    outliers_3iqr = np.logical_or(X < (q25 - 3 * iqr), X > (q75 + 3 * iqr))
    outliers_1_99 = np.logical_or(X < q1, X > q99)
    outliers_3std = np.logical_or(X < (sample_mean - 3 * sample_std),
                                  X > (sample_mean + 3 * sample_std))

    # NOTE(review): the sums count individual entries of X while len(X) is
    # the number of rows, so these ratios are not bounded by 1 -- confirm.
    percent_outliers_15iqr = np.sum(outliers_15iqr) / len(X)
    percent_outliers_3iqr = np.sum(outliers_3iqr) / len(X)
    percent_outliers_1_99 = np.sum(outliers_1_99) / len(X)
    percent_outliers_3std = np.sum(outliers_3std) / len(X)

    has_outliers_15iqr = np.any(outliers_15iqr).astype(int)
    has_outliers_3iqr = np.any(outliers_3iqr).astype(int)
    has_outliers_1_99 = np.any(outliers_1_99).astype(int)
    has_outliers_3std = np.any(outliers_3std).astype(int)

    meta_vec.extend([
        sample_mean, sample_median, sample_var, sample_min, sample_max,
        sample_std, q1, q25, q75, q99, iqr, normalized_mean,
        normalized_median, sample_range, sample_gini, med_abs_dev,
        avg_abs_dev, quant_coeff_disp, coeff_var,
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        percent_outliers_15iqr, percent_outliers_3iqr, percent_outliers_1_99,
        percent_outliers_3std, has_outliers_15iqr, has_outliers_3iqr,
        has_outliers_1_99, has_outliers_3std
    ])

    meta_vec_names.extend([
        'sample_mean', 'sample_median', 'sample_var', 'sample_min',
        'sample_max', 'sample_std', 'q1', 'q25', 'q75', 'q99', 'iqr',
        'normalized_mean', 'normalized_median', 'sample_range', 'sample_gini',
        'med_abs_dev', 'avg_abs_dev', 'quant_coeff_disp', 'coeff_var',
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        'percent_outliers_15iqr', 'percent_outliers_3iqr',
        'percent_outliers_1_99', 'percent_outliers_3std',
        'has_outliers_15iqr', 'has_outliers_3iqr', 'has_outliers_1_99',
        'has_outliers_3std'
    ])

    ###########################################################################

    # Normality tests and higher moments. Each result is summarised by
    # list_process before being appended.
    normality_k2, normality_p = normaltest(X)
    # NOTE(review): these flags are 1 when p is BELOW the threshold; check
    # that this matches what the "is_normal" names are meant to convey.
    is_normal_5 = (normality_p < 0.05).astype(int)
    is_normal_1 = (normality_p < 0.01).astype(int)

    meta_vec.extend(list_process(normality_p))
    meta_vec.extend(list_process(is_normal_5))
    meta_vec.extend(list_process(is_normal_1))

    meta_vec_names.extend(list_process_name('normality_p'))
    meta_vec_names.extend(list_process_name('is_normal_5'))
    meta_vec_names.extend(list_process_name('is_normal_1'))

    moment_5 = moment(X, moment=5)
    moment_6 = moment(X, moment=6)
    moment_7 = moment(X, moment=7)
    moment_8 = moment(X, moment=8)
    moment_9 = moment(X, moment=9)
    moment_10 = moment(X, moment=10)
    meta_vec.extend(list_process(moment_5))
    meta_vec.extend(list_process(moment_6))
    meta_vec.extend(list_process(moment_7))
    meta_vec.extend(list_process(moment_8))
    meta_vec.extend(list_process(moment_9))
    meta_vec.extend(list_process(moment_10))
    meta_vec_names.extend(list_process_name('moment_5'))
    meta_vec_names.extend(list_process_name('moment_6'))
    meta_vec_names.extend(list_process_name('moment_7'))
    meta_vec_names.extend(list_process_name('moment_8'))
    meta_vec_names.extend(list_process_name('moment_9'))
    meta_vec_names.extend(list_process_name('moment_10'))

    # note: this is for each dimension == the number of dimensions
    skewness_list = skew(X).reshape(-1, 1)
    skew_values = list_process(skewness_list)
    meta_vec.extend(skew_values)
    meta_vec_names.extend(list_process_name('skewness'))

    # note: this is for each dimension == the number of dimensions
    kurtosis_list = kurtosis(X)
    kurtosis_values = list_process(kurtosis_list)
    meta_vec.extend(kurtosis_values)
    meta_vec_names.extend(list_process_name('kurtosis'))

    # Pairwise correlation; NaN entries are replaced by 0. Only the first
    # n_features * (n_features - 1) / 2 values returned by
    # flatten_diagonally are kept.
    correlation = np.nan_to_num(pd.DataFrame(X).corr(), nan=0)
    correlation_list = flatten_diagonally(correlation)[0:int(
        (n_features * n_features - n_features) / 2)]
    correlation_values = list_process(correlation_list)
    meta_vec.extend(correlation_values)
    meta_vec_names.extend(list_process_name('correlation'))

    # Pairwise covariance, reduced the same way as the correlation above.
    covariance = np.cov(X.T)
    covariance_list = flatten_diagonally(covariance)[0:int(
        (n_features * n_features - n_features) / 2)]
    covariance_values = list_process(covariance_list)
    meta_vec.extend(covariance_values)
    meta_vec_names.extend(list_process_name('covariance'))

    # sparsity: number of distinct values per feature over the sample count
    rep_counts = []
    for i in range(n_features):
        rep_counts.append(len(np.unique(X[:, i])))
    sparsity_list = np.asarray(rep_counts) / (n_samples)
    sparsity = list_process(sparsity_list)
    meta_vec.extend(sparsity)
    meta_vec_names.extend(list_process_name('sparsity'))

    # ANOVA p value for every unordered pair of features
    p_values_list = []
    all_perm = list(itertools.combinations(list(range(n_features)), 2))
    for j in all_perm:
        p_values_list.append(f_oneway(X[:, j[0]], X[:, j[1]])[1])
    anova_p_value = list_process(np.asarray(p_values_list))
    # anova_p_value = np.mean(p_values_list)
    # anova_p_value_exceed_thresh = np.mean((np.asarray(p_values_list)<0.05).astype(int))
    meta_vec.extend(anova_p_value)
    meta_vec_names.extend(list_process_name('anova_p_value'))

    # pca
    pca_transformer = sklearn_PCA(n_components=3)
    X_transform = pca_transformer.fit_transform(X)

    # first pc
    # NOTE(review): X_transform[0, :] is the first ROW of the transformed
    # data (one sample, three components), not the first component across
    # samples (which would be [:, 0]) -- confirm which one is intended.
    pca_fpc = list_process(X_transform[0, :],
                           r_min=False,
                           r_max=False,
                           r_mean=False,
                           r_std=True,
                           r_skew=True,
                           r_kurtosis=True)
    meta_vec.extend(pca_fpc)
    meta_vec_names.extend(
        ['first_pca_std', 'first_pca_skewness', 'first_pca_kurtosis'])

    # entropy of the value counts of each feature, divided by n_samples
    entropy_list = []
    for i in range(n_features):
        counts = pd.Series(X[:, i]).value_counts()
        entropy_list.append(entropy(counts) / n_samples)
    entropy_values = list_process(entropy_list)
    meta_vec.extend(entropy_values)
    meta_vec_names.extend(list_process_name('entropy'))

    ##############################Landmarkers######################################
    # HBOS: mean/max/min of the fitted histograms along axis 0
    clf = HBOS(n_bins=10)
    clf.fit(X)
    HBOS_hists = clf.hist_
    HBOS_mean = np.mean(HBOS_hists, axis=0)
    HBOS_max = np.max(HBOS_hists, axis=0)
    HBOS_min = np.min(HBOS_hists, axis=0)
    meta_vec.extend(list_process(HBOS_mean))
    meta_vec.extend(list_process(HBOS_max))
    meta_vec.extend(list_process(HBOS_min))
    meta_vec_names.extend(list_process_name('HBOS_mean'))
    meta_vec_names.extend(list_process_name('HBOS_max'))
    meta_vec_names.extend(list_process_name('HBOS_min'))

    # IForest: structure statistics of every fitted tree
    n_estimators = 100
    clf = IForest(n_estimators=n_estimators)
    clf.fit(X)

    n_leaves = []
    n_depth = []
    fi_mean = []
    fi_max = []

    # doing this for each sub-trees
    for i in range(n_estimators):
        n_leaves.append(clf.estimators_[i].get_n_leaves())
        n_depth.append(clf.estimators_[i].get_depth())
        fi_mean.append(clf.estimators_[i].feature_importances_.mean())
        fi_max.append(clf.estimators_[i].feature_importances_.max())
        # print(clf.estimators_[i].tree_)

    meta_vec.extend(list_process(n_leaves))
    meta_vec.extend(list_process(n_depth))
    meta_vec.extend(list_process(fi_mean))
    meta_vec.extend(list_process(fi_max))

    meta_vec_names.extend(list_process_name('IForest_n_leaves'))
    meta_vec_names.extend(list_process_name('IForest_n_depth'))
    meta_vec_names.extend(list_process_name('IForest_fi_mean'))
    meta_vec_names.extend(list_process_name('IForest_fi_max'))

    # PCA: explained variance ratios and singular values of 3 components
    clf = PCA(n_components=3)
    clf.fit(X)
    meta_vec.extend(clf.explained_variance_ratio_)
    meta_vec.extend(clf.singular_values_)
    meta_vec_names.extend(
        ['pca_expl_ratio_1', 'pca_expl_ratio_2', 'pca_expl_ratio_3'])
    meta_vec_names.extend(['pca_sv_1', 'pca_sv_2', 'pca_sv_3'])

    # LODA: mean/max of the fitted histograms per bin and per random cut
    n_bins = 10
    n_random_cuts = 100

    n_hists_mean = []
    n_hists_max = []

    n_cuts_mean = []
    n_cuts_max = []

    clf = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    clf.fit(X)

    # clf.histograms_ is indexed as [cut, bin] by the two loops below.
    for i in range(n_bins):
        n_hists_mean.append(clf.histograms_[:, i].mean())
        n_hists_max.append(clf.histograms_[:, i].max())
    for i in range(n_random_cuts):
        n_cuts_mean.append(clf.histograms_[i, :].mean())
        n_cuts_max.append(clf.histograms_[i, :].max())

    meta_vec.extend(list_process(n_hists_mean))
    meta_vec.extend(list_process(n_hists_max))
    meta_vec.extend(list_process(n_cuts_mean))
    meta_vec.extend(list_process(n_cuts_max))

    meta_vec_names.extend(list_process_name('LODA_n_hists_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_hists_max'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_max'))

    return meta_vec, meta_vec_names
class TestHBOS(unittest.TestCase):
    """Tests for the HBOS detector fitted on generated data."""

    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        splits = generate_data(n_train=self.n_train,
                               n_test=self.n_test,
                               contamination=self.contamination,
                               random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        fitted_attrs = ('decision_scores_', 'labels_', 'threshold_', '_mu',
                        '_sigma', 'hist_', 'bin_edges_')
        for attr in fitted_attrs:
            assert (hasattr(self.clf, attr) and
                    getattr(self.clf, attr) is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # ranking quality must reach the configured floor
        test_auc = roc_auc_score(self.y_test, pred_scores)
        assert (test_auc >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # An unknown scoring name must be rejected.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    # def test_score(self):
    #     self.clf.score(self.X_test, self.y_test)
    #     self.clf.score(self.X_test, self.y_test, scoring='roc_auc_score')
    #     self.clf.score(self.X_test, self.y_test, scoring='prc_n_score')
    #     with assert_raises(NotImplementedError):
    #         self.clf.score(self.X_test, self.y_test, scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # the ordering of scores must be preserved by the ranks
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of scores must be preserved by the ranks
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        # Cloning the fitted estimator must not raise.
        clone(self.clf)

    def tearDown(self):
        pass
# Experiment settings.
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate a two-feature sample dataset with a fixed seed.
(X_train, y_train, X_test, y_test) = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42,
)

# Fit an HBOS detector with default settings.
clf_name = 'HBOS'
clf = HBOS()
clf.fit(X_train)

# Results stored on the fitted detector for the training data.
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# Results for the held-out test data.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# Report the evaluation on both splits.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)