import os

from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.xgbod import XGBOD


def train_model(X, Y, contamination, name, from_scratch=True):
    # `save` and `load` are project-local persistence helpers
    model_dir = './model'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    file_name = name + '.pkl'

    if from_scratch:
        if name == 'ocsvm':
            model = OCSVM(contamination=contamination)
            model.fit(X)
        elif name == 'iforest':
            model = IForest(contamination=contamination)
            model.fit(X)
        elif name == 'lof':
            model = LOF(contamination=contamination)
            model.fit(X)
        elif name == 'knn':
            model = KNN(contamination=contamination)
            model.fit(X)
        elif name == 'xgbod':
            # XGBOD is supervised and also needs the labels Y; unlike the
            # detectors above, its constructor has no contamination
            # parameter (unknown keyword arguments are forwarded to the
            # underlying XGBClassifier)
            model = XGBOD(contamination=contamination)
            model.fit(X, Y)
        save(model, model_dir, file_name)
    else:
        model = load(model_dir, file_name)
    return model
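# A minimal, hypothetical driver for train_model above, using PyOD's
# synthetic-data helper. Only the 'xgbod' branch consumes the labels y,
# and the project-local `save` helper must be importable for this to run.
from pyod.utils.data import generate_data

X, y = generate_data(n_train=200, contamination=0.1, train_only=True)
model = train_model(X, y, contamination=0.1, name='xgbod',
                    from_scratch=True)
print(model.labels_[:10])  # binary outlier labels assigned during fit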
import pickle

from pyod.models.knn import KNN
from pyod.models.sod import SOD
from pyod.models.vae import VAE
from pyod.models.xgbod import XGBOD
from pyod.utils.data import evaluate_print
from sklearn.metrics import confusion_matrix


def model_test(model_type, y_train, y_test, X_train, X_test, model_file,
               save_flag):
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    elif model_type == 'XGBOD':
        clf_name = 'XGBOD'
        # set scale_pos_weight to sum(negative instances) /
        # sum(positive instances) to counter class imbalance
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    elif model_type == 'SOD':
        # train SOD detector
        # Note that SOD is meant to work in high dimensions d > 2;
        # 2-D is used here only for visualization purposes, so higher
        # precision is expected in higher dimensions
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    elif model_type == 'VAE':
        # train VAE detector (Beta-VAE)
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30, contamination=contamination, gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)
    else:
        raise ValueError('unknown model_type: {}'.format(model_type))

    # save model if specified
    if save_flag == '1':
        with open(model_file, "wb") as f:
            pickle.dump(clf, f)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # visualize the results
    # TODO: input data has to be 2-d for visualization.
    # visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #           y_test_pred, show_figure=True, save_figure=False)
    return model_file
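# Hypothetical invocation of model_test above on synthetic data; the file
# name and the '0'/'1' save flag mirror the function's string-based
# interface.
from pyod.utils.data import generate_data
from sklearn.model_selection import train_test_split

X, y = generate_data(n_train=400, contamination=0.1, train_only=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
model_test('XGBOD', y_train, y_test, X_train, X_test,
           'xgbod_model.pkl', save_flag='0')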
def initialise_supervised_classifiers(self):
    classifiers = {}
    classifiers['Extreme Boosting Based Outlier Detector (XGBOD)'] = []
    classifiers['Nearest Non Outlier (NNO)'] = []
    for i in range(self.k_way):
        classifiers[
            'Extreme Boosting Based Outlier Detector (XGBOD)'].append(
            XGBOD(silent=False))
        classifiers['Nearest Non Outlier (NNO)'].append(NNO())
    return classifiers
import numpy as np

from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.xgbod import XGBOD


def outlier_detection(x_raw, y_raw):
    """
    Filter out all outlier points.

    :param x_raw: features in ndarray
    :param y_raw: labels in ndarray
    :return x_clean, y_clean: cleaned features and labels in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)

    # all outlier detection method candidates are listed below; note that
    # XGBOD is supervised and would also need y during fit
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised '
        'Representation Learning (XGBOD)':
            XGBOD(contamination=outliers_fraction),
    }

    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)

    # for pyod, 1 means outliers and 0 means inliers
    # (for sklearn, -1 means outliers and 1 means inliers)
    y_pred = clf.predict(x_raw)
    idx_y_pred = [i for i in range(x_raw.shape[0]) if y_pred[i] == 1]

    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert x_clean.shape[0] == y_clean.shape[0]
    return x_clean, y_clean
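# del_rowsorcolumns above is a project-local helper; a drop-in NumPy
# equivalent (an assumed reconstruction, not the original) would be:
import numpy as np

def del_rowsorcolumns(arr, idx, axis=0):
    """Drop the rows (axis=0) or columns (axis=1) listed in idx."""
    return np.delete(arr, idx, axis=axis)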
import unittest
from os import path

from numpy.testing import assert_allclose
from numpy.testing import assert_array_less
from numpy.testing import assert_equal
from numpy.testing import assert_raises
from scipy.io import loadmat
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y
# assert_greater and friends shipped with the older scikit-learn these
# tests were written against (sklearn.utils.testing, removed in 0.24)
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_greater_equal
from sklearn.utils.testing import assert_less_equal

from pyod.models.xgbod import XGBOD
from pyod.utils.data import generate_data


class TestXGBOD(unittest.TestCase):
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.75

    def test_parameters(self):
        assert (hasattr(self.clf, 'clf_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, '_scalar') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'n_detector_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'X_train_add_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_proba),
                       self.roc_floor)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train, self.y_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=4)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=4)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
class TestXGBOD(unittest.TestCase):
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.8

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'clf_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, '_scalar') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'n_detector_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'X_train_add_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_proba),
                       self.roc_floor)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train, self.y_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=4)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=4)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
def construct_xgbod():
    from pyod.models.xgbod import XGBOD
    # construct_raw_base_estimators() is defined elsewhere in the project;
    # it supplies the unsupervised detectors that XGBOD stacks on
    model = XGBOD(estimator_list=construct_raw_base_estimators(),
                  silent=False, n_jobs=24)
    return model
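# One plausible sketch of the project's construct_raw_base_estimators()
# helper (an assumption, not the original): XGBOD's estimator_list expects
# unsupervised PyOD detectors whose outlier scores become extra features
# for the underlying XGBoost classifier.
def construct_raw_base_estimators():
    from pyod.models.iforest import IForest
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF

    return [KNN(n_neighbors=5), KNN(n_neighbors=10),
            LOF(n_neighbors=20), IForest(n_estimators=100)]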
import os

from scipy.io import loadmat
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y

from pyod.models.xgbod import XGBOD
from pyod.utils.data import evaluate_print, generate_data

# Define data file and read X and y; generate some data if the source
# data is missing. (NOTE: this snippet originally started mid-try; the
# opening below is reconstructed from the identical pattern in the tests
# above.)
mat_file = 'pima.mat'
try:
    mat = loadmat(os.path.join('data', mat_file))
except TypeError:
    print('{data_file} does not exist. Use generated data'.format(
        data_file=mat_file))
    X, y = generate_data(train_only=True)  # load data
except IOError:
    print('{data_file} does not exist. Use generated data'.format(
        data_file=mat_file))
    X, y = generate_data(train_only=True)  # load data
else:
    X = mat['X']
    y = mat['y'].ravel()
    X, y = check_X_y(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=42)

# train XGBOD detector
clf_name = 'XGBOD'
clf = XGBOD(random_state=42)
clf.fit(X_train, y_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from pyod.models.xgbod import XGBOD

# train_data_process, test_data_process and BaggingClassifierPU (a
# PU-learning bagging wrapper) are project-local


def run(data_train, data_test, clf_name):
    X_train, y_train = train_data_process(data_train)
    X_test, y_true = test_data_process(data_test)

    classifiers = {
        "XGBOD": XGBOD(random_state=0),
        "KNeighborsClassifier": KNeighborsClassifier(3),
        "SVC": SVC(random_state=0),
        "GaussianProcessClassifier": GaussianProcessClassifier(1.0 * RBF(1.0)),
        "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
        "RandomForestClassifier": RandomForestClassifier(random_state=0),
        "MLPClassifier": MLPClassifier(random_state=0),
        "AdaBoostClassifier": AdaBoostClassifier(),
        "GaussianNB": GaussianNB(),
        "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
        "BaggingClassifierPU": BaggingClassifierPU(
            DecisionTreeClassifier(),
            n_estimators=1000,  # 1000 trees as usual
            max_samples=sum(y_train),  # balance positives and unlabeled in each bag
            n_jobs=-1  # use all cores
        )
    }
    clf = classifiers[clf_name]

    try:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # tally the confusion counts by hand
        TP = FN = FP = TN = 0
        for i, label in enumerate(y_true):
            if label:
                if y_pred[i]:
                    TP += 1
                else:
                    FN += 1
            else:
                if y_pred[i]:
                    FP += 1
                else:
                    TN += 1

        # probability of false alarm (false positive rate)
        if (FP + TN) == 0:
            pf = "no negative samples."
        else:
            pf = FP / (FP + TN)

        try:
            auc = roc_auc_score(y_true, y_pred)
        except ValueError as e:
            auc = str(e)

        return {
            'train samples': str(X_train.shape[0]),
            'defective train samples': str(np.sum(y_train)),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'pf': pf,
            'F-measure': f1_score(y_true, y_pred),
            'accuracy': accuracy_score(y_true, y_pred),
            'AUC': auc
        }
    except ValueError as e:
        return str(e)
def pyod_init(model, n_features=None):
    # initial model set up
    if model == 'abod':
        from pyod.models.abod import ABOD
        clf = ABOD()
    elif model == 'auto_encoder' and n_features:
        # import os
        # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        from pyod.models.auto_encoder import AutoEncoder
        clf = AutoEncoder(hidden_neurons=[n_features, n_features * 5,
                                          n_features * 5, n_features],
                          epochs=5, batch_size=64, preprocessing=False)
    elif model == 'cblof':
        from pyod.models.cblof import CBLOF
        clf = CBLOF(n_clusters=4)
    elif model == 'hbos':
        from pyod.models.hbos import HBOS
        clf = HBOS()
    elif model == 'iforest':
        from pyod.models.iforest import IForest
        clf = IForest()
    elif model == 'knn':
        from pyod.models.knn import KNN
        clf = KNN()
    elif model == 'lmdd':
        from pyod.models.lmdd import LMDD
        clf = LMDD()
    elif model == 'loci':
        from pyod.models.loci import LOCI
        clf = LOCI()
    elif model == 'loda':
        from pyod.models.loda import LODA
        clf = LODA()
    elif model == 'lof':
        from pyod.models.lof import LOF
        clf = LOF()
    elif model == 'mcd':
        from pyod.models.mcd import MCD
        clf = MCD()
    elif model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        clf = OCSVM()
    elif model == 'pca':
        from pyod.models.pca import PCA
        clf = PCA()
    elif model == 'sod':
        from pyod.models.sod import SOD
        clf = SOD()
    elif model == 'vae':
        from pyod.models.vae import VAE
        clf = VAE()
    elif model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        clf = XGBOD()
    else:
        # raise ValueError(f"unknown model {model}")
        clf = PyODDefaultModel()  # project-local fallback
    return clf
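# Example use of pyod_init above on synthetic data (a sketch; the
# PyODDefaultModel fallback is project-local, so stick to known names).
from pyod.utils.data import generate_data

X_train, _ = generate_data(n_train=300, train_only=True)
clf = pyod_init('knn')
clf.fit(X_train)
print(clf.labels_[:10])  # 0 = inlier, 1 = outlier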
    # (the opening of this dict is truncated in the source; judging from
    # the Parallel call below, it is presumably `other_models = {`)
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
    'SoGaal': SO_GAAL(),
    # 'MoGaal': MO_GAAL(),
    'VAE': VAE(encoder_neurons=[8, 4, 2]),
    'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6])
}

models = {
    'XGBOD': XGBOD(),
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'OCKRA': m_OCKRA(),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
}

# fan the scalers out across processes; runByScaler is project-local
Parallel(n_jobs=CPUS)(
    delayed(runByScaler)(root, scaler, models, start, counts,
                         other_models, CPUS_Models)
    for scaler in scalers)
rs = 10

# model constructors paired with short display names
models = [
    (IsolationForest(random_state=rs), 'ISOF'),
    (EllipticEnvelope(random_state=rs), 'EE'),
    (LMDD(dis_measure='aad', random_state=rs), 'AAD_LMDD'),
    (COPOD(), 'COPOD'),
    (FeatureBagging(combination='average', random_state=rs),
     'AVE_Bagging'),  # n_jobs
    (LMDD(dis_measure='iqr', random_state=rs), 'IQR_LMDD'),
    (KNN(method='largest'), 'Largest_KNN'),  # n_jobs
    (LODA(), 'LODA'),
    (FeatureBagging(combination='max', n_jobs=-1, random_state=rs),
     'MAX_Bagging'),
    (MCD(random_state=rs), 'MCD'),
    (XGBOD(random_state=rs), 'XGBOD'),  # n_jobs
    (GaussianMixture(random_state=rs), 'GMM'),
    (LocalOutlierFactor(novelty=True), 'LOF'),
    (KNN(method='median'), 'Median_KNN'),  # n_jobs
    (KNN(method='mean'), 'Avg_KNN'),  # n_jobs
    (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),
    (HBOS(), 'HBOS'),
    (SOD(), 'SOD'),
    (PCA(random_state=rs), 'PCA'),
    (VAE(encoder_neurons=[3, 4, 3], decoder_neurons=[3, 4, 3],
         random_state=rs), 'VAE'),
    (AutoEncoder(hidden_neurons=[3, 4, 4, 3], verbose=0, random_state=rs),
     'AE')
]

# start the timer
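# A minimal sketch (assumed; the original loop is not shown) of how a
# benchmark might consume the list above. X_train / y_train are assumed
# to exist, and only XGBOD, being supervised, receives the labels.
import time

start = time.time()
for model, short_name in models:
    if short_name == 'XGBOD':
        model.fit(X_train, y_train)  # supervised: needs ground-truth labels
    else:
        model.fit(X_train)
    print('{} fitted after {:.2f}s'.format(short_name, time.time() - start))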