def setUp(self):
    """Build the train/test split and a fitted LSCP detector for the tests.

    Tries to load ``data/cardio.mat`` located next to this file. If loading
    raises ``TypeError`` or ``IOError``, generated data is used instead.
    """
    mat_file = 'cardio.mat'
    data_dir = path.join(path.abspath(path.dirname(__file__)), 'data')

    try:
        mat = loadmat(path.join(data_dir, mat_file))
    except (TypeError, IOError):
        # The source data could not be read: fall back to generated data.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)
    else:
        # Flatten the label column before validating X and y together.
        X, y = check_X_y(mat['X'], mat['y'].ravel())

    split = train_test_split(X, y, test_size=0.4, random_state=42)
    self.X_train, self.X_test, self.y_train, self.y_test = split

    # LSCP is fitted on the training features only.
    self.clf = LSCP(base_estimators=[LOF(), LOF()])
    self.clf.fit(self.X_train)

    # Minimum ROC AUC stored for use by the performance assertions.
    self.roc_floor = 0.6
# Example: fit an LSCP ensemble on generated data and report its scores.
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate sample data
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42,
)

# Base detectors that LSCP combines.
detectors = [KNN(), LOF(), OCSVM()]

clf_name = 'LSCP'
clf = LSCP(base_estimators=detectors)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results; label them with the model actually
# evaluated (clf_name) rather than a hard-coded, unrelated name.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
class TestLSCP(unittest.TestCase):
    """Unit tests for an LSCP detector built from two LOF base estimators."""

    def setUp(self):
        """Load (or generate) data, split it, and fit the detector."""
        mat_file = 'cardio.mat'
        data_dir = path.join(path.abspath(path.dirname(__file__)), 'data')

        try:
            mat = loadmat(path.join(data_dir, mat_file))
        except (TypeError, IOError):
            # The source data could not be read: fall back to generated data.
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)
        else:
            X, y = check_X_y(mat['X'], mat['y'].ravel())

        split = train_test_split(X, y, test_size=0.4, random_state=42)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        self.clf = LSCP(base_estimators=[LOF(), LOF()])
        self.clf.fit(self.X_train)
        # Minimum ROC AUC accepted by test_prediction_scores.
        self.roc_floor = 0.6

    def _check_proba_range(self, pred_proba):
        """Assert that every predicted probability lies within [0, 1]."""
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_parameters(self):
        """Fitted attributes must exist and must not be None."""
        fitted_attrs = ('decision_scores_', 'labels_', 'threshold_',
                        '_mu', '_sigma')
        for attr in fitted_attrs:
            assert getattr(self.clf, attr, None) is not None

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_samples = self.X_train.shape[0]
        assert_equal(len(self.clf.decision_scores_), n_samples)

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        auc = roc_auc_score(self.y_test, pred_scores)
        assert (auc >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true labels."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probability output stays within [0, 1]."""
        self._check_proba_range(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """The 'linear' probability output stays within [0, 1]."""
        self._check_proba_range(
            self.clf.predict_proba(self.X_test, proba_method='linear'))

    def test_prediction_proba_unify(self):
        """The 'unify' probability output stays within [0, 1]."""
        self._check_proba_range(
            self.clf.predict_proba(self.X_test, proba_method='unify'))

    def test_prediction_proba_parameter(self):
        """An unknown proba_method must raise ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, proba_method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def tearDown(self):
        """Nothing to clean up."""
        pass
def fit(self, X):
    """Fit individual detectors and combine their outlier scores.

    The input is Z-score scaled, five base detectors with default
    parameters are fitted on it, and their scores are merged with every
    strategy listed in ``self.scores_comb``. Results are stored in:

    - ``scores_base_`` / ``labels_base_``: one column per base detector.
    - ``scores_`` / ``labels_``: one column per entry of
      ``self.scores_comb``.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The RD profile of all segments generated after preprocessing.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    X = check_array(X)

    # normalization of all segments with Z-score
    scale_X = scale(X)
    n_samples = len(scale_X)

    def _bcm_labels(scores):
        """Fit a BCM on a 1-D score vector and return its labels."""
        bcm = BCM(X=scale_X, is_require_X=self.is_require_X,
                  bandwidth=self.bandwidth)
        bcm.fit(scores.reshape(-1, 1))
        return bcm.labels_

    # all base detectors with default parameters
    detectors = [LOF(), SO_GAAL(), IForest(), HBOS(), CBLOF()]

    # record results for individual detectors
    self.scores_base_ = np.zeros((n_samples, len(detectors)))
    self.labels_base_ = np.zeros((n_samples, len(detectors)))

    # record results for all merging strategies
    self.scores_ = np.zeros((n_samples, len(self.scores_comb)))
    self.labels_ = np.zeros((n_samples, len(self.scores_comb)))

    for i, detector in enumerate(detectors):
        clf = detector.fit(scale_X)
        self.scores_base_[:, i] = clf.decision_function(scale_X)
        # obtain a series of binary labels using the BCM
        self.labels_base_[:, i] = _bcm_labels(self.scores_base_[:, i])

    # normalization of all outlier score vectors with Z-score
    _scale_score = scale(self.scores_base_)

    for i, strategy in enumerate(self.scores_comb):
        if strategy == "voting":
            # majority vote over the base labels; there is no combined
            # score for this strategy, so its score column is NaN
            self.scores_[:, i] = np.nan
            self.labels_[:, i] = np.array(
                [statistics.mode(row) for row in self.labels_base_])
            continue

        if strategy == "maximum":
            # the maximum of the base outlier scores for each segment
            combined = np.max(_scale_score, axis=1)
        elif strategy == "lscp":
            # the base detectors were already fitted in the loop above
            lscp = LSCP(detectors, pre_fitted=True)
            lscp.fit(scale_X)
            combined = lscp.decision_function(scale_X)
        elif strategy == "averaging":
            combined = np.mean(_scale_score, axis=1)
        else:
            # unrecognised strategy names are skipped; their columns
            # keep the initial zeros
            continue

        self.scores_[:, i] = combined
        # obtain binary labels with the BCM
        self.labels_[:, i] = _bcm_labels(self.scores_[:, i])

    # Return the estimator as the docstring promises, so that calls such
    # as ``est.fit(X).labels_`` work.
    return self