import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from pyod.models.base import BaseDetector
from pyod.models.iforest import IForest
from pyod.models.knn import KNN


class IForestSupervisedKNN(BaseDetector):
    """Two-stage detector: an isolation forest filters the training set
    down to its most normal-looking fraction, and a KNN detector is then
    fitted on that filtered subset."""

    def __init__(self, get_top=0.8, if_params=None, knn_params=None):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False
        # avoid mutable default arguments for the parameter dicts
        self.iforest = IForest(**(if_params or {}))
        self.knn = KNN(**(knn_params or {}))

    def fit(self, X, y=None):
        X = check_array(X)
        self._set_n_classes(y)
        self.iforest.fit(X)
        # outlier probability of each training point
        scores = self.iforest.predict_proba(X)[:, 1]
        # keep the get_top fraction with the lowest outlier scores
        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]
        self.knn.fit(normal_instances)
        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()
        self.is_fitted = True
        return self

    def decision_function(self, X):
        check_is_fitted(self, ['is_fitted'])
        return self.knn.decision_function(X)
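# ----------------------------------------------------------------------
# A minimal usage sketch for IForestSupervisedKNN (not from the original
# source). It assumes pyod's generate_data helper; the parameter values
# are illustrative only.
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

det = IForestSupervisedKNN(get_top=0.8,
                           if_params={'random_state': 42},
                           knn_params={'n_neighbors': 5})
det.fit(X_train)
test_scores = det.decision_function(X_test)  # higher score = more anomalous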
from pyod.models.iforest import IForest


class IForestWrapper:
    """Thin wrapper exposing a plain fit/predict/predict_proba interface."""

    def __init__(self, **kwargs):
        self._model = IForest(**kwargs)

    def fit(self, X, T):
        # unsupervised learning: the targets T are not used
        self._model.fit(X)
        return self

    def predict(self, X):
        return self._model.predict(X)

    def predict_proba(self, X):
        return self._model.predict_proba(X)
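# ----------------------------------------------------------------------
# Hypothetical call site for the wrapper (not from the original source):
# the supervised-style signature accepts targets T but ignores them.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 4)   # toy feature matrix
T = np.zeros(100)       # placeholder targets, unused by fit

wrapped = IForestWrapper(n_estimators=100, random_state=0)
wrapped.fit(X, T)
labels = wrapped.predict(X)         # 0 = inlier, 1 = outlier
proba = wrapped.predict_proba(X)    # per-class probabilities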
def compute_anomaly_scores(self):
    # assign every train/test point to a reference group first
    self._find_reference_groups()
    beh_vals_train = self.X_train[:, self.behavioral_attr]
    beh_vals_test = self.X_test[:, self.behavioral_attr]
    n_groups = len(set(self.ref_groups_train))
    scores_all = np.empty(len(beh_vals_test))
    for i in range(n_groups):
        indices_train = np.where(self.ref_groups_train == i)[0]
        indices_test = np.where(self.ref_groups_test == i)[0]
        training_vals = beh_vals_train[indices_train, :]
        test_vals = beh_vals_test[indices_test, :]
        if len(training_vals) == 0 or len(test_vals) == 0:
            print('Empty cluster')
            continue
        # one isolation forest per reference group, scored on the
        # unified [0, 1] probability scale
        clf = IForest(random_state=47, contamination=0.01,
                      behaviour='new').fit(training_vals)
        scores_unified = clf.predict_proba(test_vals, method='unify')[:, 1]
        scores_all[indices_test] = scores_unified
    return scores_all
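# ----------------------------------------------------------------------
# The method above belongs to a larger detector class. A self-contained
# sketch of the same per-group idea, assuming group labels are already
# available (per_group_scores is a hypothetical helper, not part of the
# original class):
import numpy as np
from pyod.models.iforest import IForest


def per_group_scores(X_train, g_train, X_test, g_test):
    scores = np.full(len(X_test), np.nan)
    for g in np.unique(g_train):
        tr, te = X_train[g_train == g], X_test[g_test == g]
        if len(tr) == 0 or len(te) == 0:
            continue  # skip empty reference groups
        clf = IForest(random_state=47, contamination=0.01).fit(tr)
        scores[g_test == g] = clf.predict_proba(te, method='unify')[:, 1]
    return scores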
import unittest

from numpy.testing import (assert_allclose, assert_array_less,
                           assert_equal, assert_raises)
from scipy.stats import rankdata
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

from pyod.models.iforest import IForest
from pyod.utils.data import generate_data


class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'estimators_') and
                self.clf.estimators_ is not None)
        assert (hasattr(self.clf, 'estimators_samples_') and
                self.clf.estimators_samples_ is not None)
        assert (hasattr(self.clf, 'max_samples_') and
                self.clf.max_samples_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        # cloning should succeed without raising
        clone_clf = clone(self.clf)
        assert clone_clf is not None

    def tearDown(self):
        pass
import numpy as np
from onnxruntime import InferenceSession
from skl2onnx import to_onnx

#############################################
# And the conversion.

if IForest is not None:
    onx = to_onnx(model1, initial_types=initial_type,
                  target_opset=14)

###############################################
# Checking discrepancies
# ++++++++++++++++++++++

if IForest is not None:
    data = sc_data.astype(np.float32)
    expected_labels = model1.predict(data)
    expected_proba = model1.predict_proba(data)

    sess = InferenceSession(onx.SerializeToString())
    res = sess.run(None, {'float_input': data})
    onx_labels = res[0]
    onx_proba = res[1]

    diff_labels = np.abs(onx_labels.ravel() - expected_labels.ravel()).max()
    diff_proba = np.abs(onx_proba.ravel() - expected_proba.ravel()).max()

    print("discrepancies:", diff_labels, diff_proba)
    print("ONNX labels", onx_labels)
    print("ONNX probabilities", onx_proba)
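# ----------------------------------------------------------------------
# The names model1, sc_data, and initial_type come from earlier in the
# gallery script this fragment was taken from. A plausible reconstruction
# of that setup is sketched below (an assumption, not the original code).
# Converting a pyod model with to_onnx would typically also require
# registering a custom converter first (e.g. via skl2onnx's
# update_registered_converter); that step is assumed to have happened.
import numpy as np
from skl2onnx.common.data_types import FloatTensorType

try:
    from pyod.models.iforest import IForest
except ImportError:
    IForest = None  # keeps the example importable when pyod is absent

if IForest is not None:
    rng = np.random.RandomState(0)
    sc_data = rng.randn(100, 3).astype(np.float32)  # stand-in training data
    model1 = IForest(n_estimators=100, random_state=0)
    model1.fit(sc_data)

    # one float input with a dynamic batch dimension
    initial_type = [('float_input',
                     FloatTensorType([None, sc_data.shape[1]]))]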
"""
@author: zixing.mei
"""
import numpy as np
import pandas as pd

from pyod.models.iforest import IForest

# x and train (with its bad_ind label column) are assumed to be defined
# upstream in the surrounding script
clf = IForest(behaviour='new',
              bootstrap=False,
              contamination=0.1,
              max_features=1.0,
              max_samples='auto',
              n_estimators=500,
              n_jobs=-1,
              random_state=None,
              verbose=0)
clf.fit(x)
out_pred = clf.predict_proba(x, method='linear')[:, 1]
train['out_pred'] = out_pred
# bucket rows by whether the outlier score exceeds 0.7
train['for_pred'] = np.where(train.out_pred > 0.7,
                             'negative sample ratio',   # 负样本占比
                             'positive sample ratio')   # 正样本占比
# bad rate within each bucket
dic = dict(train.groupby(train.for_pred).bad_ind.agg(np.sum) /
           train.bad_ind.groupby(train.for_pred).count())
pd.DataFrame(dic, index=[0])
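# ----------------------------------------------------------------------
# For the snippet above to run standalone, x and train have to exist; a
# hypothetical stand-in with synthetic risk-control-style data:
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
x = rng.randn(1000, 5)                               # 1,000 rows, 5 features
train = pd.DataFrame(x, columns=[f'f{i}' for i in range(5)])
train['bad_ind'] = rng.binomial(1, 0.08, size=1000)  # ~8% bad rate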
import unittest

from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score
from sklearn.utils.estimator_checks import check_estimator
# assert helpers below come from older scikit-learn releases
# (sklearn.utils.testing was removed in scikit-learn 0.24)
from sklearn.utils.testing import (assert_allclose, assert_array_less,
                                   assert_equal, assert_greater,
                                   assert_greater_equal, assert_less_equal,
                                   assert_raises, assert_true)

from pyod.models.iforest import IForest
from pyod.utils.data import generate_data


class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'estimators_') and
                    self.clf.estimators_ is not None)
        assert_true(hasattr(self.clf, 'estimators_samples_') and
                    self.clf.estimators_samples_ is not None)
        assert_true(hasattr(self.clf, 'max_samples_') and
                    self.clf.max_samples_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
import unittest

from sklearn.metrics import roc_auc_score
from sklearn.utils.estimator_checks import check_estimator
# assert helpers from older scikit-learn releases
# (sklearn.utils.testing was removed in scikit-learn 0.24)
from sklearn.utils.testing import (assert_equal, assert_greater,
                                   assert_greater_equal, assert_less_equal,
                                   assert_raises)

from pyod.models.iforest import IForest
from pyod.utils.data import generate_data


class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = IForest(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # fail with an explicit message when a fitted attribute is missing
        if not hasattr(self.clf, 'decision_scores_') or \
                self.clf.decision_scores_ is None:
            self.fail('decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.fail('labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.fail('threshold_ is not set')
        if not hasattr(self.clf, 'estimators_') or \
                self.clf.estimators_ is None:
            self.fail('estimators_ is not set')
        if not hasattr(self.clf, 'estimators_samples_') or \
                self.clf.estimators_samples_ is None:
            self.fail('estimators_samples_ is not set')
        if not hasattr(self.clf, 'max_samples_') or \
                self.clf.max_samples_ is None:
            self.fail('max_samples_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
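# ----------------------------------------------------------------------
# All three suites above follow the stock unittest layout; run as
# standalone modules, each would end with the usual entry point:
if __name__ == '__main__':
    unittest.main()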