def test_data_generate_cluster5(self):
    """Check that ``generate_data_clusters`` rejects ill-typed arguments.

    ``n_clusters``, ``n_features``, ``contamination`` and ``dist`` are set
    to the string ``'e'`` one at a time; every such call is expected to
    raise ``ValueError``.
    """
    # Arguments that are identical in every call below.
    common = dict(n_train=self.n_train,
                  n_test=self.n_test,
                  random_state=self.random_state)

    # The result is deliberately NOT unpacked inside the ``with`` blocks:
    # tuple unpacking raises ValueError on a length mismatch, which would
    # satisfy ``assert_raises`` even if the call itself never raised.

    # invalid n_clusters
    with assert_raises(ValueError):
        generate_data_clusters(n_features=3,
                               n_clusters='e',
                               contamination=self.contamination,
                               **common)

    # invalid n_features
    with assert_raises(ValueError):
        generate_data_clusters(n_features='e',
                               contamination=self.contamination,
                               **common)

    # invalid contamination
    with assert_raises(ValueError):
        generate_data_clusters(n_features=3,
                               contamination='e',
                               **common)

    # invalid dist
    with assert_raises(ValueError):
        generate_data_clusters(n_features=3,
                               contamination=self.contamination,
                               dist='e',
                               **common)
def test_data_generate_cluster3(self):
    """Check that a fixed ``random_state`` makes generation reproducible.

    ``generate_data_clusters`` is called twice with exactly the same
    arguments and the four returned arrays are compared pairwise.

    Local names follow the ``X_train, X_test, y_train, y_test`` unpack
    order used by the other callers of ``generate_data_clusters`` in this
    test module, so that each name describes the array it is bound to.
    """
    # A single kwargs dict guarantees both runs receive identical settings.
    settings = dict(n_train=self.n_train,
                    n_test=self.n_test,
                    n_features=3,
                    contamination=self.contamination,
                    random_state=self.random_state)

    X_train, X_test, y_train, y_test = generate_data_clusters(**settings)
    X_train2, X_test2, y_train2, y_test2 = \
        generate_data_clusters(**settings)

    assert_allclose(X_train, X_train2)
    assert_allclose(X_test, X_test2)
    assert_allclose(y_train, y_train2)
    assert_allclose(y_test, y_test2)
def test_data_generate_cluster2(self):
    """Check the shapes of the generated sample matrices.

    With ``n_features=4`` the train and test sample arrays are expected to
    have shapes ``(n_train, 4)`` and ``(n_test, 4)`` respectively.
    """
    expected_train_shape = (self.n_train, 4)
    expected_test_shape = (self.n_test, 4)

    # The label arrays are returned as well but are not inspected here.
    train_samples, test_samples, train_labels, test_labels = \
        generate_data_clusters(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=4,
            contamination=self.contamination,
            random_state=self.random_state,
        )

    assert_allclose(train_samples.shape, expected_train_shape)
    assert_allclose(test_samples.shape, expected_test_shape)
def setUp(self):
    """Build the shared fixture: clustered sample data and a fitted GMM.

    Stores the data-generation settings on ``self``, generates train/test
    data with ``generate_data_clusters`` and fits a ``GMM`` detector on
    the training samples.
    """
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.n_components = 4
    self.roc_floor = 0.8

    # One seed for both data generation and the detector, so the fitted
    # fixture is the same on every run.
    seed = 42

    (self.X_train, self.X_test,
     self.y_train, self.y_test) = generate_data_clusters(
        n_train=self.n_train,
        n_test=self.n_test,
        n_clusters=self.n_components,
        contamination=self.contamination,
        random_state=seed,
    )

    # NOTE(review): relies on GMM accepting ``random_state`` as documented
    # in the PyOD API reference; without it the mixture initialisation is
    # unseeded and score-threshold tests can vary between runs.
    self.clf = GMM(n_components=self.n_components,
                   contamination=self.contamination,
                   random_state=seed)
    self.clf.fit(self.X_train)
def test_data_generate_cluster(self):
    """Check sizes and outlier ratio of data from ``generate_data_clusters``.

    Verifies that labels and samples have matching lengths, that the
    train/test sample counts are within one sample of the requested sizes,
    that each sample has two features, and that the overall fraction of
    positive labels is within 0.01 of ``self.contamination``.
    """
    X_train, X_test, y_train, y_test = generate_data_clusters(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=2,
        contamination=self.contamination,
        random_state=self.random_state,
    )

    # one label per sample
    assert_equal(y_train.shape[0], X_train.shape[0])
    assert_equal(y_test.shape[0], X_test.shape[0])

    # Sample counts may deviate from the request by at most one, in either
    # direction.  A plain ``requested - actual <= 1`` check would also
    # accept arbitrarily oversized outputs.
    assert_allclose(X_train.shape[0], self.n_train, rtol=0, atol=1)
    assert_equal(X_train.shape[1], 2)

    assert_allclose(X_test.shape[0], self.n_test, rtol=0, atol=1)
    assert_equal(X_test.shape[1], 2)

    # overall share of samples labelled 1 across train and test
    out_perc = (np.sum(y_train) + np.sum(y_test)) / (
        self.n_train + self.n_test)
    assert_allclose(self.contamination, out_perc, atol=0.01)
# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
#
# The parent of THIS FILE's directory is appended, so the script can be
# launched from any working directory.  ``__file__`` is read through
# ``globals()`` so that environments which do not define it fall back to the
# parent of the current working directory instead of raising NameError.
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(globals().get("__file__", "")), "..")))

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = generate_data_clusters(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        n_clusters=4,
        contamination=contamination,
        random_state=42,
    )

    # train GMM detector, using the same contamination rate that was used
    # to generate the data
    clf_name = "GMM"
    clf = GMM(n_components=4, contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
from pyod.models.lof import LOF from pyod.utils.data import generate_data_clusters from pyod.utils.example import data_visualize from pyod.utils.example import visualize from pyod.utils.data import evaluate_print if __name__ == "__main__": contamination = 0.1 # percentage of outliers # Generate sample data in clusters X, y = generate_data_clusters(n_train=450, n_test=50, n_clusters=3, n_features=2, contamination=contamination, size='different', density='different', dist=0.2, random_state=42, return_in_clusters=True) # visualize the results data_visualize(X, y, show_figure=True, save_figure=False) # test on the generated datasets # Generate sample data in clusters X_train, X_test, y_train, y_test = generate_data_clusters( n_train=450, n_test=50, n_clusters=3,