def test_ReplicateValidation(self):
    """Repeating an identical validation request must not add a second entry."""
    clustering = oe.cluster(self.data)
    clustering.cluster('parent', 'kmeans', 'kmeans', K=2)
    validator = oe.validation(self.data, clustering)

    # Nothing has been calculated yet, so the results container is empty.
    self.assertEqual(0, len(validator.validation))

    # The first calculation registers exactly one result.
    validator.calculate('Ball_Hall', 'kmeans', 'parent')
    self.assertEqual(1, len(validator.validation))

    # The very same request again: the count has to stay at one.
    validator.calculate('Ball_Hall', 'kmeans', 'parent')
    self.assertEqual(1, len(validator.validation))
def test_validation_badSourceAndCluster(self):
    """calculate() must raise ValueError when any one of its three names is unknown."""
    clustering = oe.cluster(self.data)
    clustering.cluster('parent', 'kmeans', 'kmeans', K=2)
    validator = oe.validation(self.data, clustering)

    # Each row spoils exactly one positional argument of an otherwise valid
    # ('Ball_Hall', 'kmeans', 'parent') request: third, second, then first.
    # NOTE(review): judging by the test name, the second argument is the
    # cluster name and the third the data source name -- confirm against
    # the signature of validation.calculate.
    bad_requests = (
        ('Ball_Hall', 'kmeans', 'gobblygook'),
        ('Ball_Hall', 'gobblygook', 'parent'),
        ('GobblyGook', 'kmeans', 'parent'),
    )
    for first_arg, second_arg, third_arg in bad_requests:
        with self.assertRaises(ValueError):
            validator.calculate(first_arg, second_arg, third_arg)
def test_allValidationMetrics(self):
    """Run every available metric and check that each one adds a single record.

    After the n-th metric has been calculated, the ``validation``,
    ``description``, ``source_name`` and ``cluster_name`` containers of the
    validation object must each hold exactly n entries.
    """
    clustering = oe.cluster(self.data)
    clustering.cluster('parent', 'kmeans', 'kmeans', K=2)
    validator = oe.validation(self.data, clustering)

    metric_names = validator.validation_metrics_available()
    tracked_attributes = ('validation', 'description', 'source_name', 'cluster_name')

    # Counting from 1: the first metric must leave one entry, the second two, ...
    for expected_count, metric_name in enumerate(metric_names, start=1):
        validator.calculate(metric_name, 'kmeans', 'parent')
        for attribute in tracked_attributes:
            self.assertEqual(expected_count, len(getattr(validator, attribute)))
def test_validation_merge(self):
    """Check merge(): a list holding a plain string raises ValueError, and
    merging two other validation objects grows the result set from 1 to 3."""
    solution_names = ('kmeans_1', 'kmeans_2', 'kmeans_3')

    # Three k-means solutions created with identical settings under distinct names.
    clustering = oe.cluster(self.data)
    for solution_name in solution_names:
        clustering.cluster('parent', 'kmeans', solution_name,
                           K=2, random_seed=0, init='random', n_init=1)

    # Build all validation objects first, then score one solution in each,
    # keeping the construct-then-calculate order.
    validators = [oe.validation(self.data, clustering) for _ in solution_names]
    for validator, solution_name in zip(validators, solution_names):
        validator.calculate('silhouette', solution_name, 'parent')

    primary, *others = validators
    self.assertEqual(1, len(primary.validation.keys()))

    with self.assertRaises(ValueError):
        primary.merge(['string'])

    primary.merge(others)
    self.assertEqual(3, len(primary.validation.keys()))
import pandas as pd
from sklearn import datasets
import openensembles as oe
import matplotlib.pyplot as plt
import seaborn as sns

# Generate a two-moons toy dataset and load the points into a pandas DataFrame.
x, y = datasets.make_moons(
    n_samples=200,
    shuffle=True,
    noise=0.02,
    random_state=None,
)
df = pd.DataFrame(x)

# Wrap the DataFrame in an OpenEnsembles data object.
dataObj = oe.data(df, [1, 2])

# Clustering object that accumulates every individual k-means solution.
c = oe.cluster(dataObj)

c_MV_arr = []  # one majority-vote result per iteration
val_arr = []   # the det_ratio entry read back for each majority-vote result
for i in range(39):
    # Every solution added to c needs its own unique name.
    name = 'kmeans_' + str(i)
    c.cluster('parent', 'kmeans', name, K=16, init='random', n_init=1)

    # Majority vote over everything in c so far; c holds one more solution
    # on each pass through the loop.
    c_MV_arr.append(c.finish_majority_vote(threshold=0.5))

    # Determinant-ratio metric for the majority-vote result just appended.
    v = oe.validation(dataObj, c_MV_arr[-1])
    val_name = v.calculate('det_ratio', 'majority_vote', 'parent')
    val_arr.append(v.validation[val_name])

# Co-occurrence matrix over the solutions held in c.
# NOTE(review): assumed to run once after the loop rather than inside it -- confirm.
coMat = c.co_occurrence_matrix()
coMat.plot(labels=False)
# Reproduce Ana Fred's majority voting solution using OpenEnsembles.
import pandas as pd
from sklearn import datasets
import openensembles as oe

# Generate two Gaussian blobs and load the points into a pandas DataFrame.
x, y = datasets.make_blobs(
    n_samples=250,
    centers=[(0, 0), (0, 10)],
    cluster_std=1,
)
df = pd.DataFrame(x)

# Wrap the DataFrame in an OpenEnsembles data object.
dataObj = oe.data(df, [1, 2])

# Clustering object that accumulates every individual k-means solution.
c = oe.cluster(dataObj)

# Keep each iteration's majority-vote result and the silhouette entry read
# back for it.
c_MV_arr = []
val_arr = []
for i in range(19):
    # A new solution can only be appended under a unique name (dictionary key).
    name = 'kmeans_' + str(i)
    c.cluster('parent', 'kmeans', name, K=16, init='random', n_init=1)

    # Recompute the majority vote now that c holds one more solution.
    c_MV_arr.append(c.finish_majority_vote(threshold=0.5))

    # Silhouette metric, instantiating the validation object with the
    # majority-vote cluster object just appended.
    v = oe.validation(dataObj, c_MV_arr[-1])
    output_name = v.calculate('silhouette', 'majority_vote', 'parent')
    val_arr.append(v.validation[output_name])

# Co-occurrence matrix over the solutions held in c.
# NOTE(review): assumed to run once after the loop rather than inside it -- confirm.
coMat = c.co_occurrence_matrix()