Esempio n. 1
0
 def test_ReplicateValidation(self):
     """Check that repeating an identical calculation adds no second entry."""
     clusterer = oe.cluster(self.data)
     clusterer.cluster('parent', 'kmeans', 'kmeans', K=2)
     validator = oe.validation(self.data, clusterer)

     # Nothing has been calculated yet, so the container starts empty.
     self.assertEqual(0, len(validator.validation))

     # The first calculation stores exactly one result.
     validator.calculate('Ball_Hall', 'kmeans', 'parent')
     self.assertEqual(1, len(validator.validation))

     # Requesting the very same metric again must leave the count at one.
     validator.calculate('Ball_Hall', 'kmeans', 'parent')
     self.assertEqual(1, len(validator.validation))
Esempio n. 2
0
 def test_validation_badSourceAndCluster(self):
     """Check that calculate() raises ValueError when any argument is unknown."""
     clusterer = oe.cluster(self.data)
     clusterer.cluster('parent', 'kmeans', 'kmeans', K=2)
     validator = oe.validation(self.data, clusterer)

     # Each tuple has exactly one unrecognised argument, in the order
     # (third, second, first) positional slot of calculate().
     bad_calls = (
         ('Ball_Hall', 'kmeans', 'gobblygook'),
         ('Ball_Hall', 'gobblygook', 'parent'),
         ('GobblyGook', 'kmeans', 'parent'),
     )
     for bad_args in bad_calls:
         with self.assertRaises(ValueError):
             validator.calculate(*bad_args)
Esempio n. 3
0
 def test_allValidationMetrics(self):
     """Run every available metric and check all result containers grow by one."""
     clusterer = oe.cluster(self.data)
     clusterer.cluster('parent', 'kmeans', 'kmeans', K=2)
     validator = oe.validation(self.data, clusterer)

     available_metrics = validator.validation_metrics_available()
     # Containers that are expected to gain one entry per calculated metric.
     tracked_attributes = (
         'validation',
         'description',
         'source_name',
         'cluster_name',
     )
     for expected_count, metric in enumerate(available_metrics, start=1):
         validator.calculate(metric, 'kmeans', 'parent')
         for attribute in tracked_attributes:
             self.assertEqual(expected_count,
                              len(getattr(validator, attribute)))
Esempio n. 4
0
    def test_validation_merge(self):
        """Check merge() rejects non-validation items and combines results.

        Three k-means solutions are created with identical settings, each is
        scored with the silhouette metric in its own validation object, and
        the second and third objects are then merged into the first.
        """
        solution_names = ('kmeans_1', 'kmeans_2', 'kmeans_3')

        ensemble = oe.cluster(self.data)
        for solution in solution_names:
            ensemble.cluster('parent', 'kmeans', solution,
                             K=2, random_seed=0, init='random', n_init=1)

        # One independent validation object per clustering solution.
        validators = [oe.validation(self.data, ensemble)
                      for _ in solution_names]
        for validator, solution in zip(validators, solution_names):
            validator.calculate('silhouette', solution, 'parent')

        primary, *others = validators
        self.assertEqual(1, len(primary.validation.keys()))

        # A list holding something other than validation objects is rejected.
        with self.assertRaises(ValueError):
            primary.merge(['string'])

        primary.merge(others)
        self.assertEqual(3, len(primary.validation.keys()))
Esempio n. 5
0
import pandas as pd
from sklearn import datasets
import openensembles as oe
import matplotlib.pyplot as plt
import seaborn as sns
# Generate a two-moons dataset and wrap it in a pandas DataFrame.
x, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.02, random_state=None)
df = pd.DataFrame(x)
# Wrap the DataFrame in an oe data object.
dataObj = oe.data(df, [1,2])
# Clustering object that accumulates every k-means solution created below.
c = oe.cluster(dataObj)
c_MV_arr = []
val_arr = []
for i in range(39):
    # Every solution stored in c needs its own unique name.
    name = 'kmeans_{}'.format(i)
    c.cluster('parent', 'kmeans', name, K=16, init='random', n_init=1)
    # Recompute the majority vote now that c holds one more solution.
    c_MV_arr.append(c.finish_majority_vote(threshold=0.5))
    # Score the newest majority-vote solution with the 'det_ratio' metric.
    v = oe.validation(dataObj, c_MV_arr[-1])
    val_name = v.calculate('det_ratio', 'majority_vote', 'parent')
    val_arr.append(v.validation[val_name])

# Build the co-occurrence matrix over all solutions in c and plot it.
coMat = c.co_occurrence_matrix()
coMat.plot(labels=False)
Esempio n. 6
0
#Reproduce Ana Fred's majority voting solution using OpenEnsembles
import pandas as pd 
from sklearn import datasets
import openensembles as oe

# Generate a two-blob dataset and wrap it in a pandas DataFrame.
x, y = datasets.make_blobs(n_samples=250, centers=[(0,0), (0, 10)], cluster_std=1)
df = pd.DataFrame(x)

# Wrap the DataFrame in an oe data object.
dataObj = oe.data(df, [1,2])

# Clustering object that accumulates every k-means solution created below.
c = oe.cluster(dataObj)

# Grow the ensemble one k-means solution at a time; after each addition,
# recompute the majority vote and record its silhouette score.
c_MV_arr = []
val_arr = []
for i in range(19):
    # The name is the dictionary key of the new solution, so it must be unique.
    name = 'kmeans_{}'.format(i)
    c.cluster('parent', 'kmeans', name, K=16, init='random', n_init=1)
    # Majority vote over all solutions gathered so far.
    c_MV_arr.append(c.finish_majority_vote(threshold=0.5))
    # Validate against the newest majority-vote cluster object.
    v = oe.validation(dataObj, c_MV_arr[-1])
    output_name = v.calculate('silhouette', 'majority_vote', 'parent')
    val_arr.append(v.validation[output_name])

# Build the co-occurrence matrix over all solutions in c.
coMat = c.co_occurrence_matrix()