def test_dataObj_merge(self):
    """Check that merge() rejects a plain list and pulls transforms along.

    The fixture object receives one transform and is expected to hold 2
    entries in ``D``; a second object built from the same CSV receives two
    transforms. After merging, ``D`` is expected to hold 5 entries.
    """
    x = [0, 5, 30]
    other = oe.data(pd.read_csv('data_test.csv'), x)

    # A list wrapping the raw x-values is not a list of data objects.
    with self.assertRaises(ValueError):
        self.data.merge([x])

    # Add some transforms so we can verify that merge carries everything over.
    self.data.transform('parent', 'log', 'log10', base=10)
    self.assertEqual(2, len(self.data.D.keys()))

    for label, base in (('log10', 10), ('log2', 2)):
        other.transform('parent', 'log', label, base=base)

    self.data.merge([other])
    self.assertEqual(5, len(self.data.D.keys()))
def slice(self, names):
    """
    Return a new data object restricted to the entries listed in names.

    A fresh data object is instantiated from this object's dataframe, and
    the values stored under each requested name in D, x and params are then
    assigned (not copied) into it. 'parent' cannot be removed, as it is the
    default dataframe matrix that established the data object. To replace
    parent, instantiate a new object on a dataframe created from the
    transformation of interest instead.

    Parameters
    ----------
    names: list
        A list of strings matching the names to keep in the new slice
        (dictionary keys shared amongst D, x and params)

    Returns
    -------
    d: an openensembles data object
        An oe.data object holding the requested names

    Examples
    --------
    Drop 'zscore', keeping everything else

    >>> names = [name for name in d.D.keys() if name != 'zscore']
    >>> dNew = d.slice(names)

    Raises
    ------
    ValueError
        If a name in the list of names does not exist in the data object
    """
    # NOTE(review): self.x is indexed by name further down, so it looks like
    # a per-name mapping; confirm the constructor accepts it as-is here
    # (self.x['parent'] may have been intended).
    d = oe.data(self.df, self.x)

    known_names = list(self.D.keys())
    for name in names:
        if name not in known_names:
            raise ValueError(
                "ERROR: the source you requested for slicing does not exist in data object %s"
                % (name))
        # Carry the matrix, its x-values and its parameters across together.
        for attr in ('D', 'x', 'params'):
            getattr(d, attr)[name] = getattr(self, attr)[name]
    return d
def run_mv_oe(X, y=None):
    """Deprecated.

    Cluster ``X`` with an ensemble of 30 k-means solutions (K=15, random
    initialisation, a single init each) and combine them by majority vote
    with a threshold of 0.5.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Sample matrix; ``X.shape[1]`` is taken as the number of features.
    y : optional
        Passed through untouched to the result.

    Returns
    -------
    tuple
        ``(X, final_labels, y)`` where ``final_labels`` are the
        majority-vote labels shifted down by one.

    Notes
    -----
    When the vote collapses to a single label the whole procedure is run
    again. The original ``return`` applied the conditional expression only
    to the last tuple element, so a retry was nested inside the tuple in
    place of ``y`` instead of being returned.
    """
    # NOTE(review): looks like leftover debug output; kept so stdout is unchanged.
    print("a")

    n_features = X.shape[1]
    columns = [f"x{i}" for i in range(n_features)]
    df = pd.DataFrame(X, columns=columns)
    dataObj = oe.data(df, list(range(n_features)))
    c = oe.cluster(dataObj)

    c_MV_arr = []
    for i in range(30):
        name = f'kmeans_{i}'
        c.cluster('parent', 'kmeans', name, K=15, init='random', n_init=1)
        c_MV_arr.append(c.finish_majority_vote(threshold=0.5))

    # Only the vote taken over the full ensemble is used for the result.
    final_labels = c_MV_arr[-1].labels['majority_vote'] - 1
    n_unique = len(np.unique(final_labels))
    print(n_unique)

    if n_unique > 1:
        return X, final_labels, y

    # Degenerate single-label outcome: retry from scratch.
    # NOTE(review): the number of retries is unbounded.
    return run_mv_oe(X, y)
#%matplotlib inline

# Re-seeding with the same number before every call to a NumPy random
# function makes that call return the same result each time. Seeding only
# once still lets the individual random calls differ from one another.
np.random.seed(0)  # helps to establish the same dataset and functionality, but is not required

# Open a csv file and convert to dataframe object
# df = pd.read_csv('Data/DataGranulatedGCSNoPT.csv')
# -----

# Convert the dataframe to an oe data object.
# The second argument holds one entry per dataframe column: 1..n_columns.
d = oe.data(df, list(range(1, len(df.columns) + 1)))

'''
WHAT NEEDS TO BE ADDED FOR FUTURE USAGE FROM OTHERS:
a) After loading data
   Ensure everything is either normalized or handle in this cell before creating the ensemble
b) Ensure categorical features are encoded
   Code cell accidently deleted for encoding but pandas has useful tool for easy encoding
   https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
'''

# Pass the oe data object to the cluster class
c = oe.cluster(d)  # instantiate an object so we can get all available algorithms
def test_setup_floats(self):
    """Float x-values are expected to be stored unchanged under 'parent'."""
    expected = [0.0, 5.0, 30.0]
    frame = pd.read_csv('data_test.csv')
    self.data = oe.data(frame, expected)
    self.assertListEqual(expected, self.data.x['parent'])
def test_setup_stringX(self):
    """An x list containing a string is expected to end up as [0, 1, 2]."""
    mixed_axis = ['something', 5, 30]
    frame = pd.read_csv('data_test.csv')
    self.data = oe.data(frame, mixed_axis)

    stored = self.data.x['parent']
    self.assertListEqual([0, 1, 2], stored)
def test_incorrect_setup(self):
    """Constructing a data object with four x-values must raise ValueError.

    The other tests in this suite build the object from the same CSV with
    three x-values.
    """
    frame = pd.read_csv('data_test.csv')
    too_many = [0, 5, 10, 30]
    with self.assertRaises(ValueError):
        oe.data(frame, too_many)
def test_remove_metaData(self):
    """Build the data object from the metadata-bearing CSV fixture.

    NOTE(review): no assertion is made here; the test only checks that
    construction does not raise.
    """
    frame = pd.read_csv('data_test_meta.csv')
    self.data = oe.data(frame, [0, 5, 30])
def setUp(self):
    """Load the CSV fixture and store a data object on the test case."""
    frame = pd.read_csv('data_test.csv')
    self.data = oe.data(frame, [0, 5, 30])
def setUp(self):
    """Load the CSV fixture and store a data object on the test case.

    ``pd.DataFrame.from_csv`` was deprecated and later removed from pandas.
    ``pd.read_csv`` is its replacement; ``index_col=0`` and
    ``parse_dates=True`` reproduce the defaults ``from_csv`` applied, so the
    resulting dataframe keeps the same shape.
    """
    fileName = 'data_test.csv'
    df = pd.read_csv(fileName, index_col=0, parse_dates=True)
    x = [0, 5, 30]
    self.data = oe.data(df, x)
# --- SECTION 1 ---
# Libraries and data loading
import openensembles as oe
import numpy as np
import pandas as pd
import sklearn.metrics

from sklearn.datasets import load_breast_cancer
from sklearn.manifold import TSNE

bc = load_breast_cancer()
t = TSNE()

# --- SECTION 2 ---
# Create the data object from the two-column t-SNE embedding
embedded = pd.DataFrame(t.fit_transform(bc.data))
cluster_data = oe.data(embedded, [0, 1])

# NOTE(review): the seed is set after fit_transform above, so it cannot
# influence the embedding itself - confirm that is intended.
np.random.seed(123456)

# --- SECTION 3 ---
# Create the ensembles and calculate the homogeneity score
# for every (K, ensemble size) combination
for K in range(2, 8):
    for ensemble_size in range(3, 6):
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        preds = ensemble.finish_majority_vote(threshold=0.5)

        print(f'K: {K}, size {ensemble_size}:', end=' ')
        predicted = preds.labels['majority_vote']
        score = sklearn.metrics.homogeneity_score(bc.target, predicted)
        print('%.2f' % score)
recents = recents.dropna(axis=1, how="all") recents = recents.fillna(recents.median()) # Use only these specific features columns = [ 'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Positive affect', 'Negative affect', 'Confidence in national government', 'Democratic Quality', 'Delivery Quality' ] # Transform the data with TSNE tsne = t_sne.TSNE() transformed = pd.DataFrame(tsne.fit_transform(recents[columns])) # Create the data object cluster_data = oe.data(transformed, [0, 1]) # Create the ensemble ensemble = oe.cluster(cluster_data) for i in range(20): name = f'kmeans({i}-tsne' ensemble.cluster('parent', 'kmeans', name, 10) # Create the cluster labels preds = ensemble.finish_co_occ_linkage(threshold=0.5) # Add Life Ladder to columns columns = [ 'Life Ladder', 'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Positive affect',
# --- SECTION 1 ---
# Libraries and data loading
import openensembles as oe
import numpy as np
import pandas as pd
import sklearn.metrics

from sklearn.datasets import load_breast_cancer

bc = load_breast_cancer()

# --- SECTION 2 ---
# Create the data object from the raw features, labelled by feature name
features = pd.DataFrame(bc.data)
cluster_data = oe.data(features, bc.feature_names)

np.random.seed(123456)

# --- SECTION 3 ---
# Create the ensembles and calculate the homogeneity score
# for every (K, ensemble size) combination
for K in range(2, 8):
    for ensemble_size in range(3, 6):
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        preds = ensemble.finish_graph_closure(threshold=0.5)

        print(f'K: {K}, size {ensemble_size}:', end=' ')
        predicted = preds.labels['graph_closure']
        score = sklearn.metrics.homogeneity_score(bc.target, predicted)
        print('%.2f' % score)
# Use only these specific features
columns = [
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
    'Confidence in national government',
    'Democratic Quality',
    'Delivery Quality',
]

# Normalize the features by subtracting the mean
# and dividing by the standard deviation
normalized = recents[columns]
normalized = normalized - normalized.mean()
normalized = normalized / normalized.std()

# Create the data object from the normalized features. Previously the raw
# recents[columns] frame was passed here, which left the normalization
# above without any effect.
cluster_data = oe.data(normalized, columns)

np.random.seed(123456)
results = {'K': [], 'size': [], 'silhouette': []}

# Test different ensemble setups
Ks = [2, 4, 6, 8, 10, 12, 14]
sizes = [5, 10, 20, 50]
for K in Ks:
    for ensemble_size in sizes:
        # Build a fresh ensemble of `ensemble_size` k-means solutions
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        # Combine the solutions through co-occurrence linkage
        preds = ensemble.finish_co_occ_linkage(threshold=0.5)
import pandas as pd
from sklearn import datasets
import openensembles as oe
import matplotlib.pyplot as plt
import seaborn as sns

# Generate the two-moons dataset and wrap the samples in a pandas DataFrame.
x, y = datasets.make_moons(
    n_samples=200,
    shuffle=True,
    noise=0.02,
    random_state=None,
)
df = pd.DataFrame(x)

# Instantiate the oe data object
dataObj = oe.data(df, [1, 2])

# Instantiate an oe clustering object
c = oe.cluster(dataObj)

c_MV_arr = []
val_arr = []
for i in range(39):
    # Add a new clustering solution under a unique name
    name = 'kmeans_{}'.format(i)
    c.cluster('parent', 'kmeans', name, K=16, init='random', n_init=1)

    # Take a new majority vote; c holds one more solution on each iteration
    c_MV_arr.append(c.finish_majority_vote(threshold=0.5))

    # Calculate the determinant ratio metric for the vote just added
    v = oe.validation(dataObj, c_MV_arr[-1])
    val_name = v.calculate('det_ratio', 'majority_vote', 'parent')
    val_arr.append(v.validation[val_name])

# Calculate and plot the co-occurrence matrix
coMat = c.co_occurrence_matrix()
coMat.plot(labels=False)