def __init__(self,
             name,
             dataset=None,
             xlabels=None,
             ylabels=None,
             libs=None,
             models=None,
             modellibdict=None):
    """Set up a named experiment with optional data and optional models.

    Args:
        name: Experiment name; must be a string.
        dataset: Optional dataset handed to the DatasetManager.
        xlabels: Optional feature label(s) inside ``dataset``.
        ylabels: Optional target label inside ``dataset``.
        libs: Optional library name(s), forwarded to ``addModels``.
        models: Optional model(s), forwarded to ``addModels``.
        modellibdict: Optional argument forwarded to ``addModels``.

    Raises:
        ValueError: If ``name`` is not a string.
    """
    # isinstance (instead of an exact type comparison) also accepts
    # subclasses of str.
    if not isinstance(name, str):
        raise ValueError(
            'Experiment \'name\' argument must be string, not {}'.format(
                str(type(name))))
    self.name = name

    # Initialize the dataset manager object
    self.dm = DatasetManager()
    if dataset is not None:
        # The labels are only forwarded when BOTH are provided; with one
        # of them missing the dataset is stored without any labels.
        if xlabels is None or ylabels is None:
            self.dm.setData(dataset)
        else:
            self.dm.setData(dataset=dataset,
                            xlabels=xlabels,
                            ylabels=ylabels)

    # Initialize the models object
    self.mm = ModelsManager()

    # Add the provided model information
    self.addModels(libs=libs, models=models, modellibdict=modellibdict)
def test_setTargetType(self):
    """setTargetType must warn when the user contradicts the detected type."""
    # If the internal logic and user-input disagree, throw a warning notifying the user
    feature_names = [
        'age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'
    ]

    # Each scenario: (dataset loader, xlabels, ylabels, requested type,
    # warning text expected from the manager).
    scenarios = (
        (lambda: load_diabetes(as_frame=True).frame, feature_names, 'target',
         'classification',
         'User specified classification target type, but alexandria found regression target type. Assuming the user is correct...'
         ),
        (load_iris, 'data', 'target', 'regression',
         'User specified regression target type, but alexandria found classification target type. Assuming the user is correct...'
         ),
    )

    for loader, x_names, y_name, requested, message in scenarios:
        data = loader()
        with warnings.catch_warnings():
            # Promote warnings to exceptions so they can be caught below
            warnings.filterwarnings('error')
            try:
                manager = DatasetManager(dataset=data,
                                         xlabels=x_names,
                                         ylabels=y_name)
                manager.setTargetType(target_type=requested)
                fail(self)
            except Warning as raised:
                self.assertEqual(str(raised), message)
def test_setNumClasses(self):
    """Check setNumClasses: type validation, regression no-op, mismatch warning."""
    # Fails if non-integer is provided
    iris = load_iris()
    new_num = 3.2
    try:
        dm = DatasetManager(dataset=iris, xlabels='data', ylabels='target')
        dm.setNumClasses(new_num)
        # Same guard the sibling checks use: without it this check would
        # pass vacuously when no ValueError is raised.
        fail(self)
    except ValueError as ve:
        self.assertEqual(
            str(ve), 'num_classes argument must be integer, not {}'.format(
                type(new_num)))

    # Don't fail if non-integer is added when type is regression
    # The loader has to be CALLED; previously the function object itself
    # was handed to DatasetManager as the dataset.
    # NOTE(review): load_boston was removed in scikit-learn 1.2, so this
    # part needs an older scikit-learn or another regression Bunch.
    boston = load_boston()
    dm = DatasetManager(dataset=boston, xlabels='data', ylabels='target')
    new_num = 3.2
    dm.setNumClasses(new_num)
    self.assertEqual(dm.num_classes, None)

    # Check that it throws a warning to the user when the suggested number of classes is different
    iris = load_iris()
    with warnings.catch_warnings():
        try:
            # Promote warnings to exceptions so they can be caught below
            warnings.filterwarnings('error')
            dm = DatasetManager(dataset=iris,
                                xlabels='data',
                                ylabels='target')
            dm.setNumClasses(num_classes=4)
            fail(self)
        except Warning as uw:
            self.assertEqual(
                str(uw),
                'User specified 4 classes, but alexandria found 3 classes. Assuming user is correct...'
            )
def test_getNumClasses(self):
    """getNumClasses reports the class count, or None for regression targets."""

    def check(data, x_names, y_name, wanted):
        # Build a manager for the given data and compare its class count
        manager = DatasetManager(dataset=data,
                                 xlabels=x_names,
                                 ylabels=y_name)
        self.assertEqual(manager.getNumClasses(), wanted)

    # Check that it works as expected
    check(load_iris(), 'data', 'target', 3)

    cancer_frame = load_breast_cancer(as_frame=True).frame
    # Every column except the last one is used as a feature
    check(cancer_frame, cancer_frame.columns[:-1], 'target', 2)

    check(load_wine(), 'data', 'target', 3)

    # Return None if the target is regression, not classification
    diabetes_frame = load_diabetes(as_frame=True).frame
    check(diabetes_frame, diabetes_frame.columns[:-1], 'target', None)

    check(load_boston(), 'data', 'target', None)
def test_gety(self):
    """gety must return exactly the target values of the stored dataset."""
    # Check that it works as expected
    # For the Bunch datasets the comparison is element-wise; .all() demands
    # that EVERY element matches (.any() would accept a single match).
    iris = load_iris()
    dm = DatasetManager(dataset=iris, xlabels='data', ylabels='target')
    expected_y = iris.target
    actual_y = dm.gety()
    self.assertTrue((actual_y == expected_y).all())

    # NOTE(review): load_boston was removed in scikit-learn 1.2.
    boston = load_boston()
    dm = DatasetManager(dataset=boston, xlabels='data', ylabels='target')
    expected_y = boston.target
    actual_y = dm.gety()
    self.assertTrue((actual_y == expected_y).all())

    wine = load_wine()
    dm = DatasetManager(dataset=wine, xlabels='data', ylabels='target')
    expected_y = wine.target
    actual_y = dm.gety()
    self.assertTrue((actual_y == expected_y).all())

    # DataFrame-backed datasets: compare with .equals
    iris = load_iris(as_frame=True)
    iris = iris.frame
    data_cols = iris.columns[:-1]
    target_col = 'target'
    dm = DatasetManager(dataset=iris,
                        xlabels=data_cols,
                        ylabels=target_col)
    expected_y = iris[target_col]
    actual_y = dm.gety()
    self.assertTrue(actual_y.equals(expected_y))

    diabetes = load_diabetes(as_frame=True)
    diabetes = diabetes.frame
    data_cols = diabetes.columns[:-1]
    target_col = 'target'
    dm = DatasetManager(dataset=diabetes,
                        xlabels=data_cols,
                        ylabels=target_col)
    expected_y = diabetes[target_col]
    actual_y = dm.gety()
    self.assertTrue(actual_y.equals(expected_y))
def test_setData(self):
    """setData stores the dataset and labels and validates the label types."""
    # Check that we can add the data into the DatasetManager object
    iris = load_iris()
    dm = DatasetManager()
    dm.setData(iris)
    self.assertEqual(dm.dataset, iris)
    self.assertEqual(dm.xlabels, None)
    # Previously xlabels was asserted twice and ylabels never checked
    self.assertEqual(dm.ylabels, None)
    self.assertEqual(dm.target_type, None)
    self.assertEqual(dm.num_classes, None)

    xlabels = 'data'
    ylabels = 'target'
    dm.setData(iris, xlabels=xlabels, ylabels=ylabels)
    self.assertEqual(dm.dataset, iris)
    self.assertEqual(dm.xlabels, xlabels)
    self.assertEqual(dm.ylabels, ylabels)
    self.assertEqual(dm.target_type, 'classification')
    self.assertEqual(dm.num_classes, 3)

    # Fail if datatypes for xlabel and ylabel are wrong
    iris = load_iris()
    try:
        xlabels = ['data']
        ylabels = 512.39
        dm = DatasetManager()
        dm.setData(dataset=iris, xlabels=xlabels, ylabels=ylabels)
        fail(self)
    except ValueError as ve:
        self.assertEqual(
            str(ve), 'ylabels argument must be string, not {}'.format(
                type(ylabels)))

    iris = load_iris()
    try:
        xlabels = {'name': 'value'}
        ylabels = ['test1', 'test2']
        dm = DatasetManager()
        dm.setData(dataset=iris, xlabels=xlabels, ylabels=ylabels)
        fail(self)
    except ValueError as ve:
        self.assertEqual(
            str(ve),
            'xlabels argument must be string or list of strings, not {}'.
            format(type(xlabels)))
def test_init(self):
    """Exercise DatasetManager construction: defaults, valid and invalid input."""

    def check_attributes(manager, **wanted):
        # Compare each named attribute of the manager with its wanted
        # value, in the order the keywords were given.
        for attribute, value in wanted.items():
            self.assertEqual(getattr(manager, attribute), value)

    def check_rejected(message, **init_kwargs):
        # Construction must raise a ValueError carrying exactly `message`
        try:
            DatasetManager(**init_kwargs)
            fail(self)
        except ValueError as error:
            self.assertEqual(str(error), message)

    def check_warns(message, **init_kwargs):
        # Construction must emit a warning carrying exactly `message`;
        # warnings are promoted to exceptions so they can be caught.
        with warnings.catch_warnings():
            warnings.filterwarnings('error')
            try:
                DatasetManager(**init_kwargs)
                fail(self)
            except Warning as warning:
                self.assertEqual(str(warning), message)

    # Check that initializations occur correctly
    check_attributes(DatasetManager(),
                     target_type=None,
                     dataset=None,
                     datatype=None,
                     xlabels=None,
                     ylabels=None,
                     num_classes=None)

    # Check that it handles input data correctly - sklearn.Bunch
    # Classification dataset
    iris_bunch = load_iris()
    check_attributes(DatasetManager(dataset=iris_bunch,
                                    xlabels='data',
                                    ylabels='target'),
                     target_type='classification',
                     dataset=iris_bunch,
                     datatype=type(iris_bunch),
                     xlabels='data',
                     ylabels='target',
                     num_classes=3)

    # Regression dataset
    boston_bunch = load_boston()
    check_attributes(DatasetManager(dataset=boston_bunch,
                                    xlabels=['data'],
                                    ylabels='target'),
                     target_type='regression',
                     dataset=boston_bunch,
                     datatype=type(boston_bunch),
                     xlabels=['data'],
                     ylabels='target',
                     num_classes=None)

    # Check that it handles input data correctly - pandas.DataFrame
    # Classification dataset
    iris_frame = load_iris(as_frame=True).frame
    label_name = 'target'
    iris_features = [
        'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
        'petal width (cm)'
    ]
    check_attributes(DatasetManager(dataset=iris_frame,
                                    xlabels=iris_features,
                                    ylabels=label_name),
                     target_type='classification',
                     datatype=type(iris_frame),
                     xlabels=iris_features,
                     ylabels=label_name,
                     num_classes=3)

    # Regression dataset
    diabetes_frame = load_diabetes(as_frame=True).frame
    label_name = 'target'
    diabetes_features = [
        'age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'
    ]
    check_attributes(DatasetManager(dataset=diabetes_frame,
                                    xlabels=diabetes_features,
                                    ylabels=label_name),
                     target_type='regression',
                     datatype=type(diabetes_frame),
                     xlabels=diabetes_features,
                     ylabels=label_name,
                     num_classes=None)

    # Fail if the xlabels or ylabels don't exist within the provided dataset
    # sklearn.Bunch
    missing_attr = 'not_present'
    check_rejected(
        '{} is not an attribute of the provided dataset!'.format(
            missing_attr),
        dataset=load_iris(),
        xlabels=missing_attr,
        ylabels='target')

    # pandas.DataFrame
    iris_frame = load_iris(as_frame=True).frame
    mixed_columns = [
        'fake column 1', 'fake column 2', 'sepal length (cm)',
        'fake column 3'
    ]
    check_rejected(
        'These columns don\'t exist in the dataset: {}'.format(
            ['fake column 1', 'fake column 2', 'fake column 3']),
        dataset=iris_frame,
        xlabels=mixed_columns,
        ylabels='target')

    # If the internal logic and user-input disagree, throw a warning notifying the user
    diabetes_frame = load_diabetes(as_frame=True).frame
    label_name = 'target'
    diabetes_features = [
        'age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'
    ]
    check_warns(
        'User specified classification target type, but alexandria found regression target type. Assuming the user is correct...',
        dataset=diabetes_frame,
        xlabels=diabetes_features,
        ylabels=label_name,
        target_type='classification')

    check_warns(
        'User specified regression target type, but alexandria found classification target type. Assuming the user is correct...',
        dataset=load_iris(),
        xlabels='data',
        ylabels='target',
        target_type='regression')

    check_warns(
        'User specified 4 classes, but alexandria found 3 classes. Assuming user is correct...',
        dataset=load_iris(),
        xlabels='data',
        ylabels='target',
        num_classes=4)

    # Fail if the dataset type is not supported
    bad_type = 'fake type'
    check_rejected(
        'target_type argument must be \'regression\' or \'classification\', not {}'
        .format(bad_type),
        dataset=load_iris(),
        xlabels='data',
        ylabels='target',
        target_type=bad_type)

    # Fail if target_type is not valid data type
    bad_type = ['list', 'of', 'vals', 512]
    check_rejected(
        'target type must be of string type, not {}'.format(type(bad_type)),
        dataset=load_iris(),
        xlabels='data',
        ylabels='target',
        target_type=bad_type)

    # Fail if num_classes is not an integer
    bad_count = 10.77733
    check_rejected(
        'num_classes argument must be integer, not {}'.format(
            str(type(bad_count))),
        dataset=load_iris(),
        xlabels='data',
        ylabels='target',
        num_classes=bad_count)

    # Fail if more than one ylabel is provided
    check_rejected('Multi-column target is not supported!',
                   dataset=load_iris(),
                   xlabels=['data'],
                   ylabels=['test1', 'test2'])

    # Fail if datatypes for xlabel and ylabel are wrong
    bad_ylabels = 512.39
    check_rejected(
        'ylabels argument must be string, not {}'.format(type(bad_ylabels)),
        dataset=load_iris(),
        xlabels=['data'],
        ylabels=bad_ylabels)

    bad_xlabels = {'name': 'value'}
    check_rejected(
        'xlabels argument must be string or list of strings, not {}'.
        format(type(bad_xlabels)),
        dataset=load_iris(),
        xlabels=bad_xlabels,
        ylabels=['test1', 'test2'])
class Experiment:
    """A named experiment that pairs a DatasetManager with a ModelsManager.

    The experiment keeps its data in ``self.dm`` and its models in
    ``self.mm`` and forwards training, cross-validation, prediction and
    metric reporting to those two objects.
    """

    # Sentinel that distinguishes "argument omitted" from an explicit None
    # (an explicit ``libs=None`` means "use the default library").
    _OMITTED = object()

    def __init__(self,
                 name,
                 dataset=None,
                 xlabels=None,
                 ylabels=None,
                 libs=None,
                 models=None,
                 modellibdict=None):
        """Create the experiment and optionally register data and models.

        Args:
            name: Experiment name; must be a string.
            dataset: Optional dataset handed to the DatasetManager.
            xlabels: Optional feature label(s) inside ``dataset``.
            ylabels: Optional target label inside ``dataset``.
            libs: Optional library name(s), forwarded to ``addModels``.
            models: Optional model(s), forwarded to ``addModels``.
            modellibdict: Optional ready-made argument for the models
                manager, forwarded to ``addModels``.

        Raises:
            ValueError: If ``name`` is not a string.
        """
        # Single place for the name validation
        self.setName(name)

        # Initialize the dataset manager object
        self.dm = DatasetManager()
        if dataset is not None:
            # The labels are only forwarded when BOTH are provided; with
            # one of them missing the dataset is stored without labels.
            if xlabels is None or ylabels is None:
                self.dm.setData(dataset)
            else:
                self.dm.setData(dataset=dataset,
                                xlabels=xlabels,
                                ylabels=ylabels)

        # Initialize the models object
        self.mm = ModelsManager()

        # Add the provided model information
        self.addModels(libs=libs, models=models, modellibdict=modellibdict)

    # This method only exists to allow for users to feel more comfortable with the API
    def addModel(self, lib=None, model=None, modellibdict=None):
        """Singular-named alias of ``addModels``."""
        self.addModels(libs=lib, models=model, modellibdict=modellibdict)

    def addModels(self, modellibdict=None, libs=None, models=None):
        """Register models with the models manager.

        ``modellibdict`` takes precedence and is handed over unchanged.
        Otherwise, if ``models`` is given, ``libs``/``models`` are first
        translated by ``createForModelsManager``. With neither given,
        nothing happens.
        """
        if modellibdict is not None:
            # We can hand this data over
            self.mm.addModels(modellibdict)
        elif models is not None:
            # Model values were specified, so translate them into a
            # dictionary for the model manager
            self.mm.addModels(
                self.createForModelsManager(libs=libs, models=models))

    def createForModelsManager(self, libs=_OMITTED, models=_OMITTED):
        """Translate ``libs``/``models`` into a ``{lib: models}`` dictionary.

        Args:
            libs: ``None`` (stored under the key ``''``), a single library
                name, or a list of library names parallel to ``models``.
                Defaults to an empty list when omitted.
            models: The model(s). With a string/None ``libs`` the value is
                stored as-is; with a list ``libs`` it must be a list of
                the same length. Defaults to a fresh empty list when
                omitted.

        Returns:
            dict: Library name mapped to its model(s). For list ``libs``
            each value is a list of the models paired with that library.
        """
        # Fresh lists per call: a mutable default would be shared between
        # calls and leak into the returned dictionary.
        if libs is Experiment._OMITTED:
            libs = []
        if models is Experiment._OMITTED:
            models = []

        # If libs is None, then we will assume the default library on the other end
        if libs is None:
            libs = ''

        # If the libs argument is a string, then add all the models with this library
        if isinstance(libs, str):
            return {libs: models}

        grouped = {}
        # NOTE(review): a non-list `models` or mismatched lengths silently
        # yield an empty dictionary - kept as-is for compatibility.
        if (isinstance(libs, list) and isinstance(models, list)
                and len(libs) == len(models)):
            for lib, model in zip(libs, models):
                grouped.setdefault(lib, []).append(model)
        return grouped

    def getName(self):
        """Return the experiment name."""
        return self.name

    def setName(self, name):
        """Set the experiment name.

        Raises:
            ValueError: If ``name`` is not a string.
        """
        if not isinstance(name, str):
            raise ValueError(
                'Experiment \'name\' argument must be string, not {}'.format(
                    str(type(name))))
        self.name = name

    def getModels(self, aslist=False):
        """Return the models held by the models manager."""
        return self.mm.getModels(aslist)

    def getNumModels(self):
        """Return the length of what the models manager's getNumModels gives.

        NOTE(review): this applies len() to the manager's result; if
        ModelsManager.getNumModels already returns an int this raises
        TypeError - confirm against ModelsManager.
        """
        return len(self.mm.getNumModels())

    def train(self, X=None, y=None, *args, **kwargs):
        """Train all models.

        When ``X`` or ``y`` is missing, the stored dataset is split
        (``*args``/``**kwargs`` go to ``splitData``) and its training part
        is used. When both are provided, the extra arguments are not used
        and the experiment type is derived from ``y``.
        """
        if X is None or y is None:
            self.dm.splitData(*args, **kwargs)
            X, y = self.dm.getXtrain(), self.dm.getytrain()
            exp_type = self.dm.target_type
        else:
            # If user provides data, we need to figure out what type of experiment it is
            exp_type = self.dm.getExperimentTypeOf(y)

        # Train the models on the provided data
        self.mm.trainModelsOnXy(X, y, exp_type)

    def _resolveData(self, X, y):
        """Return ``(X, y, exp_type)``, falling back to the stored dataset."""
        if X is None or y is None:
            return self.dm.getX(), self.dm.gety(), self.dm.getTargetType()
        # If user provides data, we need to figure out what type of experiment it is
        return X, y, self.dm.getExperimentTypeOf(y)

    def trainCV(self, X=None, y=None, nfolds=-1, metrics=''):
        """Cross-validate all models on the given or the stored data.

        Raises:
            ValueError: If ``metrics`` is ``None`` or an empty string.
        """
        if metrics is None or metrics == '':
            raise ValueError('Metrics must be defined for cross validation!')

        X, y, exp_type = self._resolveData(X, y)
        self.mm.trainCV(X,
                        y,
                        metrics=metrics,
                        nfolds=nfolds,
                        exp_type=exp_type)

    def predict(self, X=None):
        """Return model predictions for ``X``, or for the stored test data."""
        if X is None:
            X = self.dm.getXtest()
        return self.mm.generateModelPredictions(X)

    def getMetrics(self):
        """Return the metrics collected by the models manager."""
        return self.mm.getMetrics()

    def summarizeMetrics(self):
        """Print the experiment name followed by a table of model metrics.

        One row is produced per dictionary-valued entry of the metrics;
        the column headers come from the first such entry.
        """
        # TO-DO: Make this output a lot smarter and more customizable
        print('\n' + self.name)
        metrics = self.mm.getMetrics()

        # List will hold all the rows for tabulate
        rows = []
        headers = []
        # Go through all of the metrics in the dictionary
        for model_metrics in metrics.values():
            if not isinstance(model_metrics, dict):
                continue

            # Set up the headers
            if not headers:
                headers = list(model_metrics.keys())

            row = []
            for value in model_metrics.values():
                # If there is an average and standard deviation, then let's output both
                if isinstance(value,
                              dict) and 'avg' in value and 'std' in value:
                    row.append('{:.4f}\u00B1{:.4f}'.format(
                        value['avg'], value['std']))
                else:
                    row.append(value)
            rows.append(row)

        print(tabulate(rows, headers=headers))

    def compareModels_tTest(self, a, X=None, y=None):
        """Forward to the models manager's compareModels_tTest.

        Uses the given data, or the stored dataset when ``X`` or ``y`` is
        missing; ``a`` is passed through unchanged.
        """
        X, y, exp_type = self._resolveData(X, y)
        self.mm.compareModels_tTest(X, y, exp_type=exp_type, a=a)