def __init__(self,
             data_set_name,
             metric=RootMeanSquaredError,
             models=None,
             ensembles=None,
             benchmark_id=None):
    """Initializes benchmark environment."""
    self.benchmark_id = benchmark_id
    self.data_set_name = data_set_name
    # Creates file name as combination of data set name, benchmark id, and date.
    self.file_name = (self.data_set_name + "_" + self.benchmark_id + "__" +
                      _now.strftime("%Y_%m_%d__%H_%M_%S"))
    # Loads samples into object.
    self.samples = load_standardized_samples(data_set_name)
    self.metric = metric
    self.ensembles = ensembles
    self.models = models

    # If the data set is a classification problem, remove regression models;
    # otherwise remove classification models.
    if is_classification(self.samples):
        self.classification = True
        for key in ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd'):
            self.models.pop(key, None)
    else:
        self.classification = False
        for key in ('mlpc_lbfgs', 'mlpc_adam', 'mlpc_sgd'):
            self.models.pop(key, None)

    # If the models include an MLP, remove Random Independent Weighting.
    if self.ensembles is not None:
        if 'mlpc_lbfgs' in self.models or 'mlpr_lbfgs' in self.models:
            self.ensembles.pop('riw', None)

    # Create results dictionary with one slot per outer fold for each model
    # (and ensemble) under study.
    self.results = {
        k: [None for _ in range(_OUTER_FOLDS)] for k in self.models
    }
    if self.ensembles is not None:
        self.results_ensemble = {
            ensemble: [None for _ in range(_OUTER_FOLDS)]
            for ensemble in self.ensembles
        }
    self.best_result = [None for _ in range(_OUTER_FOLDS)]

    # Serialize benchmark environment.
    benchmark_to_pickle(self)
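# A self-contained sketch (not from the source) of the two idioms the
# constructor above relies on: dict.pop(key, None) prunes a model whether or
# not it was registered, and a comprehension reserves one result slot per
# outer fold. The fold count and model keys below are illustrative placeholders.
_OUTER_FOLDS = 3

models = {'mlpc_lbfgs': None, 'mlpr_lbfgs': None, 'rfc': None}
for key in ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd'):
    models.pop(key, None)  # no KeyError even if the key was never present

results = {k: [None] * _OUTER_FOLDS for k in models}
print(results)  # {'mlpc_lbfgs': [None, None, None], 'rfc': [None, None, None]}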
def _standardize_data_set(data_set):
    """Removes unary features and standardizes each feature column to zero
    mean and unit variance; a label column is cast to float but left unscaled."""
    data_set_ext = _remove_unary_features(data_set)
    cols = data_set_ext.columns
    if is_classification(data_set_ext):
        # The last column holds the class label: cast it to float and
        # exclude it from standardization.
        data_set_ext[cols[-1]] = data_set_ext[cols[-1]].astype(float)
        cols = cols[:-1]
    for c in cols:
        data_set_ext[c] = (data_set_ext[c] -
                           data_set_ext[c].mean()) / data_set_ext[c].std()
    return data_set_ext
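# A self-contained demonstration of _standardize_data_set on a toy frame.
# The stand-ins for _remove_unary_features and is_classification below are
# assumptions chosen to mirror how the helper uses them, not the project's
# actual definitions.
import pandas as pd

def _remove_unary_features(df):
    # Assumed behavior: drop columns holding a single unique value.
    return df.loc[:, df.nunique() > 1].copy()

def is_classification(df):
    # Assumed behavior: an integer-typed last column is a class label.
    return pd.api.types.is_integer_dtype(df[df.columns[-1]])

toy = pd.DataFrame({
    'x1': [1.0, 2.0, 3.0, 4.0],
    'x2': [5.0, 5.0, 5.0, 5.0],  # unary feature: removed up front
    'y':  [0, 1, 0, 1],          # label: cast to float, left unscaled
})
print(_standardize_data_set(toy))  # x1 zero-mean/unit-variance, x2 gone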
def __init__(self, data_set_name, metric=RootMeanSquaredError, models=_MODELS):
    """Initializes benchmark environment."""
    self.data_set_name = data_set_name
    # Creates file name as combination of data set name and date.
    self.file_name = self.data_set_name + "__" + _now.strftime(
        "%Y_%m_%d__%H_%M_%S")
    # Loads samples into object.
    self.samples = [load_samples(data_set_name, index) for index in range(30)]
    self.metric = metric
    # Copy the dict so the deletions below do not mutate the shared
    # _MODELS default across instances.
    self.models = dict(models)

    # If the data set is a classification problem, remove regression models;
    # otherwise remove classification models.
    if is_classification(self.samples[0][0]):
        for key in ('svr', 'mlpr', 'rfr'):
            self.models.pop(key, None)
    else:
        for key in ('svc', 'mlpc', 'rfc'):
            self.models.pop(key, None)

    # Create results dictionary with one slot per sample for each model under study.
    self.results = {k: [None for _ in self.samples] for k in self.models}

    # Serialize benchmark environment.
    benchmark_to_pickle(self)
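# Why the dict(models) copy above matters: a minimal, self-contained
# demonstration of the shared-mutable-default pitfall. Deleting keys from a
# module-level default dict in place silently removes them for every later
# caller. The names here are illustrative, not from the source.
_DEFAULTS = {'svc': 'clf', 'svr': 'reg'}

def init_without_copy(models=_DEFAULTS):
    models.pop('svr', None)  # mutates the shared default in place
    return models

def init_with_copy(models=_DEFAULTS):
    models = dict(models)    # private copy; the default stays intact
    models.pop('svr', None)
    return models

init_without_copy()
print(_DEFAULTS)  # {'svc': 'clf'} -- 'svr' is gone for all later callers

_DEFAULTS['svr'] = 'reg'     # restore before trying the safe variant
init_with_copy()
print(_DEFAULTS)  # {'svc': 'clf', 'svr': 'reg'} -- unchanged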
def __init__(self,
             dataset_name,
             learning_metric=None,
             selection_metric=None,
             models=None,
             ensembles=None,
             benchmark_id=None,
             file_path=None):
    """Initializes benchmark environment."""
    self.benchmark_id = benchmark_id
    self.dataset_name = dataset_name
    # Creates file name as combination of dataset name, benchmark id, and date.
    self.file_name = (self.dataset_name + "_" + self.benchmark_id + "__" +
                      _now.strftime("%Y_%m_%d__%H_%M_%S"))
    # Loads samples into object.
    self.samples = load_standardized_samples(dataset_name, file_path)
    self.ensembles = ensembles
    self.models = models

    # TODO: Generalize this beyond the CIFAR-10 'data_batch_1' pickle.
    if self.dataset_name == 'data_batch_1':
        labels = self.samples[b'labels']
        # Scale pixel values to [0, 1] and append the labels as the last column.
        self.samples = pd.DataFrame(self.samples[b'data']) / 255
        self.samples[3072] = labels
        # Restrict to the first 1000 rows; drop this to use the full batch.
        self.samples = self.samples.head(1000)

    # Flag the problem type: binary classification, multiclass, or regression.
    if is_binary(self.samples):
        self.classification = True
        self.binary = True
    elif is_classification(self.samples):
        self.classification = True
        self.binary = False
    else:
        self.classification = False
        self.binary = False

    # Default metrics: RMSE for learning; AUROC for model selection on
    # classification problems, RMSE otherwise.
    self.learning_metric = (learning_metric if learning_metric is not None
                            else RootMeanSquaredError)
    self.selection_metric = (selection_metric if selection_metric is not None
                             else (AUROC if self.classification
                                   else RootMeanSquaredError))

    # If the dataset is a classification problem, remove regression models;
    # otherwise remove classification models.
    if self.classification:
        for key in ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd'):
            self.models.pop(key, None)
    else:
        for key in ('mlpc_lbfgs', 'mlpc_adam', 'mlpc_sgd'):
            self.models.pop(key, None)

    # If the models include an MLP, remove Random Independent Weighting.
    if self.ensembles is not None:
        if 'mlpc_lbfgs' in self.models or 'mlpr_lbfgs' in self.models:
            self.ensembles.pop('riw', None)

    # Create results dictionary with one slot per outer fold for each model
    # (and ensemble) under study.
    self.results = {
        k: [None for _ in range(_OUTER_FOLDS)] for k in self.models
    }
    if self.ensembles is not None:
        self.results_ensemble = {
            ensemble: [None for _ in range(_OUTER_FOLDS)]
            for ensemble in self.ensembles
        }
    self.best_result = [None for _ in range(_OUTER_FOLDS)]

    # Serialize benchmark environment.
    benchmark_to_pickle(self)
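# A sketch of the 'data_batch_1' branch above on a synthetic batch. An
# unpickled CIFAR-10 batch is a dict whose b'data' entry is an N x 3072
# uint8 array and whose b'labels' entry is a list of N ints; the tiny row
# count here is illustrative only.
import numpy as np
import pandas as pd

batch = {
    b'data': np.random.randint(0, 256, size=(10, 3072), dtype=np.uint8),
    b'labels': list(np.random.randint(0, 10, size=10)),
}

frame = pd.DataFrame(batch[b'data']) / 255  # scale pixels to [0, 1]
frame[3072] = batch[b'labels']              # labels become the last column
frame = frame.head(5)                       # subsample, as the constructor does
print(frame.shape)                          # (5, 3073)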
def __init__(self,
             dataset_name,
             learning_metric=None,
             selection_metric=None,
             models=None,
             ensembles=None,
             benchmark_id=None):
    """Initializes benchmark environment."""
    self.benchmark_id = benchmark_id
    self.data_set_name = dataset_name
    # Creates the benchmark name as combination of dataset name, benchmark id, and date.
    self.file_name = ('c_' + self.data_set_name + "_" + self.benchmark_id +
                      "__" + _now.strftime("%Y_%m_%d__%H_%M_%S"))
    # Loads samples into object.
    self.samples = load_pmlb_samples(dataset_name)
    self.ensembles = ensembles
    self.models = models

    # Default metrics: RMSE for learning; AUROC for model selection on
    # classification problems, RMSE otherwise.
    self.learning_metric = (learning_metric if learning_metric is not None
                            else RootMeanSquaredError)

    # If the dataset is a classification problem, remove regression models;
    # otherwise remove classification models.
    if is_classification(self.samples):
        self.classification = True
        self.selection_metric = (selection_metric
                                 if selection_metric is not None else AUROC)
        for key in ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd'):
            self.models.pop(key, None)
    else:
        self.classification = False
        self.selection_metric = (selection_metric
                                 if selection_metric is not None
                                 else RootMeanSquaredError)
        for key in ('mlpc_lbfgs', 'mlpc_adam', 'mlpc_sgd'):
            self.models.pop(key, None)

    # If the models include an MLP, remove Random Independent Weighting.
    if self.ensembles is not None:
        if 'mlpc_lbfgs' in self.models or 'mlpr_lbfgs' in self.models:
            self.ensembles.pop('riw', None)

    # Create results dictionary with one slot per outer fold for each model
    # (and ensemble) under study.
    self.results = {
        k: [None for _ in range(_OUTER_FOLDS)] for k in self.models
    }
    if self.ensembles is not None:
        self.results_ensemble = {
            ensemble: [None for _ in range(_OUTER_FOLDS)]
            for ensemble in self.ensembles
        }
    self.best_result = [None for _ in range(_OUTER_FOLDS)]

    # Serialize benchmark environment.
    benchmark_to_pickle(self)
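# A plausible sketch of load_pmlb_samples built on the real pmlb package's
# fetch_data function. The project's actual loader is not shown, so the
# return shape (a single DataFrame with the target as the last column, as
# is_classification above seems to expect) is an assumption.
from pmlb import fetch_data

def load_pmlb_samples(dataset_name):
    frame = fetch_data(dataset_name)  # downloads and caches the PMLB dataset
    cols = [c for c in frame.columns if c != 'target'] + ['target']
    return frame[cols]                # keep 'target' as the last column

samples = load_pmlb_samples('iris')
print(samples.shape)                  # e.g. (150, 5) for iris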