def test_normalized_score_becomes_none(self):
    """Tunables that scored once but were later discarded are not retried.

    After commit ``6a08dc3cf1b68b35630cae6a87783aec4e2c9f83`` the following
    scenario has been observed:

    - One tunable produces a score at least once and then fails every
      following trial.
    - None of the other tunables ever produces a score.
    - Once all the tuners are created, only the one that produced a score
      is used.
    - After enough errors, that one is discarded, so `_normalized_errors`
      is empty.
    - Since a random.choice is used over the list of tunables, which still
      contains the one that has been discarded, at some point the
      discarded one is tried again.

    This test certifies that the scenario cannot happen again, by
    validating that the number of errors is always ``max_errors`` at most.
    """
    produced = []

    def scorer(name, proposal):
        """Return a score on the very first call and raise on every later one."""
        if produced:
            raise Exception()

        # A list is used as the "already scored" flag: rebinding a plain
        # boolean from this inner scope would fail unless using global.
        produced.append(1)
        return 1

    tunables = {
        tunable_name: {
            'a_parameter': {'type': 'int', 'default': 0, 'range': [0, 10]},
        }
        for tunable_name in ('a_tunable', 'another_tunable')
    }

    session = BTBSession(tunables, scorer, max_errors=3)

    with pytest.raises(StopTuning):
        session.run(8)

    expected_errors = {'a_tunable': 3, 'another_tunable': 3}
    assert session.errors == expected_errors
def test_stop(self):
    """A run with no iteration count over a small int range raises ``StopTuning``."""
    a_parameter = {'type': 'int', 'default': 0, 'range': [0, 2]}
    tunables = {'a_tunable': {'a_parameter': a_parameter}}

    session = BTBSession(tunables, self.scorer)

    with pytest.raises(StopTuning):
        session.run()
def test_errors(self):
    """The best proposal comes from the tunable whose scorer does not raise."""
    tunables = {
        tunable_name: {
            'a_parameter': {'type': 'int', 'default': 0, 'range': [0, 2]},
        }
        for tunable_name in ('a_tunable', 'another_tunable')
    }

    def scorer(name, proposal):
        # Only 'another_tunable' fails; any other tunable is scored by the
        # value of its single parameter.
        if name != 'another_tunable':
            return proposal['a_parameter']

        raise Exception()

    session = BTBSession(tunables, scorer)

    winner = session.run(4)

    assert winner['name'] == 'a_tunable'
    assert winner['config'] == {'a_parameter': 2}
def tune(self, X, y, max_evals=10, scoring=None, verbose=False):
    """Search the pipeline hyper-parameters and apply the best configuration found.

    The tunable hyper-parameters of ``self._pipeline`` are registered in a
    ``BTBSession`` under the single name ``'0'``. Every proposal is scored
    with ``self.k_fold_validation`` and the ``'config'`` of the best proposal
    is set on the pipeline.

    Args:
        X (pandas.DataFrame or ndarray):
            Inputs to the pipeline.
        y (pandas.Series or ndarray):
            Target values.
        max_evals (int):
            Maximum number of hyper-parameter optimization iterations.
            The same value is passed to the session as ``max_errors``.
        scoring (str):
            The name of the scoring function.
        verbose (bool):
            Whether to log information during processing.
    """
    def score_proposal(_, hyparam):
        # The tunable name is ignored: only one tunable is registered.
        return self.k_fold_validation(hyparam, X=X, y=y, scoring=scoring)

    search_space = {'0': self._pipeline.get_tunable_hyperparameters(flat=True)}

    session = BTBSession(
        search_space,
        score_proposal,
        max_errors=max_evals,
        verbose=verbose,
    )

    winner = session.run(max_evals)
    self._pipeline.set_hyperparameters(winner['config'])
def test_allow_duplicates(self):
    """With ``allow_duplicates=True`` a 10-iteration run over one small tunable completes.

    The run asks for more iterations (10) than the int range ``[0, 2]`` has
    values, and is expected to return a best proposal instead of stopping.
    """
    tunables = {
        'a_tunable': {
            'a_parameter': {
                'type': 'int',
                'default': 0,
                'range': [0, 2]
            }
        }
    }

    session = BTBSession(tunables, self.scorer, allow_duplicates=True)

    best = session.run(10)

    # Only 'a_tunable' is registered in this session, so it is the only name
    # the best proposal can carry; the previous expectation
    # ('another_tunable') referred to a tunable that does not exist here.
    assert best['name'] == 'a_tunable'
    assert best['config'] == {'a_parameter': 2}
def test_run_score_none(self):
    """A raising scorer is recorded with a ``None`` score and ``run`` still returns."""
    # setup
    session_mock = MagicMock(spec_set=BTBSession)
    session_mock.iterations = 0
    session_mock._range = range
    session_mock.best_proposal = {'test': 'config'}
    session_mock.propose.return_value = ('test', {'hp': 'test'})
    session_mock._scorer.side_effect = Exception()

    # run
    returned = BTBSession.run(session_mock, 1)

    # assert
    session_mock._scorer.assert_called_once_with('test', {'hp': 'test'})
    session_mock.record.assert_called_once_with('test', {'hp': 'test'}, None)
    assert returned == {'test': 'config'}
    assert session_mock.iterations == 1
def test_run_score(self):
    """A successful score is recorded and ``run`` returns the best proposal."""
    # setup
    session_mock = MagicMock(spec_set=BTBSession)
    session_mock.iterations = 0
    session_mock._range = range
    session_mock.best_proposal = {'test': 'config'}
    session_mock.propose.return_value = ('test', 'config')
    session_mock._scorer.return_value = 1

    # run
    returned = BTBSession.run(session_mock, 1)

    # assert
    session_mock._scorer.assert_called_once_with('test', 'config')
    session_mock.record.assert_called_once_with('test', 'config', 1)
    assert returned == {'test': 'config'}
    assert session_mock.iterations == 1
def test_minimize(self):
    """With ``maximize=False`` the best proposal uses the lowest parameter value."""
    a_parameter = {'type': 'int', 'default': 0, 'range': [0, 2]}
    tunables = {'a_tunable': {'a_parameter': a_parameter}}

    session = BTBSession(tunables, self.scorer, maximize=False)

    winner = session.run(3)

    # The returned proposal must be the one stored on the session.
    assert winner == session.best_proposal
    assert winner['name'] == 'a_tunable'
    assert winner['config'] == {'a_parameter': 0}
def test_allow_errors(self):
    """With ``max_errors=10`` the run completes although one parameter value always fails."""
    a_parameter = {'type': 'int', 'default': 0, 'range': [0, 1]}
    tunables = {'a_tunable': {'a_parameter': a_parameter}}

    def scorer(name, proposal):
        # Value 0 always fails; every other value gets the same score.
        if proposal['a_parameter'] != 0:
            return 1

        raise Exception()

    session = BTBSession(tunables, scorer, max_errors=10)

    winner = session.run(10)

    assert winner['name'] == 'a_tunable'
    assert winner['config'] == {'a_parameter': 1}
def test_multiple(self):
    """With two identical tunables the expected best proposal is ``another_tunable``."""
    tunables = {
        tunable_name: {
            'a_parameter': {'type': 'int', 'default': 0, 'range': [0, 2]},
        }
        for tunable_name in ('a_tunable', 'another_tunable')
    }

    session = BTBSession(tunables, self.scorer)

    winner = session.run(6)

    assert winner['name'] == 'another_tunable'
    assert winner['config'] == {'a_parameter': 2}
def train_btb(X_train, X_test, y_train, y_test, mtype, common_name_model, problemtype, classes,
              default_featurenames, transform_model, settings, model_session):
    """Tune and train a model with a ``BTBSession`` and pickle the best one.

    The function works inside a scratch ``btb_session`` folder created in the
    current working directory (an existing one is deleted first), copies the
    train/test CSV files of the model session into it, runs 100 tuning
    iterations, fits the best model on the training data, prints its score on
    the test data and pickles it as ``<common_name_model>.pickle`` in the
    original working directory.

    Args:
        X_train: Training features, passed to ``cross_val_score`` and ``fit``.
        X_test: Test features, used to score the fitted model.
        y_train: Training targets.
        y_test: Test targets.
        mtype (str): ``'c'`` tunes a random forest / SVC classifier, ``'r'``
            tunes a random forest / extra trees regressor. Any other value
            skips tuning and saving.
        common_name_model (str): Base name of the model; the part before the
            first ``'_'`` is used as the CSV file prefix.
        problemtype: Not used by this function.
        classes: Not used by this function.
        default_featurenames: Not used by this function.
        transform_model: Not used by this function.
        settings: Not used by this function.
        model_session (str): Folder, relative to the current working
            directory, whose ``data`` sub-folder holds the CSV files.

    Returns:
        tuple: ``(model_name, model_dir, files)`` where ``model_name`` is the
        pickle file name, ``model_dir`` is the working directory at the end
        of the call and ``files`` is ``[model_name, 'btb_session']``.
    """
    # create file names
    model_name = common_name_model + '.pickle'
    folder = 'btb_session'
    csvname = common_name_model.split('_')[0]
    curdir = os.getcwd()
    files = []

    # make a temporary folder for the training session
    try:
        os.mkdir(folder)
    except OSError:
        # A folder left over from a previous session: start from scratch.
        shutil.rmtree(folder)
        os.mkdir(folder)
    os.chdir(folder)

    # get training and testing data, preferring the transformed files
    data_prefix = curdir + '/' + model_session + '/data/' + csvname
    try:
        shutil.copy(data_prefix + '_train_transformed.csv', os.getcwd() + '/train.csv')
        shutil.copy(data_prefix + '_test_transformed.csv', os.getcwd() + '/test.csv')
    except OSError:
        shutil.copy(data_prefix + '_train.csv', os.getcwd() + '/train.csv')
        shutil.copy(data_prefix + '_test.csv', os.getcwd() + '/test.csv')

    # create required .JSON
    dataset_id, _ = create_json(folder, 'train.csv')
    os.mkdir(dataset_id)
    os.chdir(dataset_id)
    os.mkdir('tables')
    shutil.copy(curdir + '/' + folder + '/train.csv', os.getcwd() + '/tables/train.csv')

    def build_model(name, hyperparameters):
        # ``models`` is bound by the branch below before this is ever called.
        model_class = models[name]
        return model_class(random_state=0, **hyperparameters)

    def save_model(model):
        # now save the model in .pickle, back in the original directory
        os.chdir(curdir)
        with open(model_name, 'wb') as handle:
            pickle.dump(model, handle)

    if mtype == 'c':
        models = {'RF': RandomForestClassifier, 'SVC': SVC}

        def score_model(name, hyperparameters):
            model = build_model(name, hyperparameters)
            scores = cross_val_score(model, X_train, y_train)
            return scores.mean()

        rf_hyperparams = {'n_estimators': IntHyperParam(min=10, max=500),
                          'max_depth': IntHyperParam(min=10, max=500)}
        rf_tunable = Tunable(rf_hyperparams)
        print(rf_tunable)

        svc_hyperparams = {'C': FloatHyperParam(min=0.01, max=10.0),
                           'gamma': FloatHyperParam(0.000000001, 0.0000001)}
        svc_tunable = Tunable(svc_hyperparams)
        print(svc_tunable)

        tuners = {'RF': rf_tunable, 'SVC': svc_tunable}
        print(tuners)

        session = BTBSession(tuners, score_model, verbose=True)
        best_proposal = session.run(iterations=100)
        best_model = build_model(best_proposal['name'], best_proposal['config'])
        best_model.fit(X_train, y_train)
        accuracy = best_model.score(X_test, y_test)

        print('ACCURACY:')
        print(accuracy)

        save_model(best_model)

    elif mtype == 'r':
        def tree_space():
            # Both tree ensembles are tuned over the same search space.
            return {
                'n_estimators': {'type': 'int', 'default': 2, 'range': [1, 1000]},
                'max_features': {'type': 'str', 'default': 'log2',
                                 'range': [None, 'auto', 'log2', 'sqrt']},
                'min_samples_split': {'type': 'int', 'default': 2, 'range': [2, 20]},
                'min_samples_leaf': {'type': 'int', 'default': 2, 'range': [1, 20]},
            }

        tunables = {
            'random_forest': tree_space(),
            'extra_trees': tree_space(),
        }

        models = {
            'random_forest': RandomForestRegressor,
            'extra_trees': ExtraTreesRegressor,
        }

        def score_model(name, hyperparameters):
            model = build_model(name, hyperparameters)
            r2_scorer = make_scorer(r2_score)
            scores = cross_val_score(model, X_train, y_train, scoring=r2_scorer)
            return scores.mean()

        session = BTBSession(tunables, score_model, verbose=True)
        best_proposal = session.run(iterations=100)
        best_model = build_model(best_proposal['name'], best_proposal['config'])
        best_model.fit(X_train, y_train)
        pred = best_model.predict(X_test)

        # The result must not be stored under the name ``r2_score``: assigning
        # to that name anywhere in this function makes it a local variable, so
        # both the ``score_model`` closure and this call would hit an unbound
        # name instead of the imported metric.
        r2 = r2_score(y_test, pred)

        print('R2 score!!')
        print(r2)

        save_model(best_model)

    files.append(model_name)
    files.append(folder)
    model_dir = os.getcwd()

    return model_name, model_dir, files
def test_session():
    """Run two ``BTBSession`` iterations over two tree-ensemble regressors."""
    dataset = load_dataset()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=0)

    models = {
        'random_forest': RandomForestRegressor,
        'extra_trees': ExtraTreesRegressor,
    }

    def tree_space():
        # Both estimators are tuned over the same search space.
        return {
            'n_estimators': {'type': 'int', 'default': 2, 'range': [1, 1000]},
            'max_features': {
                'type': 'str',
                'default': 'log2',
                'range': [None, 'auto', 'log2', 'sqrt'],
            },
            'min_samples_split': {'type': 'int', 'default': 2, 'range': [2, 20]},
            'min_samples_leaf': {'type': 'int', 'default': 2, 'range': [1, 20]},
        }

    tunables = {
        'random_forest': tree_space(),
        'extra_trees': tree_space(),
    }

    def build_model(name, hyperparameters):
        return models[name](random_state=0, **hyperparameters)

    def score_model(name, hyperparameters):
        # Mean cross-validated R2 of the candidate on the training split.
        candidate = build_model(name, hyperparameters)
        fold_scores = cross_val_score(
            candidate, X_train, y_train, scoring=make_scorer(r2_score))
        return fold_scores.mean()

    session = BTBSession(tunables, score_model, verbose=True)
    session.run(2)