def param_search(self, x, y, time_per_sample=3.5, **kwargs):
    """Search for a regressor with auto-sklearn, then score it with 10-fold CV.

    The search budget passed to auto-sklearn is ``len(y) * time_per_sample``
    truncated to an integer. After the search, the fitted estimator is refit
    on the training part of each fold of a shuffled 10-fold split and scored
    on the held-out part.

    Args:
        x: Feature container; must support indexing with index arrays.
        y: Target container; must support ``len`` and index-array indexing.
        time_per_sample: Search budget granted per sample.
        **kwargs: Accepted but not used by this method.

    Returns:
        The result of ``self.concat_results`` applied to the fold-averaged
        RMSE, MAE and R2.
    """
    search_budget = int(len(y) * time_per_sample)
    self.m = AutoSklearnRegressor(
        time_left_for_this_task=search_budget,
        resampling_strategy="cv",
        resampling_strategy_arguments={'folds': 10},
    )
    self.m.fit(
        x,
        y,
        metric=mean_squared_error,
        dataset_name="Land Use Regression",
    )

    rmse_per_fold, mae_per_fold, r2_per_fold = [], [], []
    splitter = KFold(n_splits=10, shuffle=True)
    for fit_rows, holdout_rows in splitter.split(x, y):
        # Refit the found ensemble on this fold's training rows only.
        self.m.refit(x[fit_rows], y[fit_rows])
        holdout_predictions = self.m.predict(x[holdout_rows])
        fold_rmse, fold_mae, fold_r2 = self.score_function(
            y[holdout_rows], holdout_predictions)
        rmse_per_fold.append(fold_rmse)
        mae_per_fold.append(fold_mae)
        r2_per_fold.append(fold_r2)

    return self.concat_results(
        np.mean(rmse_per_fold),
        np.mean(mae_per_fold),
        np.mean(r2_per_fold),
    )
def __init__(self, **kwargs) -> None:
    """Create the wrapped AutoSklearnRegressor backed by an in-process dask client.

    Args:
        **kwargs: Forwarded unchanged to ``AutoSklearnRegressor``. Must
            contain ``n_jobs``, which is also used as the number of dask
            workers.

    Raises:
        KeyError: If ``n_jobs`` is not present in ``kwargs``.
    """
    Ensemble.__init__(self)
    # The keyword is ``threads_per_worker`` (plural); the previous spelling
    # ``thread_per_worker`` did not set the per-worker thread count.
    client = Client(
        processes=False,
        n_workers=kwargs['n_jobs'],
        threads_per_worker=1,
        dashboard_address=None,
    )
    self.model = AutoSklearnRegressor(**kwargs, dask_client=client)
def test_cv_regression(self):
    """Check that a regressor can be fitted with the 'cv' resampling strategy."""
    parent_dir = os.path.join(self.test_dir, '..')
    tmp = os.path.join(parent_dir, '.tmp_regression_fit')
    output = os.path.join(parent_dir, '.out_regression_fit')
    for folder in (tmp, output):
        self._setUp(folder)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        resampling_strategy='cv',
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (356,))

    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    # With select rates the average score drops to roughly -32.40 .. -37
    # under the 30 second constraint; a larger time_left_for_this_task
    # makes this a non-issue.
    score = mean_squared_error(Y_test, predictions)
    self.assertGreaterEqual(score, -37)

    for folder in (tmp, output):
        self._tearDown(folder)
def test_cv_regression(self):
    """Check that a regressor can be fitted with the 'cv' resampling strategy."""
    parent_dir = os.path.join(self.test_dir, '..')
    tmp = os.path.join(parent_dir, '.tmp_regression_fit_cv')
    output = os.path.join(parent_dir, '.out_regression_fit_cv')
    for folder in (tmp, output):
        self._setUp(folder)

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)
    automl = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (206, ))

    score = r2(Y_test, predictions)
    # Emit targets and predictions so they show up in the test output.
    print(Y_test)
    print(predictions)
    self.assertGreaterEqual(score, 0.1)

    n_successful_runs = self._count_succeses(automl.cv_results_)
    self.assertGreater(n_successful_runs, 0)

    for folder in (tmp, output):
        self._tearDown(folder)
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """Check that a regressor can be fitted with the 'cv' resampling strategy."""
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(X_train, Y_train)

    # First AutoML log file in the tmp folder; used for failure messages.
    log_pattern = os.path.join(tmp_dir, 'AutoML*.log')
    log_file_path = glob.glob(log_pattern)[0]

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )

    score = r2(Y_test, predictions)
    assert score >= 0.1, extract_msg_from_log(log_file_path)
    assert count_succeses(
        automl.cv_results_) > 0, extract_msg_from_log(log_file_path)
def spawn_regressor(
        seed, time, search_space, prep_space, metric, dataset_name=None):
    """Run one auto-sklearn regressor instance, meant to live in its own process.

    auto-sklearn does not spawn worker processes itself, so this function is
    expected to be invoked several times from the main block, once per
    process.

    Args:
        seed: Seed for this instance; every instance must use a different one.
        time: Passed as ``time_left_for_this_task``.
        search_space: Passed as ``include_estimators``.
        prep_space: Passed as ``include_preprocessors``.
        metric: Metric handed to ``fit``.
        dataset_name: Optional dataset name handed to ``fit``.

    Returns:
        The ``cv_results_`` of the fitted regressor.
    """
    # Only the instance with seed 0 starts from meta-learning configurations;
    # the others start from a random incumbent so the processes do not all
    # evaluate the same configurations.
    is_seed_zero = seed == 0
    n_metalearning_configs = 25 if is_seed_zero else 0
    scenario_args = {} if is_seed_zero else {'initial_incumbent': 'RANDOM'}

    # What differs from a stand-alone auto-sklearn run:
    # 1. all instances write to the same tmp/output directories,
    # 2. shared_mode=True lets the instances share data,
    # 3. each instance gets its own seed,
    # 4. ensemble_size=0, since ensembles are built once all optimization
    #    runs have finished.
    regressor_settings = dict(
        time_left_for_this_task=time,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        delete_tmp_folder_after_terminate=False,
        ensemble_size=0,
        include_estimators=search_space,
        exclude_estimators=None,
        include_preprocessors=prep_space,
        exclude_preprocessors=None,
        initial_configurations_via_metalearning=n_metalearning_configs,
        seed=seed,
        smac_scenario_args=scenario_args,
    )
    automl = AutoSklearnRegressor(**regressor_settings)
    automl.fit(
        X_train,
        y_train,
        X_test=X_test,
        y_test=y_test,
        metric=metric,
        dataset_name=dataset_name,
    )
    return automl.cv_results_
def model_init(self, model_params: Dict[str, Any]) -> None:
    """Initialize the model object according to the configured task type.

    Reads ``self.config['mode']`` and stores the matching auto-sklearn
    estimator in ``self.model``. For any other mode nothing is assigned.

    Args:
        model_params: Dictionary of keyword arguments for the estimator.
    """
    mode = self.config['mode']
    if mode == 'classification':
        estimator_cls = AutoSklearnClassifier
    elif mode == 'regression':
        estimator_cls = AutoSklearnRegressor
    else:
        # Unknown mode: leave self.model untouched.
        return
    self.model = estimator_cls(**model_params)
def test_regression_methods_returns_self(self):
    """fit, fit_ensemble and refit must each return the estimator itself."""
    X_train, y_train, _, _ = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        ensemble_size=0,
    )

    fit_result = automl.fit(X_train, y_train)
    self.assertIs(automl, fit_result)

    fit_ensemble_result = automl.fit_ensemble(y_train, ensemble_size=5)
    self.assertIs(automl, fit_ensemble_result)

    refit_result = automl.refit(X_train.copy(), y_train.copy())
    self.assertIs(automl, refit_result)
def test_regression_pandas_support(self):
    """Fit, score and refit a regressor directly on pandas inputs."""
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the inputs really are pandas objects.
    self.assertIsInstance(X, pd.DataFrame)
    self.assertIsInstance(y, pd.Series)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
    )
    automl.fit(X, y)

    # Must be at least better than random. Scoring on the training data
    # (X_train == X_test) is deliberate: this checks the code path only.
    training_score = automl.score(X, y)
    self.assertGreater(training_score, 0.5)

    automl.refit(X, y)

    # Still at least better than random after the refit.
    refit_score = r2(y, automl.predict(X))
    self.assertGreater(refit_score, 0.5)
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score and refit a regressor directly on pandas inputs."""
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the inputs really are pandas objects.
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )
    automl.fit(X, y)

    # Must be at least better than random. Scoring on the training data
    # (X_train == X_test) is deliberate: this checks the code path only.
    assert automl.score(X, y) >= 0.5, print_debug_information(automl)

    automl.refit(X, y)

    # Still better than random after the refit.
    assert r2(y, automl.predict(X)) > 0.5, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
def test_autosklearn_regression_methods_returns_self(dask_client):
    """fit, fit_ensemble and refit must each return the estimator itself."""
    X_train, y_train, _, _ = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        dask_client=dask_client,
        ensemble_size=0,
    )

    fit_result = automl.fit(X_train, y_train)
    assert automl is fit_result

    fit_ensemble_result = automl.fit_ensemble(y_train, ensemble_size=5)
    assert automl is fit_ensemble_result

    refit_result = automl.refit(X_train.copy(), y_train.copy())
    assert automl is refit_result
def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train an auto-sklearn model and predict on the test data.

    Args:
        X_train: Training features; copied before being handed to auto-sklearn.
        X_test: Features to predict on.
        y_train: Training targets; copied before being handed to auto-sklearn.
        df_types: Table with ``NAME`` and ``TYPE`` columns; the ``TYPE``
            values of every row whose ``NAME`` is not ``'target'`` are passed
            as ``feat_type``.
        m_type: ``'classification'`` selects the classifier; any other value
            selects the regressor.
        seed: Seed passed to the estimator.
        *args: Accepted but not used.

    Returns:
        ``predict_proba(X_test)`` for classification, ``predict(X_test)``
        otherwise.
    """
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    feature_rows = df_types[df_types.NAME != 'target']
    categ_cols = feature_rows['TYPE'].values.ravel()

    is_classification = m_type == 'classification'
    if is_classification:
        estimator_cls, metric = AutoSklearnClassifier, f1_weighted
    else:
        estimator_cls, metric = AutoSklearnRegressor, mean_squared_error

    # Both estimator kinds are configured identically.
    automl = estimator_cls(
        time_left_for_this_task=TIME_PER_TASK,
        per_run_time_limit=int(TIME_PER_TASK/8),
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        delete_tmp_folder_after_terminate=False,
    )
    automl.fit(
        X_train.copy(),
        y_train.copy(),
        feat_type=categ_cols,
        metric=metric,
    )
    automl.refit(X_train.copy(), y_train.copy())

    if is_classification:
        return automl.predict_proba(X_test)
    return automl.predict(X_test)
class AutoSklearnRegressorEnsemble(AutoSklearnModel, Ensemble):
    """Wrapper around an autosklearn regression model."""

    _kind: ModelType = 'regressor'

    def __init__(self, **kwargs) -> None:
        """Create the wrapped AutoSklearnRegressor with an in-process dask client.

        Args:
            **kwargs: Forwarded unchanged to ``AutoSklearnRegressor``. Must
                contain ``n_jobs``, which is also used as the number of dask
                workers.

        Raises:
            KeyError: If ``n_jobs`` is not present in ``kwargs``.
        """
        Ensemble.__init__(self)
        # The keyword is ``threads_per_worker`` (plural); the previous
        # spelling ``thread_per_worker`` did not set the thread count.
        client = Client(
            processes=False,
            n_workers=kwargs['n_jobs'],
            threads_per_worker=1,
            dashboard_address=None,
        )
        self.model = AutoSklearnRegressor(**kwargs, dask_client=client)

    def autosklearn_model(self) -> AutoSklearnRegressor:
        """Return the wrapped AutoSklearnRegressor instance."""
        return self.model

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped model's prediction for ``X``."""
        return self.model.predict(X)

    def model_predictions(self, X: np.ndarray) -> np.ndarray:
        """Return the predictions of every model from ``self.models()``.

        The result stacks one ``predict(X)`` output per model, in the order
        ``self.models()`` yields them.
        """
        return np.asarray([m.predict(X) for m in self.models()])

    @classmethod
    def kind(cls) -> ModelType:
        """Return the model kind of this wrapper (``'regressor'``)."""
        return cls._kind
def test_regression(self):
    """Fit a regressor on boston and check prediction shape and score."""
    # The same folder serves as both tmp and output folder here.
    output = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=output,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (356, ))

    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    score = mean_squared_error(Y_test, predictions)
    self.assertGreaterEqual(score, -30)
def train_autosklearn(l=None):
    """Fit and refit an AutoSklearnRegressor on the training split of ``l``.

    Args:
        l: Data holder exposing ``X_train`` and ``y_train`` with a ``.values``
            attribute. When ``None``, it is obtained from ``get_data()``.

    Returns:
        The result of ``attributedict_from_locals('model')`` -- presumably a
        container exposing the fitted ``model``; verify against that helper.
    """
    if l is None:
        l = get_data()
    ensemble_size = 1  # 50 ... 1 for vanilla
    initial_configurations_via_metalearning = 0  # 25 ... 0 for vanilla
    model = AutoSklearnRegressor(
        delete_output_folder_after_terminate=True,
        delete_tmp_folder_after_terminate=True,
        disable_evaluator_output=False,
        ensemble_nbest=50,
        ensemble_size=ensemble_size,
        exclude_estimators=None,
        exclude_preprocessors=None,
        get_smac_object_callback=None,
        include_estimators=None,
        include_preprocessors=None,
        initial_configurations_via_metalearning=
        initial_configurations_via_metalearning,
        logging_config=None,
        ml_memory_limit=3072,
        output_folder=None,
        per_run_time_limit=360,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        # resampling_strategy='holdout',
        # resampling_strategy_arguments=None,
        seed=1,
        shared_mode=False,
        smac_scenario_args=None,
        time_left_for_this_task=3600,
        tmp_folder=None)
    # Copies are handed over so the caller's arrays are not the ones passed
    # to auto-sklearn; the target is squeezed before use.
    model.fit(l.X_train.values.copy(), l.y_train.values.squeeze().copy())
    model.refit(l.X_train.values.copy(), l.y_train.values.squeeze().copy())
    print(model.show_models())
    # NOTE(review): the helper is given the local's name as a string, so the
    # variable name ``model`` looks significant -- confirm before renaming.
    return attributedict_from_locals('model')
def test_regression(self):
    """Fit a regressor on boston and check prediction shape and score."""
    parent_dir = os.path.join(self.test_dir, '..')
    tmp = os.path.join(parent_dir, '.tmp_regression_fit')
    output = os.path.join(parent_dir, '.out_regression_fit')
    for folder in (tmp, output):
        self._setUp(folder)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (356,))

    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    score = mean_squared_error(Y_test, predictions)
    self.assertGreaterEqual(score, -30)
def train_regression():
    """Train an AutoSklearnRegressor on the c99temp data and pickle it.

    The features and the ``tempBoardSLAVE`` target column are loaded via
    ``file_loader('c99temp_train.snappy.csv')``. The fitted model is pickled
    into ``AUTO_ML_MODELS_PATH`` under a name that embeds the current
    timestamp.
    """
    # The timestamp is taken before training starts, as part of the name.
    dump_name = f'auto_sklearn_regressor{datetime.datetime.now()}.dump'
    dump_file = os.path.join(AUTO_ML_MODELS_PATH, dump_name)

    features, outcome_slave, _ = file_loader('c99temp_train.snappy.csv')
    feature_values = features.values
    target_values = outcome_slave['tempBoardSLAVE'].values

    model = AutoSklearnRegressor(
        time_left_for_this_task=3600,
        per_run_time_limit=600,
    )
    model.fit(feature_values, target_values)

    with open(dump_file, 'wb') as dump_handle:
        pickle.dump(model, dump_handle)
def test_regression(tmp_dir, output_dir, dask_client):
    """Fit a regressor on boston and check prediction shape and score."""
    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')

    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (356, )

    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    # With select rates the average score drops to roughly -32.40 .. -37
    # under the 30 second constraint; a larger time_left_for_this_task
    # makes this a non-issue.
    score = mean_squared_error(Y_test, predictions)
    assert score >= -37, print_debug_information(automl)
    assert count_succeses(automl.cv_results_) > 0
def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
    """List inputs must reach the mocked fit/refit/fit_ensemble as ndarrays.

    ``fit_ensemble``, ``refit`` and ``fit`` are the mocks injected into the
    test; each is expected to be called exactly once.
    """
    X = [[1], [2], [3]]
    y = [1, 2, 3]
    automl = AutoSklearnRegressor()

    automl.fit(X, y)
    self.assertEqual(fit.call_count, 1)
    fit_positional = fit.call_args[0]
    self.assertIsInstance(fit_positional[0], np.ndarray)
    self.assertIsInstance(fit_positional[1], np.ndarray)

    automl.refit(X, y)
    self.assertEqual(refit.call_count, 1)
    refit_positional = refit.call_args[0]
    self.assertIsInstance(refit_positional[0], np.ndarray)
    self.assertIsInstance(refit_positional[1], np.ndarray)

    automl.fit_ensemble(y)
    self.assertEqual(fit_ensemble.call_count, 1)
    ensemble_positional = fit_ensemble.call_args[0]
    self.assertIsInstance(ensemble_positional[0], np.ndarray)
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """Check that a regressor can be fitted with the 'cv' resampling strategy."""
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )

    score = r2(Y_test, predictions)
    assert score >= 0.1, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
def build_pipeline(self):
    """Build a preprocessing + auto-sklearn pipeline from ``self.data_config``.

    This exists because autosklearn does not perform automatic data
    encoding. Text columns listed under ``data_config['text_columns']`` are
    TF-IDF vectorized, the remaining inferred categorical columns are
    one-hot encoded, and the result is passed through a ``SimpleImputer``.
    When neither kind of column is present, the preprocessing is just a
    ``SimpleImputer``.

    Returns:
        A pipeline made of the preprocessing step followed by an
        ``AutoSklearnClassifier`` (when ``self.problem_type`` is
        ``"classification"``) or an ``AutoSklearnRegressor`` (otherwise),
        configured with ``self.automl_settings``.
    """
    categorical_list = infer_categoricals(self.X)
    # Read the text columns once, from the instance config. The previous
    # code referred to a bare ``data_config`` name in several places.
    text_columns = self.data_config.get("text_columns")

    preprocessing_steps = []
    if text_columns:
        # f-string so the column names are actually interpolated.
        print(f"Applying TFIDF to text columns: {text_columns}")
        preprocessing_steps.append(
            make_pipeline(
                ColumnSelector(cols=text_columns, drop_axis=True),
                TfidfVectorizer()))
        # Text columns are handled above; do not one-hot encode them too.
        categorical_list = [
            c for c in categorical_list if c not in text_columns
        ]

    if categorical_list:
        print(
            f"Applying One Hot Encoding to categorical columns: {categorical_list}"
        )
        preprocessing_steps.append(
            make_pipeline(ColumnSelector(cols=categorical_list),
                          OneHotEncoder(handle_unknown="impute")))

    if preprocessing_steps:
        preprocessing_steps = make_union(*preprocessing_steps)
        preprocessing_steps = make_pipeline(preprocessing_steps,
                                            SimpleImputer())
    else:
        preprocessing_steps = SimpleImputer()

    if self.problem_type == "classification":
        automl = AutoSklearnClassifier(**self.automl_settings)
    else:
        automl = AutoSklearnRegressor(**self.automl_settings)

    automl_pipeline = make_pipeline(preprocessing_steps, automl)
    return automl_pipeline
'accuracy': accuracy, 'balanced_accuracy': balanced_accuracy, 'roc_auc': roc_auc, 'logloss': log_loss, 'r2': r2, 'mean_squared_error': mean_squared_error, 'root_mean_squared_error': root_mean_squared_error, 'mean_absolute_error': mean_absolute_error, }[metric] automl_arguments['metric'] = metric if task_type == 'classification': automl = AutoSklearnClassifier(**automl_arguments) scorer_list = CLASSIFICATION_METRICS elif task_type == 'regression': automl = AutoSklearnRegressor(**automl_arguments) scorer_list = REGRESSION_METRICS else: raise ValueError(task_type) scoring_functions = [scorer for name, scorer in scorer_list.items()] automl.fit(X_train, y_train, dataset_name=dataset_name, feat_type=cat, X_test=X_test, y_test=y_test) trajectory = automl.trajectory_ incumbent_id_to_model = {}
# Configuration hints:
# - "auto vanilla": add ensemble_size=1 and
#   initial_configurations_via_metalearning=0.
# - specific estimator/preprocessor: add
#   include_estimators=["random_forest", ], exclude_estimators=None,
#   include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None.
# - classification variant: AutoSklearnClassifier(per_run_time_limit=300,
#   ml_memory_limit=1024 * 6, time_left_for_this_task=300,
#   resampling_strategy='cv', ensemble_size=1,
#   initial_configurations_via_metalearning=0,
#   resampling_strategy_arguments={'folds': 5}).

# ----- REGRESSION -----
search_settings = dict(
    per_run_time_limit=360,
    ml_memory_limit=1024 * 8,
    time_left_for_this_task=3600,
    resampling_strategy='cv',
    # ensemble_size=1,
    # initial_configurations_via_metalearning=0,
    resampling_strategy_arguments={'folds': 5},
)
automl = AutoSklearnRegressor(**search_settings)

start = time.time()
# Change dataset_name to match the dataset in use.
automl.fit(X_train, y_train, dataset_name='boston_housing')
automl.refit(X_train.copy(), y_train.copy())
elapsed = time.time() - start
print('[INFO] Elapsed time finding best model: {} seconds.'.format(elapsed))

predictions = automl.predict(X_test)
# No classification report here: this is a regression task.
# Fit an auto-sklearn regressor on the video game sales data and report
# hold-out error metrics.
address = './Dataset/Video_Game_Sales.csv'
dataframe = read_csv(address)
print(time.strftime("Start time is %Y-%m-%d %H:%M:%S", time.localtime()))

# split into input and output elements; the last column is the target
data = dataframe.values
data = data.astype('int')
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)

# define search
model = AutoSklearnRegressor(
    time_left_for_this_task=5 * 60,
    per_run_time_limit=30,
    n_jobs=8,
)

# perform the search
model.fit(X_train, y_train)

# evaluate best model
y_hat = model.predict(X_test)
mae = mean_absolute_error(y_test, y_hat)
r2Score = r2_score(y_test, y_hat)
mape = mean_absolute_percentage_error(y_test, y_hat)
mse = mean_squared_error(y_test, y_hat)
print(time.strftime("End time is %Y-%m-%d %H:%M:%S", time.localtime()))
print("MAE: %.3f" % mae)
print("R2_score: %.3f" % r2Score)
print("MAPE: %.3f" % mape)
# MSE was computed but never reported; print it with the other metrics.
print("MSE: %.3f" % mse)
523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 536, 563, 564, 565, 566, 567, 568, 572, 574, 607, 614, 616, 619, 620 ] # all zeroes, I think retiro_total = None retiro_final = None for grade in grades: print(grade) # load the school data # drop first column (school code) # last column is y-values if grade == '06': model = XGBRegressor() else: model = AutoSklearnRegressor(time_left_for_this_task=3000, per_run_time_limit=600) def predict(zscoreX, zscoreY): # print algo, accuracy, r-squared? print('with zScore on X: ' + str(zscoreX) + ' and Y: ' + str(zscoreY)) if zscoreX: x = np.copy(scaled_x) else: x = np.copy(original_x) if zscoreY: y = np.copy(scaled_y) else: y = np.copy(original_y)
# Configuration hints:
# - "auto vanilla": add ensemble_size=1 and
#   initial_configurations_via_metalearning=0.
# - specific estimator/preprocessor: add
#   include_estimators=["random_forest", ], exclude_estimators=None,
#   include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None.
# - classification variant: AutoSklearnClassifier(per_run_time_limit=300,
#   ml_memory_limit=1024 * 6, time_left_for_this_task=300,
#   resampling_strategy='cv', ensemble_size=1,
#   initial_configurations_via_metalearning=0,
#   resampling_strategy_arguments={'folds': 5}).

# ----- REGRESSION -----
search_settings = dict(
    per_run_time_limit=300,
    ml_memory_limit=1024 * 4,
    time_left_for_this_task=1800,
    resampling_strategy='cv',
    include_estimators=[
        "liblinear_svr",
    ],
    exclude_estimators=None,
    resampling_strategy_arguments={'folds': 5},
)
automl = AutoSklearnRegressor(**search_settings)

start = time.time()
# Change dataset_name to match the dataset in use.
automl.fit(X_train, y_train, dataset_name='boston_housing')
automl.refit(X_train.copy(), y_train.copy())
elapsed = time.time() - start
print('[INFO] Elapsed time finding best model: {} seconds.'.format(elapsed))

predictions = automl.predict(X_test)
def test_type_of_target(mock_estimator):
    """Check which target types the classifier and regressor accept in fit.

    Illegal target types must raise a ``ValueError`` with a matching
    message; legal target types must not raise ``ValueError``.
    """
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])

    # One sample target per possible target type.
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    # ---- Classifier ----
    cls = AutoSklearnClassifier(ensemble_size=0)
    cls.automl_ = unittest.mock.Mock()
    cls.automl_.InputValidator = unittest.mock.Mock()
    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal for classification: multiclass-multioutput, continuous,
    # continuous-multioutput.
    illegal_for_classifier = [
        (y_multiclass_multioutput,
         r".*Classification with data of type"
         " multiclass-multioutput is not supported.*"),
        (y_continuous,
         r".*Classification with data of type"
         " continuous is not supported.*"),
        (y_continuous_multioutput,
         r".*Classification with data of type"
         " continuous-multioutput is not supported.*"),
    ]
    for target, expected_msg in illegal_for_classifier:
        with pytest.raises(ValueError, match=expected_msg):
            cls.fit(X=X, y=target)

    # Legal for classification: binary, multiclass, multilabel-indicator.
    legal_for_classifier = [
        (y_binary,
         "cls.fit() raised ValueError while fitting "
         "binary targets"),
        (y_multiclass,
         "cls.fit() raised ValueError while fitting "
         "multiclass targets"),
        (y_multilabel,
         "cls.fit() raised ValueError while fitting "
         "multilabel-indicator targets"),
    ]
    for target, failure_msg in legal_for_classifier:
        try:
            cls.fit(X, target)
        except ValueError:
            pytest.fail(failure_msg)

    # ---- Regressor ----
    reg = AutoSklearnRegressor(ensemble_size=0)

    # Illegal for regression: multilabel-indicator, multiclass-multioutput.
    illegal_for_regressor = [
        (y_multilabel,
         r".*Regression with data of type"
         " multilabel-indicator is not supported.*"),
        (y_multiclass_multioutput,
         r".*Regression with data of type"
         " multiclass-multioutput is not supported.*"),
    ]
    for target, expected_msg in illegal_for_regressor:
        with pytest.raises(ValueError, match=expected_msg):
            reg.fit(
                X=X,
                y=target,
            )

    # Legal for regression: continuous, multiclass, continuous-multioutput,
    # binary.
    legal_for_regressor = [
        (y_continuous,
         "reg.fit() raised ValueError while fitting "
         "continuous targets"),
        (y_multiclass,
         "reg.fit() raised ValueError while fitting "
         "multiclass targets"),
        (y_continuous_multioutput,
         "reg.fit() raised ValueError while fitting "
         "continuous_multioutput targets"),
        (y_binary,
         "reg.fit() raised ValueError while fitting "
         "binary targets"),
    ]
    for target, failure_msg in legal_for_regressor:
        try:
            reg.fit(X, target)
        except ValueError:
            pytest.fail(failure_msg)
def test_type_of_target(self, mock_estimator):
    """Check which target types the classifier and regressor accept in fit.

    Illegal target types must raise a ``ValueError`` with a matching
    message; legal target types must not raise ``ValueError``.
    """
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])

    # One sample target per possible target type.
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    # ---- Classifier ----
    cls = AutoSklearnClassifier()

    # Illegal for classification: multiclass-multioutput, continuous,
    # continuous-multioutput.
    illegal_for_classifier = [
        (y_multiclass_multioutput,
         "classification with data of type"
         " multiclass-multioutput is not supported"),
        (y_continuous,
         "classification with data of type"
         " continuous is not supported"),
        (y_continuous_multioutput,
         "classification with data of type"
         " continuous-multioutput is not supported"),
    ]
    for target, expected_regex in illegal_for_classifier:
        self.assertRaisesRegex(
            ValueError,
            expected_regex,
            cls.fit,
            X=X,
            y=target,
        )

    # Legal for classification: binary, multiclass, multilabel-indicator.
    legal_for_classifier = [
        (y_binary,
         "cls.fit() raised ValueError while fitting "
         "binary targets"),
        (y_multiclass,
         "cls.fit() raised ValueError while fitting "
         "multiclass targets"),
        (y_multilabel,
         "cls.fit() raised ValueError while fitting "
         "multilabel-indicator targets"),
    ]
    for target, failure_msg in legal_for_classifier:
        try:
            cls.fit(X, target)
        except ValueError:
            self.fail(failure_msg)

    # ---- Regressor ----
    reg = AutoSklearnRegressor()

    # Illegal for regression: multiclass-multioutput, multilabel-indicator,
    # continuous-multioutput.
    illegal_for_regressor = [
        (y_multiclass_multioutput,
         "regression with data of type"
         " multiclass-multioutput is not supported"),
        (y_multilabel,
         "regression with data of type"
         " multilabel-indicator is not supported"),
        (y_continuous_multioutput,
         "regression with data of type"
         " continuous-multioutput is not supported"),
    ]
    for target, expected_regex in illegal_for_regressor:
        self.assertRaisesRegex(
            ValueError,
            expected_regex,
            reg.fit,
            X=X,
            y=target,
        )

    # Legal for regression: continuous, binary, multiclass.
    legal_for_regressor = [
        (y_continuous,
         "reg.fit() raised ValueError while fitting "
         "continuous targets"),
        (y_binary,
         "reg.fit() raised ValueError while fitting "
         "binary targets"),
        (y_multiclass,
         "reg.fit() raised ValueError while fitting "
         "multiclass targets"),
    ]
    for target, failure_msg in legal_for_regressor:
        try:
            reg.fit(X, target)
        except ValueError:
            self.fail(failure_msg)
# Load the dataset via the second entry of rgr_dataset.
X, y = rgr_dataset[1](return_X_y=True)

print('[INFO] Splitting.')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.8)

print(f'[INFO] Train shape: {X_train.shape}')
print(f'[INFO] Test shape: {X_test.shape}')

print('[INFO] Finding best model...')
# Configuration hints:
# - "auto vanilla": add ensemble_size=1 and
#   initial_configurations_via_metalearning=0.
# - classification variant: AutoSklearnClassifier(per_run_time_limit=300,
#   ml_memory_limit=1024 * 6, time_left_for_this_task=3600,
#   resampling_strategy='cv', resampling_strategy_arguments={'folds': 5}).

# ----- REGRESSION -----
search_settings = dict(
    per_run_time_limit=300,
    ml_memory_limit=1024 * 6,
    time_left_for_this_task=3600,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
automl = AutoSklearnRegressor(**search_settings)

start = time.time()
# Change dataset_name to match the dataset in use.
automl.fit(X_train, y_train, dataset_name='linnerud')
automl.refit(X_train.copy(), y_train.copy())
print(f'[INFO] Elapsed time finding best model: {time.time() - start} seconds.')

predictions = automl.predict(X_test)
# No classification report here: this is a regression task.

for section_title, section_body in (
        ('\n\n--- MODELS: ---', automl.show_models),
        ('\n\n--- STATISTICS: ---', automl.sprint_statistics)):
    print(section_title)
    print(section_body())
def task_executor(task_info):
    """Execute a training task and record its outcome on the Task row.

    The CSV at ``data_path`` is loaded, its last column is used as the
    target and all other columns as features, and a 70/30 train/test split
    is made. On Linux an auto-sklearn estimator is trained; on any other
    platform a 500-tree random forest is used. The test-set accuracy
    (classification) or mean squared error (otherwise) is stored as
    ``best_metrics``.

    :param task_info: detail of task, dict with the keys ``data_path``,
        ``time_max``, ``task_id`` and ``model_type``
    :raises ValueError: if ``model_type`` is neither ``"Classification"``
        nor ``"Regression"``
    """
    data_path = task_info.get("data_path")
    time_max = task_info.get("time_max")
    task_id = task_info.get("task_id")
    model_type = task_info.get("model_type")

    LOG.info("Load data, path=%s", data_path)
    status = "done"
    try:
        data_set = pd.read_csv(data_path)
        feature_columns = data_set.columns[:-1]
        target_column = data_set.columns[-1]
        x_set = data_set[feature_columns]
        y_set = data_set[target_column]
        x_train, x_test, y_train, y_test = train_test_split(
            x_set, y_set, test_size=0.3, random_state=0)

        LOG.info("start optimizer.")
        on_linux = platform.system() == "Linux"
        if on_linux:
            from autosklearn.classification import AutoSklearnClassifier
            from autosklearn.regression import AutoSklearnRegressor
            classifier_cls = AutoSklearnClassifier
            regressor_cls = AutoSklearnRegressor
        else:
            from sklearn.ensemble import RandomForestClassifier, \
                RandomForestRegressor
            classifier_cls = RandomForestClassifier
            regressor_cls = RandomForestRegressor

        if model_type == "Classification":
            estimator_cls = classifier_cls
        elif model_type == "Regression":
            estimator_cls = regressor_cls
        else:
            LOG.error("not support model type=%s", model_type)
            raise ValueError("not support model type")

        if on_linux:
            model = estimator_cls(
                time_left_for_this_task=time_max + 5,
                per_run_time_limit=int(time_max / 10),
                include_preprocessors=["no_preprocessing"],
            )
        else:
            model = estimator_cls(n_estimators=500)

        model.fit(x_train, y_train)
        prediction = model.predict(x_test)

        if model_type == "Classification":
            best_metrics = accuracy_score(y_test, prediction)
            LOG.info("The accuracy is %s", best_metrics)
        else:
            best_metrics = mean_squared_error(y_test, prediction)
            LOG.info("The mse is %s", best_metrics)
    except ServerException as server_error:
        LOG.error("Some thing wrong, reason=%s", server_error)
        best_metrics = 0
        status = "failed"

    # Persist the outcome on the matching task row(s).
    Task.objects.filter(task_id=task_id).update(
        end_time=int(time.time()),
        best_metrics=best_metrics,
        status=status,
    )
def get_object_cols(): fin = [] for colname, type_col in df.dtypes.iteritems(): if type_col == "object": fin.append(colname) throw_num_unique_warning(colname, df[colname]) return fin if len(set(save_cols).intersection(set(cols))) > 0: raise Exception( "The arguments cols and save_cols should have no columns in common" ) saved_df = df[save_cols] df = df[cols] if str_action == 'dummies': df = pd.get_dummies(df, drop_first=True, prefix=get_object_cols()) return df, saved_df res = pd.read_csv("nbastats2018-2019.csv") res = res[res["Salary"] != "-"] res["Salary"] = res["Salary"].astype('int64') colnames = [ elem for elem in res.columns if elem != "Name" and elem != "Salary" ] model = Model(res, colnames, "Salary", preprocess_y=np.log) regressor = AutoSklearnRegressor(time_left_for_this_task = 420, per_run_time_limit = 60)\ .fit(model.Xtrain, model.Ytrain.flatten(), metric = metrics.mean_squared_error) print('finished') IPython.embed()