Beispiel #1
0
    def param_search(self, x, y, time_per_sample=3.5, **kwargs):
        """Search for a model with auto-sklearn, then score it with 10-fold CV.

        The search budget is ``int(len(y) * time_per_sample)`` and is passed
        as ``time_left_for_this_task``. After the search, the found model is
        refitted on each training split of a freshly shuffled 10-fold split
        and scored on the corresponding held-out split.

        Args:
            x: Feature matrix; indexed with integer index arrays.
            y: Target values; indexed with integer index arrays.
            time_per_sample: Search budget per sample.
            **kwargs: Accepted but not used by this method.

        Returns:
            Whatever ``self.concat_results`` builds from the mean RMSE,
            mean MAE and mean R2 over the ten folds.
        """
        search_budget = int(len(y) * time_per_sample)
        self.m = AutoSklearnRegressor(
            time_left_for_this_task=search_budget,
            resampling_strategy="cv",
            resampling_strategy_arguments={'folds': 10},
        )
        self.m.fit(
            x,
            y,
            metric=mean_squared_error,
            dataset_name="Land Use Regression",
        )

        # Independent evaluation: the split is shuffled without a fixed seed.
        fold_scores = {"rmse": [], "mae": [], "r2": []}
        splitter = KFold(n_splits=10, shuffle=True)
        for fit_idx, eval_idx in splitter.split(x, y):
            x_fit, x_eval = x[fit_idx], x[eval_idx]
            y_fit, y_eval = y[fit_idx], y[eval_idx]
            self.m.refit(x_fit, y_fit)
            fold_predictions = self.m.predict(x_eval)
            fold_rmse, fold_mae, fold_r2 = self.score_function(
                y_eval, fold_predictions)
            fold_scores["rmse"].append(fold_rmse)
            fold_scores["mae"].append(fold_mae)
            fold_scores["r2"].append(fold_r2)

        return self.concat_results(
            np.mean(fold_scores["rmse"]),
            np.mean(fold_scores["mae"]),
            np.mean(fold_scores["r2"]),
        )
Beispiel #2
0
 def __init__(self, **kwargs) -> None:
     """Create the wrapped AutoSklearnRegressor with its own dask client.

     Args:
         **kwargs: Forwarded unchanged to ``AutoSklearnRegressor``. Must
             contain ``'n_jobs'``, which also sets the number of dask
             workers.

     Raises:
         KeyError: If ``'n_jobs'`` is missing from ``kwargs``.
     """
     Ensemble.__init__(self)
     # In-process workers with a single thread each and no dashboard.
     # The keyword is ``threads_per_worker``; the previous spelling
     # ``thread_per_worker`` is not a dask option.
     client = Client(processes=False,
                     n_workers=kwargs['n_jobs'],
                     threads_per_worker=1,
                     dashboard_address=None)
     self.model = AutoSklearnRegressor(**kwargs, dask_client=client)
Beispiel #3
0
    def test_cv_regression(self):
        """Check that a regressor can be fitted with the 'cv' strategy.

        Fits on the 'boston' dataset inside dedicated tmp/output folders,
        then checks the prediction shape and a lower bound on the score.
        """
        tmp, output = (
            os.path.join(self.test_dir, '..', folder_name)
            for folder_name in ('.tmp_regression_fit', '.out_regression_fit')
        )
        for folder in (tmp, output):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        regressor = AutoSklearnRegressor(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            resampling_strategy='cv',
            tmp_folder=tmp,
            output_folder=output,
        )
        regressor.fit(X_train, Y_train)

        predicted = regressor.predict(X_test)
        self.assertEqual(predicted.shape, (356,))

        # On average np.sqrt(30) away from the target -> ~5.5 on average.
        # With the 30 second budget, select rates push the average score
        # down to roughly -32.40 .. -37; a larger time_left_for_this_task
        # makes this a non-issue.
        self.assertGreaterEqual(mean_squared_error(Y_test, predicted), -37)

        for folder in (tmp, output):
            self._tearDown(folder)
    def test_cv_regression(self):
        """Check that a regressor can be fitted with the 'cv' strategy.

        Uses at most 300 training samples of the 'boston' dataset, then
        checks the prediction shape, a minimum r2 score and that at least
        one run in ``cv_results_`` is counted as successful.
        """
        tmp, output = (
            os.path.join(self.test_dir, '..', folder_name)
            for folder_name in ('.tmp_regression_fit_cv',
                                '.out_regression_fit_cv')
        )
        for folder in (tmp, output):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'boston', train_size_maximum=300)
        regressor = AutoSklearnRegressor(
            time_left_for_this_task=60,
            per_run_time_limit=10,
            resampling_strategy='cv',
            tmp_folder=tmp,
            output_folder=output,
        )
        regressor.fit(X_train, Y_train)

        predicted = regressor.predict(X_test)
        self.assertEqual(predicted.shape, (206, ))

        r2_score_value = r2(Y_test, predicted)
        # Dump targets and predictions to ease debugging of a failing run.
        print(Y_test)
        print(predicted)
        self.assertGreaterEqual(r2_score_value, 0.1)
        self.assertGreater(self._count_succeses(regressor.cv_results_), 0)

        for folder in (tmp, output):
            self._tearDown(folder)
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """Check that a regressor can be fitted with the 'cv' strategy.

    On failure, the assertion message is extracted from the first
    ``AutoML*.log`` file found in ``tmp_dir``.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        dask_client=dask_client,
    )
    regressor.fit(X_train, Y_train)

    # The log written during fit provides the failure diagnostics.
    log_candidates = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))
    log_file_path = log_candidates[0]

    predicted = regressor.predict(X_test)
    assert predicted.shape == (206, )

    r2_score_value = r2(Y_test, predicted)
    assert r2_score_value >= 0.1, extract_msg_from_log(log_file_path)

    n_successes = count_succeses(regressor.cv_results_)
    assert n_successes > 0, extract_msg_from_log(log_file_path)
Beispiel #6
0
    def spawn_regressor(
            seed,
            time,
            search_space,
            prep_space,
            metric,
            dataset_name=None):
        """Run one auto-sklearn regression search in shared mode.

        auto-sklearn does not spawn worker processes itself; this function
        is meant to be started several times, each call being one process
        running one auto-sklearn instance. It reads ``tmp_folder``,
        ``output_folder``, ``X_train``, ``y_train``, ``X_test`` and
        ``y_test`` from the enclosing scope.

        Args:
            seed: Seed of this run; must differ between parallel runs.
            time: Seconds this run's fit process may take.
            search_space: Value for ``include_estimators``.
            prep_space: Value for ``include_preprocessors``.
            metric: Metric handed to ``fit``.
            dataset_name: Optional dataset name handed to ``fit``.

        Returns:
            The ``cv_results_`` of the fitted regressor.
        """
        # Only the seed-0 run starts from the meta-learning configurations,
        # so parallel runs do not all evaluate the same configurations.
        uses_metalearning = seed == 0
        metalearning_configs = 25 if uses_metalearning else 0
        scenario_args = (
            {} if uses_metalearning else {'initial_incumbent': 'RANDOM'}
        )

        # What differs from a standalone auto-sklearn run:
        # 1. all runs write to the same folders,
        # 2. shared_mode=True lets the runs share data,
        # 3. every run gets its own seed,
        # 4. ensemble_size=0: ensembles are built once all runs finished.
        regressor_settings = dict(
            time_left_for_this_task=time,
            per_run_time_limit=15,  # sec., a model is killed after this
            ml_memory_limit=1024,  # MB, limit per call to a ML algorithm
            shared_mode=True,
            tmp_folder=tmp_folder,
            output_folder=output_folder,
            delete_tmp_folder_after_terminate=False,
            ensemble_size=0,
            include_estimators=search_space,
            exclude_estimators=None,
            include_preprocessors=prep_space,
            exclude_preprocessors=None,
            initial_configurations_via_metalearning=metalearning_configs,
            seed=seed,
            smac_scenario_args=scenario_args,
        )
        automl = AutoSklearnRegressor(**regressor_settings)
        automl.fit(
            X_train,
            y_train,
            X_test=X_test,
            y_test=y_test,
            metric=metric,
            dataset_name=dataset_name,
        )
        return automl.cv_results_
    def model_init(self, model_params: Dict[str, Any]) -> None:
        """Initialize the model object according to the task type.

        ``self.config['mode']`` selects the estimator: 'classification'
        builds an AutoSklearnClassifier, 'regression' an
        AutoSklearnRegressor. Any other mode leaves ``self.model``
        untouched.

        Args:
            model_params: Dictionary of model parameters, passed to the
                estimator constructor as keyword arguments.
        """
        mode = self.config['mode']
        if mode == 'classification':
            self.model = AutoSklearnClassifier(**model_params)
            return
        if mode == 'regression':
            self.model = AutoSklearnRegressor(**model_params)
    def test_regression_methods_returns_self(self):
        """fit, fit_ensemble and refit must each return the estimator itself."""
        X_train, y_train, X_test, y_test = putil.get_dataset('boston')
        regressor = AutoSklearnRegressor(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            ensemble_size=0,
        )

        self.assertIs(regressor, regressor.fit(X_train, y_train))

        self.assertIs(
            regressor,
            regressor.fit_ensemble(y_train, ensemble_size=5),
        )

        self.assertIs(
            regressor,
            regressor.refit(X_train.copy(), y_train.copy()),
        )
Beispiel #9
0
    def test_regression_pandas_support(self):
        """Fit, score, refit and predict directly on pandas input."""
        X, y = sklearn.datasets.fetch_openml(
            data_id=41514,  # diabetes
            return_X_y=True,
            as_frame=True,
        )
        # The test is only meaningful when the input really is pandas data.
        self.assertTrue(isinstance(X, pd.DataFrame))
        self.assertTrue(isinstance(y, pd.Series))

        regressor = AutoSklearnRegressor(
            per_run_time_limit=5,
            time_left_for_this_task=30,
        )

        # Fitting on the DataFrame / Series is expected to succeed.
        regressor.fit(X, y)

        # Must beat random. Scoring on the training data is intentional:
        # this exercises the code path, not generalization.
        in_sample_score = regressor.score(X, y)
        self.assertTrue(in_sample_score > 0.5)

        regressor.refit(X, y)

        # After the refit the model must still beat random.
        refit_r2 = r2(y, regressor.predict(X))
        self.assertTrue(refit_r2 > 0.5)
Beispiel #10
0
    def test_regression_methods_returns_self(self):
        """fit, fit_ensemble and refit must each return the estimator itself."""
        X_train, y_train, X_test, y_test = putil.get_dataset('boston')
        regressor = AutoSklearnRegressor(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            ensemble_size=0,
        )

        fit_result = regressor.fit(X_train, y_train)
        self.assertIs(regressor, fit_result)

        ensemble_result = regressor.fit_ensemble(y_train, ensemble_size=5)
        self.assertIs(regressor, ensemble_result)

        refit_result = regressor.refit(X_train.copy(), y_train.copy())
        self.assertIs(regressor, refit_result)
Beispiel #11
0
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score, refit and predict directly on pandas input."""
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )
    # The test is only meaningful when the input really is pandas data.
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        dask_client=dask_client,
    )

    # Fitting on the DataFrame / Series is expected to succeed.
    regressor.fit(X, y)

    # Must beat random. Scoring on the training data is intentional:
    # this exercises the code path, not generalization.
    in_sample_score = regressor.score(X, y)
    assert in_sample_score >= 0.5, print_debug_information(regressor)

    regressor.refit(X, y)

    # After the refit the model must still beat random.
    refit_r2 = r2(y, regressor.predict(X))
    assert refit_r2 > 0.5, print_debug_information(regressor)

    n_successes = count_succeses(regressor.cv_results_)
    assert n_successes > 0, print_debug_information(regressor)
Beispiel #12
0
def test_autosklearn_regression_methods_returns_self(dask_client):
    """fit, fit_ensemble and refit must each return the estimator itself."""
    X_train, y_train, X_test, y_test = putil.get_dataset('boston')
    regressor = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        ensemble_size=0,
        dask_client=dask_client,
    )

    assert regressor is regressor.fit(X_train, y_train)

    assert regressor is regressor.fit_ensemble(y_train, ensemble_size=5)

    assert regressor is regressor.refit(X_train.copy(), y_train.copy())
def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train auto-sklearn on the training data and predict on the test data.

    Args:
        X_train: Training features (copied before every fit/refit).
        X_test: Features to predict on.
        y_train: Training targets (copied before every fit/refit).
        df_types: Table with ``NAME`` and ``TYPE`` columns; the ``TYPE``
            values of all rows except 'target' are passed as ``feat_type``.
        m_type: 'classification' selects the classifier with the
            f1_weighted metric; anything else selects the regressor with
            mean_squared_error.
        seed: Seed handed to the auto-sklearn estimator.
        *args: Accepted but not used.

    Returns:
        ``predict_proba(X_test)`` for classification, otherwise
        ``predict(X_test)``.
    """
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    feature_types = df_types[df_types.NAME != 'target']['TYPE'].values.ravel()

    # Both estimators take the same settings; only class and metric differ.
    if m_type == 'classification':
        estimator_cls = AutoSklearnClassifier
    else:
        estimator_cls = AutoSklearnRegressor

    automl = estimator_cls(
        time_left_for_this_task=TIME_PER_TASK,
        per_run_time_limit=int(TIME_PER_TASK/8),
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        delete_tmp_folder_after_terminate=False,
    )

    if m_type == 'classification':
        fit_metric = f1_weighted
    else:
        fit_metric = mean_squared_error

    automl.fit(
        X_train.copy(),
        y_train.copy(),
        feat_type=feature_types,
        metric=fit_metric,
    )
    # Refit on the full training data after the cross-validated search.
    automl.refit(X_train.copy(), y_train.copy())

    if m_type == 'classification':
        return automl.predict_proba(X_test)
    return automl.predict(X_test)
Beispiel #14
0
class AutoSklearnRegressorEnsemble(AutoSklearnModel, Ensemble):
    """
    Wrapper around an autosklearn regressor, stored in ``self.model``.
    """
    _kind: ModelType = 'regressor'

    def __init__(self, **kwargs) -> None:
        """Create the wrapped AutoSklearnRegressor with its own dask client.

        Args:
            **kwargs: Forwarded unchanged to ``AutoSklearnRegressor``. Must
                contain ``'n_jobs'``, which also sets the number of dask
                workers.

        Raises:
            KeyError: If ``'n_jobs'`` is missing from ``kwargs``.
        """
        Ensemble.__init__(self)
        # In-process workers with a single thread each and no dashboard.
        # The keyword is ``threads_per_worker``; the previous spelling
        # ``thread_per_worker`` is not a dask option.
        client = Client(processes=False,
                        n_workers=kwargs['n_jobs'],
                        threads_per_worker=1,
                        dashboard_address=None)
        self.model = AutoSklearnRegressor(**kwargs, dask_client=client)

    def autosklearn_model(self) -> AutoSklearnRegressor:
        """ Get the wrapped autosklearn regressor """
        return self.model

    def predict(self, X: np.ndarray) -> np.ndarray:
        """ Get the wrapped model's prediction for X """
        return self.model.predict(X)

    def model_predictions(self, X: np.ndarray) -> np.ndarray:
        """ Get the prediction of every model in ``self.models()``, one row each """
        return np.asarray([m.predict(X) for m in self.models()])

    @classmethod
    def kind(cls) -> ModelType:
        """ Get the kind of model this wrapper represents """
        return cls._kind
Beispiel #15
0
    def test_regression(self):
        """Fit a regressor on 'boston' and check prediction shape and score.

        A single folder serves as both tmp and output folder.
        """
        shared_folder = os.path.join(
            self.test_dir, '..', '.tmp_regression_fit')
        self._setUp(shared_folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        regressor = AutoSklearnRegressor(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            tmp_folder=shared_folder,
            output_folder=shared_folder,
        )
        regressor.fit(X_train, Y_train)

        predicted = regressor.predict(X_test)
        self.assertEqual(predicted.shape, (356, ))

        # On average np.sqrt(30) away from the target -> ~5.5 on average
        self.assertGreaterEqual(mean_squared_error(Y_test, predicted), -30)
Beispiel #16
0
def train_autosklearn(l=None):
    """Fit and refit an AutoSklearnRegressor with 5-fold CV on the training data.

    Args:
        l: Data holder exposing ``X_train`` and ``y_train``, each with a
            ``.values`` attribute (presumably pandas objects; verify
            against ``get_data``). Loaded via ``get_data()`` when None.

    Returns:
        The result of ``attributedict_from_locals('model')``; presumably
        a dict-like object exposing the fitted ``model``. Confirm against
        that helper.
    """
    if l is None:
        l = get_data()
    ensemble_size = 1  # 50 ... 1 for vanilla
    initial_configurations_via_metalearning = 0  # 25 ... 0 for vanilla
    model = AutoSklearnRegressor(
        delete_output_folder_after_terminate=True,
        delete_tmp_folder_after_terminate=True,
        disable_evaluator_output=False,
        ensemble_nbest=50,
        ensemble_size=ensemble_size,
        exclude_estimators=None,
        exclude_preprocessors=None,
        get_smac_object_callback=None,
        include_estimators=None,
        include_preprocessors=None,
        initial_configurations_via_metalearning=
        initial_configurations_via_metalearning,
        logging_config=None,
        ml_memory_limit=3072,
        output_folder=None,
        per_run_time_limit=360,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        # resampling_strategy='holdout',
        # resampling_strategy_arguments=None,
        seed=1,
        shared_mode=False,
        smac_scenario_args=None,
        time_left_for_this_task=3600,
        tmp_folder=None)
    # Copies are passed so that fit/refit cannot modify the caller's data.
    model.fit(l.X_train.values.copy(), l.y_train.values.squeeze().copy())
    # Refit on the full training data after the cross-validated search.
    model.refit(l.X_train.values.copy(), l.y_train.values.squeeze().copy())
    print(model.show_models())
    # NOTE(review): the helper is given the name 'model', so the local
    # variable presumably has to keep exactly that name - confirm.
    return attributedict_from_locals('model')
    def test_regression(self):
        """Fit a regressor on 'boston' and check prediction shape and score."""
        tmp, output = (
            os.path.join(self.test_dir, '..', folder_name)
            for folder_name in ('.tmp_regression_fit', '.out_regression_fit')
        )
        for folder in (tmp, output):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        regressor = AutoSklearnRegressor(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            tmp_folder=tmp,
            output_folder=output,
        )
        regressor.fit(X_train, Y_train)

        predicted = regressor.predict(X_test)
        self.assertEqual(predicted.shape, (356,))

        # On average np.sqrt(30) away from the target -> ~5.5 on average
        self.assertGreaterEqual(mean_squared_error(Y_test, predicted), -30)
Beispiel #18
0
def train_regression():
    """Train an AutoSklearnRegressor on the c99temp data and pickle it.

    The model is written to ``AUTO_ML_MODELS_PATH`` under a file name that
    embeds the current timestamp.
    """
    timestamp = str(datetime.datetime.now())
    dump_file = os.path.join(
        AUTO_ML_MODELS_PATH,
        'auto_sklearn_regressor' + timestamp + '.dump',
    )

    feature_frame, outcome_frame, _ = file_loader('c99temp_train.snappy.csv')
    feature_matrix = feature_frame.values
    slave_temperatures = outcome_frame['tempBoardSLAVE'].values

    model = AutoSklearnRegressor(
        per_run_time_limit=600,
        time_left_for_this_task=3600,
    )
    model.fit(feature_matrix, slave_temperatures)

    with open(dump_file, 'wb') as dump_handle:
        pickle.dump(model, dump_handle)
Beispiel #19
0
def test_regression(tmp_dir, output_dir, dask_client):
    """Fit a regressor on 'boston' and check shape, score and run success."""
    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        dask_client=dask_client,
    )
    regressor.fit(X_train, Y_train)

    predicted = regressor.predict(X_test)
    assert predicted.shape == (356, )

    mse_score = mean_squared_error(Y_test, predicted)

    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    # With the 30 second budget, select rates push the average score
    # down to roughly -32.40 .. -37; a larger time_left_for_this_task
    # makes this a non-issue.
    assert mse_score >= -37, print_debug_information(regressor)

    n_successes = count_succeses(regressor.cv_results_)
    assert n_successes > 0
Beispiel #20
0
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     """Plain lists given to fit/refit/fit_ensemble must arrive as ndarrays.

     ``fit``, ``refit`` and ``fit_ensemble`` are call-recording stand-ins
     supplied to the test; each must be called exactly once with
     numpy arrays as positional arguments.
     """
     features = [[1], [2], [3]]
     targets = [1, 2, 3]
     regressor = AutoSklearnRegressor()

     regressor.fit(features, targets)
     self.assertEqual(fit.call_count, 1)
     for position in (0, 1):
         self.assertIsInstance(fit.call_args[0][position], np.ndarray)

     regressor.refit(features, targets)
     self.assertEqual(refit.call_count, 1)
     for position in (0, 1):
         self.assertIsInstance(refit.call_args[0][position], np.ndarray)

     regressor.fit_ensemble(targets)
     self.assertEqual(fit_ensemble.call_count, 1)
     self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
Beispiel #21
0
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """Check that a regressor can be fitted with the 'cv' strategy.

    Uses at most 300 training samples of the 'boston' dataset, then
    checks the prediction shape, a minimum r2 score and that at least
    one run in ``cv_results_`` is counted as successful.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        dask_client=dask_client,
    )
    regressor.fit(X_train, Y_train)

    predicted = regressor.predict(X_test)
    assert predicted.shape == (206, )

    r2_score_value = r2(Y_test, predicted)
    assert r2_score_value >= 0.1, print_debug_information(regressor)

    n_successes = count_succeses(regressor.cv_results_)
    assert n_successes > 0, print_debug_information(regressor)
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     """Plain lists given to fit/refit/fit_ensemble must arrive as ndarrays.

     ``fit``, ``refit`` and ``fit_ensemble`` are call-recording stand-ins
     supplied to the test; each must be called exactly once with
     numpy arrays as positional arguments.
     """
     features, targets = [[1], [2], [3]], [1, 2, 3]
     regressor = AutoSklearnRegressor()

     # fit: both X and y are converted.
     regressor.fit(features, targets)
     self.assertEqual(fit.call_count, 1)
     fit_X, fit_y = fit.call_args[0][0], fit.call_args[0][1]
     self.assertIsInstance(fit_X, np.ndarray)
     self.assertIsInstance(fit_y, np.ndarray)

     # refit: both X and y are converted.
     regressor.refit(features, targets)
     self.assertEqual(refit.call_count, 1)
     refit_X, refit_y = refit.call_args[0][0], refit.call_args[0][1]
     self.assertIsInstance(refit_X, np.ndarray)
     self.assertIsInstance(refit_y, np.ndarray)

     # fit_ensemble: only y is passed and converted.
     regressor.fit_ensemble(targets)
     self.assertEqual(fit_ensemble.call_count, 1)
     self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
Beispiel #23
0
 def build_pipeline(self):
     """
     Makes a pipeline based on data_config
     This is because autosklearn does not perform automatic data encoding

     Text columns listed under ``self.data_config["text_columns"]`` get a
     TF-IDF step; the remaining inferred categorical columns get a one-hot
     encoding step. The encoded parts are joined and imputed, then fed
     into an AutoSklearnClassifier or AutoSklearnRegressor depending on
     ``self.problem_type``.

     Returns:
         The pipeline of preprocessing followed by the auto-sklearn
         estimator.
     """
     categorical_list = infer_categoricals(self.X)
     # Read the setting from the instance config once. The original mixed
     # ``self.data_config`` with a bare ``data_config`` name.
     text_columns = self.data_config.get("text_columns")
     preprocessing_steps = []
     if text_columns:
         # f-string so that the column names are actually interpolated.
         print(f"Applying TFIDF to text columns: {text_columns}")
         preprocessing_steps.append(
             make_pipeline(
                 ColumnSelector(cols=text_columns, drop_axis=True),
                 TfidfVectorizer()))
         # Text columns are handled by TF-IDF, not by one-hot encoding.
         categorical_list = [
             c for c in categorical_list if c not in text_columns
         ]
     if categorical_list:
         print(
             f"Applying One Hot Encoding to categorical columns: {categorical_list}"
         )
         # NOTE(review): "impute" is not a handle_unknown option of
         # scikit-learn's OneHotEncoder; presumably this encoder comes
         # from another package - confirm against the file's imports.
         preprocessing_steps.append(
             make_pipeline(ColumnSelector(cols=categorical_list),
                           OneHotEncoder(handle_unknown="impute")))
     if preprocessing_steps:
         preprocessing_steps = make_union(*preprocessing_steps)
         preprocessing_steps = make_pipeline(preprocessing_steps,
                                             SimpleImputer())
     else:
         # Nothing to encode: only impute missing values.
         preprocessing_steps = SimpleImputer()
     if self.problem_type == "classification":
         automl = AutoSklearnClassifier(**self.automl_settings)
     else:
         automl = AutoSklearnRegressor(**self.automl_settings)
     automl_pipeline = make_pipeline(preprocessing_steps, automl)
     return automl_pipeline
Beispiel #24
0
        'accuracy': accuracy,
        'balanced_accuracy': balanced_accuracy,
        'roc_auc': roc_auc,
        'logloss': log_loss,
        'r2': r2,
        'mean_squared_error': mean_squared_error,
        'root_mean_squared_error': root_mean_squared_error,
        'mean_absolute_error': mean_absolute_error,
    }[metric]
    automl_arguments['metric'] = metric

    # Pick the estimator class and the matching metric table for the task;
    # any other task type is rejected.
    if task_type == 'classification':
        automl = AutoSklearnClassifier(**automl_arguments)
        scorer_list = CLASSIFICATION_METRICS
    elif task_type == 'regression':
        automl = AutoSklearnRegressor(**automl_arguments)
        scorer_list = REGRESSION_METRICS
    else:
        raise ValueError(task_type)

    # Only the scorer objects are kept; the metric names are dropped.
    scoring_functions = [scorer for name, scorer in scorer_list.items()]

    # The test split is handed to fit alongside the training data.
    automl.fit(X_train,
               y_train,
               dataset_name=dataset_name,
               feat_type=cat,
               X_test=X_test,
               y_test=y_test)
    trajectory = automl.trajectory_

    incumbent_id_to_model = {}
Beispiel #25
0
# Variants:
#   "vanilla" auto-sklearn: add ensemble_size=1 and
#       initial_configurations_via_metalearning=0
#   restrict to specific estimators / preprocessors:
#       include_estimators=["random_forest", ], exclude_estimators=None,
#       include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None

# ----- CLASSIFIER (kept for reference) -----
# automl = AutoSklearnClassifier(per_run_time_limit=300, ml_memory_limit=1024 * 6,
#                                time_left_for_this_task=300, resampling_strategy='cv',
#                                ensemble_size=1, initial_configurations_via_metalearning=0,
#                                resampling_strategy_arguments={'folds': 5})

# ----- REGRESSION -----
automl = AutoSklearnRegressor(
    time_left_for_this_task=3600,
    per_run_time_limit=360,
    ml_memory_limit=1024 * 8,
    # ensemble_size=1,
    # initial_configurations_via_metalearning=0,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
start = time.time()

# X_train = X_train.astype('float')  # when?
# Change dataset_name according to the dataset in use.
automl.fit(X_train, y_train, dataset_name='boston_housing')
# Refit on the full training data after the cross-validated search.
automl.refit(X_train.copy(), y_train.copy())
print('[INFO] Elapsed time finding best model: {} seconds.'.format(
    time.time() - start))

predictions = automl.predict(X_test)
# print('--- CLASSIFICATION REPORT: ---')  # not for regression
Beispiel #26
0
 # Load the video game sales data, run an auto-sklearn regression search
 # on a train split and report error metrics on the held-out split.
 address = './Dataset/Video_Game_Sales.csv'
 dataframe = read_csv(address)
 print(time.strftime("Start time is %Y-%m-%d %H:%M:%S", time.localtime()))
 # split into input and output elements
 data = dataframe.values
 # NOTE(review): the cast to int drops any fractional part of the values -
 # confirm this is intended for this dataset.
 data = data.astype('int')
 # the last column is the target, all others are features
 X, y = data[:, :-1], data[:, -1]
 print(X.shape, y.shape)
 # split into train and test sets
 X_train, X_test, y_train, y_test = train_test_split(X,
                                                     y,
                                                     test_size=0.33,
                                                     random_state=1)
 # define search
 model = AutoSklearnRegressor(time_left_for_this_task=5 * 60,
                              per_run_time_limit=30,
                              n_jobs=8)
 # perform the search
 model.fit(X_train, y_train)
 # summarize
 # print(model.sprint_statistics())
 # evaluate best model
 y_hat = model.predict(X_test)
 mae = mean_absolute_error(y_test, y_hat)
 r2Score = r2_score(y_test, y_hat)
 mape = mean_absolute_percentage_error(y_test, y_hat)
 # mse is computed here but not printed within the lines shown below
 mse = mean_squared_error(y_test, y_hat)
 print(time.strftime("End time is %Y-%m-%d %H:%M:%S", time.localtime()))
 print("MAE: %.3f" % mae)
 print("R2_score: %.3f" % r2Score)
 print("MAPE: %.3f" % mape)
    523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 536, 563, 564, 565, 566,
    567, 568, 572, 574, 607, 614, 616, 619, 620
]  # all zeroes, I think
retiro_total = None
retiro_final = None

for grade in grades:
    print(grade)
    # load the school data
    # drop first column (school code)
    # last column is y-values

    if grade == '06':
        model = XGBRegressor()
    else:
        model = AutoSklearnRegressor(time_left_for_this_task=3000,
                                     per_run_time_limit=600)

    def predict(zscoreX, zscoreY):
        # print algo, accuracy, r-squared?
        print('with zScore on X: ' + str(zscoreX) + ' and Y: ' + str(zscoreY))

        if zscoreX:
            x = np.copy(scaled_x)
        else:
            x = np.copy(original_x)

        if zscoreY:
            y = np.copy(scaled_y)
        else:
            y = np.copy(original_y)
Beispiel #28
0
# Variants:
#   "vanilla" auto-sklearn: add ensemble_size=1 and
#       initial_configurations_via_metalearning=0
#   restrict to specific estimators / preprocessors:
#       include_estimators=["random_forest", ], exclude_estimators=None,
#       include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None

# ----- CLASSIFIER (kept for reference) -----
# automl = AutoSklearnClassifier(per_run_time_limit=300, ml_memory_limit=1024 * 6,
#                                time_left_for_this_task=300, resampling_strategy='cv',
#                                ensemble_size=1, initial_configurations_via_metalearning=0,
#                                resampling_strategy_arguments={'folds': 5})

# ----- REGRESSION -----
# The search is restricted to the liblinear_svr estimator.
automl = AutoSklearnRegressor(
    time_left_for_this_task=1800,
    per_run_time_limit=300,
    ml_memory_limit=1024 * 4,
    include_estimators=["liblinear_svr"],
    exclude_estimators=None,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
start = time.time()

# X_train = X_train.astype('float')  # when?
# Change dataset_name according to the dataset in use.
automl.fit(X_train, y_train, dataset_name='boston_housing')
# Refit on the full training data after the cross-validated search.
automl.refit(X_train.copy(), y_train.copy())
print('[INFO] Elapsed time finding best model: {} seconds.'.format(
    time.time() - start))

predictions = automl.predict(X_test)
Beispiel #29
0
def test_type_of_target(mock_estimator):
    """Check which target types the classifier and regressor accept in fit().

    For every illegal target type, ``fit`` must raise a ``ValueError`` whose
    message names that exact type; for every legal target type, ``fit`` must
    not raise ``ValueError``.

    Args:
        mock_estimator: not used in the body; presumably injected by a patch
            decorator or fixture outside this snippet -- TODO confirm.
    """
    # Test that classifier raises error for illegal target types.
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])
    # Possible target types
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    cls = AutoSklearnClassifier(ensemble_size=0)
    cls.automl_ = unittest.mock.Mock()
    cls.automl_.InputValidator = unittest.mock.Mock()
    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal target types for classification: continuous,
    # multiclass-multioutput, continuous-multioutput.
    #
    # The patterns below are parenthesised on purpose: two adjacent string
    # literals on separate lines are only concatenated inside brackets.
    # Without the parentheses the second literal is a stand-alone no-op
    # statement and the pattern would stop right after "data of type",
    # matching the error for *any* unsupported target type.
    expected_msg = (r".*Classification with data of type"
                    r" multiclass-multioutput is not supported.*")
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_multiclass_multioutput)

    expected_msg = (r".*Classification with data of type"
                    r" continuous is not supported.*")
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous)

    expected_msg = (r".*Classification with data of type"
                    r" continuous-multioutput is not supported.*")
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous_multioutput)

    # Legal target types for classification: binary, multiclass,
    # multilabel-indicator.
    try:
        cls.fit(X, y_binary)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "binary targets")

    try:
        cls.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        cls.fit(X, y_multilabel)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multilabel-indicator targets")

    # Test that regressor raises error for illegal target types.
    reg = AutoSklearnRegressor(ensemble_size=0)
    # Illegal target types for regression: multilabel-indicator
    # multiclass-multioutput
    expected_msg = (r".*Regression with data of type"
                    r" multilabel-indicator is not supported.*")
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multilabel,
        )

    expected_msg = (r".*Regression with data of type"
                    r" multiclass-multioutput is not supported.*")
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multiclass_multioutput,
        )

    # Legal target types: continuous, multiclass,
    # continuous-multioutput,
    # binary
    try:
        reg.fit(X, y_continuous)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous targets")

    try:
        reg.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        reg.fit(X, y_continuous_multioutput)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous_multioutput targets")

    try:
        reg.fit(X, y_binary)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "binary targets")
Beispiel #30
0
    def test_type_of_target(self, mock_estimator):
        """Check which target types the classifier and regressor accept.

        ``fit`` has to raise a ``ValueError`` with a matching message for
        each illegal target type and must not raise ``ValueError`` for any
        legal one. ``mock_estimator`` is not used in the body.
        """
        X = np.array([
            [1, 2],
            [2, 3],
            [3, 4],
            [4, 5],
        ])
        # Possible target types
        y_binary = np.array([0, 0, 1, 1])
        y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
        y_multiclass = np.array([0, 1, 2, 0])
        y_multilabel = np.array([
            [0, 1],
            [1, 1],
            [1, 0],
            [0, 0],
        ])
        y_multiclass_multioutput = np.array([
            [0, 1],
            [1, 3],
            [2, 2],
            [5, 3],
        ])
        y_continuous_multioutput = np.array([
            [0.1, 1.5],
            [1.2, 3.5],
            [2.7, 2.7],
            [5.5, 3.9],
        ])

        # ---- classifier ----
        cls = AutoSklearnClassifier()

        # Illegal target types for classification: continuous,
        # multiclass-multioutput, continuous-multioutput.
        rejected_by_classifier = (
            ("classification with data of type"
             " multiclass-multioutput is not supported",
             y_multiclass_multioutput),
            ("classification with data of type"
             " continuous is not supported",
             y_continuous),
            ("classification with data of type"
             " continuous-multioutput is not supported",
             y_continuous_multioutput),
        )
        for pattern, target in rejected_by_classifier:
            with self.assertRaisesRegex(ValueError, pattern):
                cls.fit(X=X, y=target)

        # Legal target types for classification: binary, multiclass,
        # multilabel-indicator.
        accepted_by_classifier = (
            (y_binary,
             "cls.fit() raised ValueError while fitting "
             "binary targets"),
            (y_multiclass,
             "cls.fit() raised ValueError while fitting "
             "multiclass targets"),
            (y_multilabel,
             "cls.fit() raised ValueError while fitting "
             "multilabel-indicator targets"),
        )
        for target, failure_msg in accepted_by_classifier:
            try:
                cls.fit(X, target)
            except ValueError:
                self.fail(failure_msg)

        # ---- regressor ----
        reg = AutoSklearnRegressor()

        # Illegal target types for regression: multiclass-multioutput,
        # multilabel-indicator, continuous-multioutput.
        rejected_by_regressor = (
            ("regression with data of type"
             " multiclass-multioutput is not supported",
             y_multiclass_multioutput),
            ("regression with data of type"
             " multilabel-indicator is not supported",
             y_multilabel),
            ("regression with data of type"
             " continuous-multioutput is not supported",
             y_continuous_multioutput),
        )
        for pattern, target in rejected_by_regressor:
            with self.assertRaisesRegex(ValueError, pattern):
                reg.fit(X=X, y=target)

        # Legal target types: continuous, binary, multiclass
        accepted_by_regressor = (
            (y_continuous,
             "reg.fit() raised ValueError while fitting "
             "continuous targets"),
            (y_binary,
             "reg.fit() raised ValueError while fitting "
             "binary targets"),
            (y_multiclass,
             "reg.fit() raised ValueError while fitting "
             "multiclass targets"),
        )
        for target, failure_msg in accepted_by_regressor:
            try:
                reg.fit(X, target)
            except ValueError:
                self.fail(failure_msg)
Beispiel #31
0
# rgr_dataset[1] is called with return_X_y=True to obtain features and targets.
load_dataset = rgr_dataset[1]
X, y = load_dataset(return_X_y=True)
# feature_types = (['numerical'] * 3) + ['categorical'] + (['numerical'] * 9)

print('[INFO] Splitting.')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42)

print(f'[INFO] Train shape: {X_train.shape}')
print(f'[INFO] Test shape: {X_test.shape}')

print('[INFO] Finding best model...')
# For "vanilla" auto-sklearn, add:
#   ensemble_size=1, initial_configurations_via_metalearning=0
# ----- CLASSIFIER (disabled alternative) -----
# automl = AutoSklearnClassifier(per_run_time_limit=300, ml_memory_limit=1024 * 6,
#                                time_left_for_this_task=3600, resampling_strategy='cv',
#                                resampling_strategy_arguments={'folds': 5})
# ----- REGRESSION -----
automl = AutoSklearnRegressor(
    per_run_time_limit=300,
    ml_memory_limit=1024 * 6,
    time_left_for_this_task=3600,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
start = time.time()

# X_train = X_train.astype('float')
# Change dataset_name to match the data being fitted.
automl.fit(X_train, y_train, dataset_name='linnerud')
automl.refit(X_train.copy(), y_train.copy())
elapsed = time.time() - start
print(f'[INFO] Elapsed time finding best model: {elapsed} seconds.')

predictions = automl.predict(X_test)
# Classification only (not applicable to regression):
# print('--- CLASSIFICATION REPORT: ---')
# print(classification_report(y_test, predictions, digits=5))
print('\n\n--- MODELS: ---')
print(automl.show_models())
print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics())
Beispiel #32
0
def task_executor(task_info):
    """Train and score a model for one task, then store the outcome.

    The CSV at ``data_path`` is loaded, its last column is used as the target
    and all preceding columns as features. 30% of the rows are held out for
    scoring. On Linux an auto-sklearn estimator is used, elsewhere a
    500-tree random forest. The score (accuracy for "Classification", mean
    squared error otherwise), the final status and the end time are written
    to the ``Task`` row identified by ``task_id``.

    :param task_info: dict read via ``.get`` for the keys ``data_path``,
        ``time_max``, ``task_id`` and ``model_type``
    :raises ValueError: if ``model_type`` is neither "Classification" nor
        "Regression"
    """
    data_path = task_info.get("data_path")
    time_max = task_info.get("time_max")
    task_id = task_info.get("task_id")
    model_type = task_info.get("model_type")
    LOG.info("Load data, path=%s", data_path)
    status = "done"
    try:
        frame = pd.read_csv(data_path)
        # Features are every column but the last; the last one is the target.
        x_set = frame[frame.columns[:-1]]
        y_set = frame[frame.columns[-1]]
        x_train, x_test, y_train, y_test = train_test_split(
            x_set, y_set, test_size=0.3, random_state=0)
        LOG.info("start optimizer.")

        # auto-sklearn is imported lazily and only on Linux; every other
        # platform falls back to scikit-learn's random forests.
        use_automl = platform.system() == "Linux"
        if use_automl:
            from autosklearn.classification import AutoSklearnClassifier
            from autosklearn.regression import AutoSklearnRegressor
            candidates = (AutoSklearnClassifier, AutoSklearnRegressor)
        else:
            from sklearn.ensemble import RandomForestClassifier, \
                RandomForestRegressor
            candidates = (RandomForestClassifier, RandomForestRegressor)

        if model_type == "Classification":
            estimator_cls = candidates[0]
        elif model_type == "Regression":
            estimator_cls = candidates[1]
        else:
            # NOTE(review): this ValueError is raised inside the try block;
            # unless ServerException covers it, the Task row is never marked
            # "failed" for this case -- confirm this is intended.
            LOG.error("not support model type=%s", model_type)
            raise ValueError("not support model type")

        if use_automl:
            model = estimator_cls(
                time_left_for_this_task=time_max + 5,
                per_run_time_limit=int(time_max / 10),
                include_preprocessors=["no_preprocessing"],
            )
        else:
            model = estimator_cls(n_estimators=500)

        model.fit(x_train, y_train)
        prediction = model.predict(x_test)

        if model_type == "Classification":
            best_metrics = accuracy_score(y_test, prediction)
            LOG.info("The accuracy is %s", best_metrics)
        else:
            best_metrics = mean_squared_error(y_test, prediction)
            LOG.info("The mse is %s", best_metrics)
    except ServerException as server_error:
        LOG.error("Some thing wrong, reason=%s", server_error)
        best_metrics = 0
        status = "failed"

    # Persist the outcome on the task row.
    update = {
        "end_time": int(time.time()),
        "best_metrics": best_metrics,
        "status": status,
    }
    Task.objects.filter(task_id=task_id).update(**update)
Beispiel #33
0
        def get_object_cols():
            """Return the names of the object-dtype columns of ``df``.

            ``throw_num_unique_warning`` is called once for each such column,
            in column order, with the column name and the column itself.
            """
            fin = []
            # Series.items() instead of Series.iteritems(): the latter was
            # deprecated in pandas 1.5 and removed in pandas 2.0, while
            # items() yields the same (label, dtype) pairs on old and new
            # versions alike.
            for colname, type_col in df.dtypes.items():
                if type_col == "object":
                    fin.append(colname)
                    throw_num_unique_warning(colname, df[colname])
            return fin

        # Reject overlapping selections: a column may be listed in cols or in
        # save_cols, but not in both.
        if len(set(save_cols).intersection(set(cols))) > 0:
            raise Exception(
                "The arguments cols and save_cols should have no columns in common"
            )
        # Split the frame: saved_df holds the save_cols columns as they are,
        # while df is narrowed down to cols.
        saved_df = df[save_cols]
        df = df[cols]
        # With str_action == 'dummies', df is passed through pd.get_dummies
        # with drop_first=True; the names of its object-dtype columns are
        # used as prefixes. get_object_cols() runs after df was narrowed
        # above, so it only sees columns from cols.
        if str_action == 'dummies':
            df = pd.get_dummies(df, drop_first=True, prefix=get_object_cols())
        return df, saved_df


# Load the stats table and drop the rows whose Salary is "-"; presumably this
# is what allows the column to be cast to int64 afterwards.
res = pd.read_csv("nbastats2018-2019.csv")
res = res[res["Salary"] != "-"]
res["Salary"] = res["Salary"].astype('int64')

# Every column except "Name" and "Salary" is handed to Model as an input
# column; "Salary" is passed as the target with np.log as preprocess_y.
colnames = [column for column in res.columns
            if column not in ("Name", "Salary")]
model = Model(res, colnames, "Salary", preprocess_y=np.log)

regressor = AutoSklearnRegressor(time_left_for_this_task=420,
                                 per_run_time_limit=60)
regressor = regressor.fit(model.Xtrain,
                          model.Ytrain.flatten(),
                          metric=metrics.mean_squared_error)
print('finished')
# Drop into an interactive shell to inspect the results.
IPython.embed()