Ejemplo n.º 1
0
def ensemble_classifier_basic_test(model_name=None):
    """Train a two-model ensemble on the Titanic data and check its score.

    ``model_name`` is accepted for signature compatibility but is not read.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    column_descriptions = dict(
        survived='output',
        sex='categorical',
        embarked='categorical',
        pclass='categorical',
    )

    # One config entry per ensemble member.
    ensemble_config = [
        {'model_name': member}
        for member in ('LGBMClassifier', 'RandomForestClassifier')
    ]

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(train_df, ensemble_config=ensemble_config)

    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    lower_bound, upper_bound = -0.15, -0.131
    assert lower_bound < test_score < upper_bound
Ejemplo n.º 2
0
def test_calibrate_final_model_missing_X_test_y_test_classification():
    """Train with ``calibrate_final_model=True`` but no calibration data.

    ``train`` receives no X_test / y_test; the score on the remaining test
    rows must still fall inside the expected window.
    """
    np.random.seed(0)

    train_df, full_test_df = utils.get_titanic_binary_classification_dataset()

    # Take a third of the test data (a tenth of the overall data) for
    # calibration. The calibration part is split off but not used afterwards.
    test_df, _calibration_df = train_test_split(
        full_test_df,
        test_size=0.33,
        random_state=42,
    )

    column_descriptions = dict(
        survived='output',
        sex='categorical',
        embarked='categorical',
        pclass='categorical',
    )

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )

    # Expected to still work; per the original author only a warning is printed.
    ml_predictor.train(train_df, calibrate_final_model=True)

    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    lower_bound, upper_bound = -0.14, -0.12
    assert lower_bound < test_score < upper_bound
Ejemplo n.º 3
0
def test_verify_features_finds_no_missing_features_when_none_are_missing():
    """Check ``verify_features`` reports no mismatches for matching data.

    Trains with ``verify_features=True``, saves the pipeline to a randomly
    named file, reloads it with ``dill`` and asks the reloaded final model to
    verify the test data. Both mismatch lists must be empty.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))

    # Remove the saved pipeline even when loading it fails, so a broken run
    # does not leave stray files behind.
    try:
        with open(file_name, 'rb') as read_file:
            saved_ml_pipeline = dill.load(read_file)
    finally:
        os.remove(file_name)

    missing_features = saved_ml_pipeline.named_steps[
        'final_model'].verify_features(df_titanic_test)
    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))
    assert len(missing_features['prediction_not_training']) == 0
    assert len(missing_features['training_not_prediction']) == 0
Ejemplo n.º 4
0
def regression_test():
    """Train an LGBMRegressor on a 100x repeated Boston training set.

    Feature scaling is switched off, and the test score must fall between a
    model-specific lower bound and -2.7.
    """
    # Original author's note: a random seed of 42 has ExtraTreesRegressor
    # getting the best CV score, and that model doesn't generalize as well as
    # GradientBoostingRegressor.
    np.random.seed(0)
    model_name = 'LGBMRegressor'

    train_df, test_df = get_boston_regression_dataset()

    # Stack 100 copies of the training frame on top of each other.
    train_df = pd.concat([train_df] * 100)

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df,
        model_names=[model_name],
        perform_feature_scaling=False,
    )

    test_score = ml_predictor.score(test_df, test_df.MEDV)

    print('test_score')
    print(test_score)

    # Per-model lower bounds; any other model falls back to -3.2.
    lower_bound = {
        'DeepLearningRegressor': -7.8,
        'LGBMRegressor': -4.95,
        'XGBRegressor': -3.4,
    }.get(model_name, -3.2)

    assert lower_bound < test_score < -2.7
Ejemplo n.º 5
0
def test_unexpected_datetime_column_handled_without_errors():
    """Predict on a row carrying extra date/datetime fields unseen in training.

    The test passes as long as ``predict`` does not raise.
    """
    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    column_descriptions = dict(
        survived='output',
        sex='categorical',
        embarked='categorical',
        pclass='categorical',
    )

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(train_df)

    # Take the first record of a random 10% sample of the test rows.
    sampled_records = test_df.sample(frac=0.1).to_dict('records')
    test_dict = sampled_records[0]

    # Add two fields that were not present at training time.
    test_dict.update({
        'unexpected_column': datetime.date.today(),
        'anoter_unexpected_column': datetime.datetime.today(),
    })

    # Reaching the end of the function without an exception is the whole check.
    ml_predictor.predict(test_dict)

    assert True
Ejemplo n.º 6
0
def test_calibrate_uncertainty():
    """Train a calibrated uncertainty model and check the output columns.

    The training data is split in half, and the second half is split again
    into uncertainty-training and uncertainty-calibration data.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    # Split order matters: both calls use the seeded global RNG.
    train_df, uncertainty_df = train_test_split(train_df, test_size=0.5)
    uncertainty_df, calibration_df = train_test_split(
        uncertainty_df, test_size=0.5)

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )

    calibration_settings = {'num_buckets': 3, 'percentiles': [25, 50, 75]}
    ml_predictor.train(
        train_df,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_df,
        calibrate_uncertainty=True,
        uncertainty_calibration_settings=calibration_settings,
        uncertainty_calibration_data=calibration_df,
    )

    uncertainty_score = ml_predictor.predict_uncertainty(test_df)

    # One delta column per requested percentile, plus the bucket number.
    expected_columns = (
        'percentile_25_delta',
        'percentile_50_delta',
        'percentile_75_delta',
        'bucket_num',
    )
    for column in expected_columns:
        assert column in list(uncertainty_score.columns)
Ejemplo n.º 7
0
def test_pass_in_list_of_dictionaries_predict_classification():
    """Score a classifier when the test rows arrive as a list of dicts.

    Training uses the DataFrame; only the data handed to ``score`` is
    converted to a list of record dictionaries.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # The training frame is passed as-is; the list-of-dicts conversion of the
    # training data that used to be built here was never used.
    ml_predictor.train(df_titanic_train)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'),
                                    df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.16 < test_score < -0.135
Ejemplo n.º 8
0
def test_unmarked_categorical_column_throws_warning():
    """Train with an unlabelled categorical column and expect one warning.

    ``sex`` is deliberately left out of the column descriptions. Exactly one
    warning must be recorded during training, and predicting afterwards must
    not raise.
    """
    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    # 'sex' is the column we are "forgetting" to mark as categorical.
    column_descriptions = dict(
        survived='output',
        embarked='categorical',
        pclass='categorical',
    )

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )

    # NOTE(review): no warnings filter is configured inside this context, so
    # what gets recorded depends on the filters already active - confirm.
    with warnings.catch_warnings(record=True) as recorded_warnings:
        ml_predictor.train(train_df)
        print(
            'we should be throwing a warning for the user to give them useful feedback on the unlabeled categorical column'
        )
        assert len(recorded_warnings) == 1

    # Reaching the end of the function without an exception is the check here.
    ml_predictor.predict(test_df)

    assert True
Ejemplo n.º 9
0
def test_include_bad_y_vals_predict_classification():
    """Score a classifier when the y values contain nan, inf and None.

    Three rows of the test frame get an invalid ``survived`` value before
    scoring; the score must still fall inside the expected window.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # Chained indexing (``df.iloc[i]['survived'] = value``) assigns into a
    # temporary row object and leaves the frame unchanged, so the bad values
    # were never injected. Use one positional ``iloc[row, col]`` write instead.
    # The column is cast to float first so nan/inf fit without an implicit
    # upcast (assumes 'survived' is numeric - TODO confirm). None is stored
    # as NaN in a float column.
    df_titanic_test['survived'] = df_titanic_test['survived'].astype(float)
    survived_col = df_titanic_test.columns.get_loc('survived')
    bad_values = ((1, float('nan')), (8, float('inf')), (26, None))
    for row_position, bad_value in bad_values:
        df_titanic_test.iloc[row_position, survived_col] = bad_value

    ml_predictor.train(df_titanic_train)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'),
                                    df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.16 < test_score < -0.135
def test_perform_feature_selection_true_regression(model_name=None):
    """Train a regressor with feature selection on and check its score.

    ``model_name`` is forwarded to ``train`` as ``model_names`` and also
    selects the lower bound used in the final assertion.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df,
        perform_feature_selection=True,
        model_names=model_name,
    )

    test_score = ml_predictor.score(test_df, test_df.MEDV)

    print('test_score')
    print(test_score)

    # The default bound is loosened because the score drops without the
    # removed features; some models get a bound of their own.
    lower_bound = -4.0
    per_model_bounds = (
        ('DeepLearningRegressor', -14.5),
        ('LGBMRegressor', -4.95),
    )
    for name, bound in per_model_bounds:
        if model_name == name:
            lower_bound = bound

    assert lower_bound < test_score < -2.8
Ejemplo n.º 11
0
def test_list_of_single_model_name_classification():
    """Train with ``model_names`` given as a one-element list and score it."""
    np.random.seed(0)
    model_name = 'GradientBoostingClassifier'

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    column_descriptions = dict(
        survived='output',
        sex='categorical',
        embarked='categorical',
        pclass='categorical',
    )

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )
    # The model name is wrapped in a list on purpose - that is what is tested.
    ml_predictor.train(train_df, model_names=[model_name])

    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    lower_bound, upper_bound = -0.16, -0.135
    assert lower_bound < test_score < upper_bound
def test_prediction_intervals_actually_work():
    """Check how often actual values fall outside the predicted intervals.

    Trains with ``predict_intervals=[0.05, 0.95]`` and counts the test rows
    whose actual MEDV lies below the 0.05 bound or above the 0.95 bound.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(train_df, predict_intervals=[0.05, 0.95])

    # Reset the index before predicting and reading the actual values.
    test_df = test_df.reset_index(drop=True)
    intervals = ml_predictor.predict_intervals(test_df)
    actuals = test_df.MEDV

    below_lower = 0
    above_upper = 0
    for position, interval_row in intervals.iterrows():
        observed = actuals.iloc[position]
        below_lower += int(observed < interval_row['interval_0.05'])
        above_upper += int(observed > interval_row['interval_0.95'])

    total_rows = len(intervals)
    pct_under = float(below_lower) / total_rows
    pct_over = float(above_upper) / total_rows

    # There's a decent bit of noise since this is such a small dataset
    assert pct_under < 0.2
    assert pct_over < 0.1
Ejemplo n.º 13
0
    def test_compare_all_models_classification():
        """Train a classifier with ``compare_all_models=True`` and score it."""
        np.random.seed(0)

        train_df, test_df = utils.get_titanic_binary_classification_dataset()

        column_descriptions = dict(
            survived='output',
            sex='categorical',
            embarked='categorical',
            pclass='categorical',
        )

        ml_predictor = Predictor(
            type_of_estimator='classifier',
            column_descriptions=column_descriptions,
        )
        ml_predictor.train(train_df, compare_all_models=True)

        test_score = ml_predictor.score(test_df, test_df.survived)

        print('test_score')
        print(test_score)

        lower_bound, upper_bound = -0.16, -0.135
        assert lower_bound < test_score < upper_bound
Ejemplo n.º 14
0
def classification_test():
    """Train an LGBMClassifier on the Titanic data with ignored columns.

    The column descriptions mark one added constant column and one column
    that does not exist in the data as 'ignore'.
    """
    np.random.seed(0)
    model_name = 'LGBMClassifier'

    train_df, test_df = get_titanic_binary_classification_dataset()
    # Constant column that the column descriptions below mark as ignored.
    train_df['DELETE_THIS_FIELD'] = 1

    column_descriptions = dict(
        survived='output',
        embarked='categorical',
        pclass='categorical',
        sex='categorical',
        this_does_not_exist='ignore',
        DELETE_THIS_FIELD='ignore',
    )

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(train_df, model_names=model_name)

    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    # Per-model lower bounds; any other model falls back to -0.16.
    lower_bound = {
        'DeepLearningClassifier': -0.245,
        'LGBMClassifier': -0.225,
    }.get(model_name, -0.16)

    assert lower_bound < test_score < -0.135
Ejemplo n.º 15
0
def test_all_algos_regression():
    """Train a regressor over the full list of model names and score it.

    The score on the Boston test data must fall between -3.4 and -2.8.
    """
    # Original author's note: a random seed of 42 has ExtraTreesRegressor
    # getting the best CV score, and that model doesn't generalize as well as
    # GradientBoostingRegressor.
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    # 'XGBRegressor' replaces the 'XGBClassifier' entry that was listed here:
    # a classifier name does not belong in a regressor's model list.
    ml_predictor.train(
        df_boston_train,
        model_names=[
            'LinearRegression', 'RandomForestRegressor', 'Ridge', 'GradientBoostingRegressor',
            'AdaBoostRegressor', 'SGDRegressor', 'PassiveAggressiveRegressor', 'Lasso', 'LassoLars',
            'ElasticNet', 'OrthogonalMatchingPursuit', 'BayesianRidge', 'ARDRegression',
            'MiniBatchKMeans', 'DeepLearningRegressor', 'LGBMRegressor', 'XGBRegressor',
            'LinearSVR', 'CatBoostRegressor'
        ])

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.4 < test_score < -2.8
Ejemplo n.º 16
0
def test_all_algos_classification(model_name=None):
    """Train a classifier over the full list of model names and score it.

    ``model_name`` is accepted for signature compatibility but is not read.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    column_descriptions = dict(
        survived='output',
        sex='categorical',
        embarked='categorical',
        pclass='categorical',
    )

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )

    candidate_models = [
        'LogisticRegression',
        'RandomForestClassifier',
        'RidgeClassifier',
        'GradientBoostingClassifier',
        'ExtraTreesClassifier',
        'AdaBoostClassifier',
        'SGDClassifier',
        'Perceptron',
        'PassiveAggressiveClassifier',
        'DeepLearningClassifier',
        'XGBClassifier',
        'LGBMClassifier',
        'LinearSVC',
    ]
    ml_predictor.train(train_df, model_names=candidate_models)

    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    # Linear models aren't super great on this dataset, hence the wide window.
    lower_bound, upper_bound = -0.215, -0.131
    assert lower_bound < test_score < upper_bound
Ejemplo n.º 17
0
def train_basic_regressor(df_boston_train):
    """Return a regressor ``Predictor`` trained on ``df_boston_train``.

    MEDV is the output column and CHAS is categorical; training runs with
    ``verbose=False``.
    """
    regressor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=dict(MEDV='output', CHAS='categorical'),
    )
    regressor.train(df_boston_train, verbose=False)
    return regressor
def test_already_transformed_X():
    """Reuse a trained transformation pipeline, with and without transformed X.

    A first predictor returns its transformation pipeline and a transformed
    training set. Two fresh predictors are then trained: one from the raw
    frame plus the pipeline, one from the transformed X/y plus the pipeline.
    Both must score inside the same window.
    """
    np.random.seed(0)

    train_df, full_test_df = utils.get_titanic_binary_classification_dataset()

    # Take a third of the test data (a tenth of the overall data) for
    # calibration. The calibration part is split off but not used afterwards.
    test_df, _calibration_df = train_test_split(
        full_test_df,
        test_size=0.33,
        random_state=42,
    )

    column_descriptions = dict(
        survived='output',
        sex='categorical',
        embarked='categorical',
        pclass='categorical',
    )
    lower_bound, upper_bound = -0.14, -0.12

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )

    # Ask train() to hand back the transformation pipeline.
    trans_pipeline = ml_predictor.train(
        train_df, return_transformation_pipeline=True)

    # Get the transformed X through transform_only.
    transformed_train_X = ml_predictor.transform_only(train_df)

    # Fresh predictor: pass in the trained pipeline and make sure it works.
    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df, trained_transformation_pipeline=trans_pipeline)
    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    assert lower_bound < test_score < upper_bound

    # Fresh predictor again: pass in both the pipeline and the previously
    # transformed X (no raw frame) and make sure that works too.
    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        None,
        trained_transformation_pipeline=trans_pipeline,
        transformed_X=transformed_train_X,
        transformed_y=train_df.survived,
    )
    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    assert lower_bound < test_score < upper_bound
Ejemplo n.º 19
0
def train_basic_binary_classifier(df_titanic_train):
    """Return a classifier ``Predictor`` trained on ``df_titanic_train``.

    ``survived`` is the output column; ``sex``, ``embarked`` and ``pclass``
    are marked categorical.
    """
    classifier = Predictor(
        type_of_estimator='classifier',
        column_descriptions=dict(
            survived='output',
            sex='categorical',
            embarked='categorical',
            pclass='categorical',
        ),
    )
    classifier.train(df_titanic_train)
    return classifier
def test_predict_uncertainty_true():
    """Exercise ``predict_intervals`` return types after interval training.

    Covers the default DataFrame return, ``return_type='list'`` and
    ``return_type='df'`` for a frame, dict and list returns for single rows,
    and a ``ValueError`` for an unknown ``return_type``.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(train_df, predict_intervals=True)

    row_count = test_df.shape[0]

    # Default return for a DataFrame input: one row per input row.
    intervals = ml_predictor.predict_intervals(test_df)
    assert isinstance(intervals, pd.DataFrame)
    assert intervals.shape[0] == row_count

    # List return: one three-element list per input row.
    result_list = ml_predictor.predict_intervals(test_df, return_type='list')
    assert isinstance(result_list, list)
    assert len(result_list) == row_count
    for entry in result_list:
        assert isinstance(entry, list)
        assert len(entry) == 3

    single_rows = test_df.head().to_dict('records')

    # A single dict input gives back a dict with the prediction and bounds.
    for single in single_rows:
        result = ml_predictor.predict_intervals(single)
        assert isinstance(result, dict)
        for key in ('prediction', 'interval_0.05', 'interval_0.95'):
            assert key in result

    # A single dict input with return_type='list' gives a three-element list.
    for single in single_rows:
        result = ml_predictor.predict_intervals(single, return_type='list')
        assert isinstance(result, list)
        assert len(result) == 3

    df_intervals = ml_predictor.predict_intervals(test_df, return_type='df')
    assert isinstance(df_intervals, pd.DataFrame)

    # An unknown return_type has to raise ValueError.
    try:
        ml_predictor.predict_intervals(
            test_df, return_type='this will not work')
    except ValueError:
        pass
    else:
        assert False
Ejemplo n.º 21
0
def train_basic_multilabel_classifier(df_twitter_train):
    """Return a classifier ``Predictor`` trained on ``df_twitter_train``.

    ``airline_sentiment`` is the output column, ``text`` is ignored and
    ``tweet_created`` is treated as a date.
    """
    classifier = Predictor(
        type_of_estimator='classifier',
        column_descriptions=dict(
            airline_sentiment='output',
            airline='categorical',
            text='ignore',
            tweet_location='categorical',
            user_timezone='categorical',
            tweet_created='date',
        ),
    )
    classifier.train(df_twitter_train)
    return classifier
def test_predict_intervals_should_fail_if_not_trained():
    """Expect ``ValueError`` from ``predict_intervals`` without interval training."""
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    # Plain training: no predict_intervals argument is given.
    ml_predictor.train(train_df)

    try:
        ml_predictor.predict_intervals(test_df)
    except ValueError:
        pass
    else:
        assert False
def test_prediction_intervals_lets_the_user_specify_number_of_intervals():
    """Train with one requested interval and check the row length.

    With ``prediction_intervals=[.2]`` each list row returned by
    ``predict_intervals`` must hold exactly two entries.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df,
        predict_intervals=True,
        prediction_intervals=[.2],
    )

    intervals = ml_predictor.predict_intervals(test_df, return_type='list')

    first_row = intervals[0]
    assert len(first_row) == 2
def test_perform_feature_scaling_true_regression(model_name=None):
    """Train a regressor with feature scaling on and check its score.

    ``model_name`` is accepted for signature compatibility but is not read.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(train_df, perform_feature_scaling=True)

    test_score = ml_predictor.score(test_df, test_df.MEDV)

    print('test_score')
    print(test_score)

    lower_bound, upper_bound = -3.0, -2.7
    assert lower_bound < test_score < upper_bound
def test_compare_all_models_regression():
    """Train a regressor with ``compare_all_models=True`` and score it."""
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(train_df, compare_all_models=True)

    test_score = ml_predictor.score(test_df, test_df.MEDV)

    print('test_score')
    print(test_score)

    # Original author's note: ExtraTrees again throws this off, so the lower
    # bound is looser here.
    lower_bound, upper_bound = -3.6, -2.8
    assert lower_bound < test_score < upper_bound
Ejemplo n.º 26
0
def test_verify_features_finds_missing_training_features():
    """Check ``verify_features`` spots a column missing from training data.

    ``sibsp`` is dropped before training. After a save/reload round trip,
    the final model must report exactly one feature that is present at
    prediction time but was absent in training, and none the other way.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    # Remove the "sibsp" column from our training data
    df_titanic_train = df_titanic_train.drop('sibsp', axis=1)

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))

    # Clean up the saved files even when loading fails, so a broken run does
    # not leave stray files behind.
    try:
        with open(file_name, 'rb') as read_file:
            saved_ml_pipeline = dill.load(read_file)
    finally:
        # Best-effort removal of the companion Keras file, which may not
        # exist. Only OS-level errors are ignored; a bare ``except`` would
        # also hide unrelated failures such as KeyboardInterrupt.
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        try:
            os.remove(keras_file_name)
        except OSError:
            pass
        os.remove(file_name)

    missing_features = saved_ml_pipeline.named_steps[
        'final_model'].verify_features(df_titanic_test)
    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))
    assert len(missing_features['prediction_not_training']) == 1
    assert len(missing_features['training_not_prediction']) == 0
Ejemplo n.º 27
0
def test_predict_uncertainty_returns_pandas_DataFrame_for_more_than_one_value():
    """Expect a DataFrame from ``predict_uncertainty`` for a DataFrame input."""
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    # Half of the training rows go to the uncertainty model.
    train_df, uncertainty_df = train_test_split(train_df, test_size=0.5)

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_df,
    )

    uncertainties = ml_predictor.predict_uncertainty(test_df)

    assert isinstance(uncertainties, pd.DataFrame)
Ejemplo n.º 28
0
def optimize_final_model_classification(model_name=None):
    """Train a classifier with ``optimize_final_model=True`` on half the data.

    ``model_name`` is forwarded to ``train`` as ``model_names`` and also
    selects the lower bound used in the final assertion.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    # We just want to make sure these run, not necessarily that they're super
    # accurate (which takes more time, and is dataset dependent).
    train_df = train_df.sample(frac=0.5)

    column_descriptions = dict(
        survived='output',
        sex='categorical',
        embarked='categorical',
        pclass='categorical',
    )

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df,
        optimize_final_model=True,
        model_names=model_name,
    )

    test_score = ml_predictor.score(test_df, test_df.survived)

    print('test_score')
    print(test_score)

    # Small sample sizes mean there's a fair bit of noise here, so some
    # models get a looser lower bound than the default.
    lower_bound = -0.18
    per_model_bounds = (
        ('DeepLearningClassifier', -0.255),
        ('LGBMClassifier', -0.221),
        ('GradientBoostingClassifier', -0.225),
        ('CatBoostClassifier', -0.221),
    )
    for name, bound in per_model_bounds:
        if model_name == name:
            lower_bound = bound

    assert lower_bound < test_score < -0.135
Ejemplo n.º 29
0
def test_predict_uncertainty_returns_dict_for_one_value():
    """Expect a dict from ``predict_uncertainty`` for each single-row input."""
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    # Half of the training rows go to the uncertainty model.
    train_df, uncertainty_df = train_test_split(train_df, test_size=0.5)

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_df,
    )

    # Feed the test rows one dictionary at a time.
    for record in test_df.to_dict('records'):
        prediction = ml_predictor.predict_uncertainty(record)
        assert isinstance(prediction, dict)
Ejemplo n.º 30
0
def optimize_final_model_regression(model_name=None):
    """Train a regressor with ``optimize_final_model=True`` on half the data.

    ``model_name`` is forwarded to ``train`` as ``model_names`` and also
    selects the lower bound used in the final assertion.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    # We just want to make sure these run, not necessarily that they're super
    # accurate (which takes more time, and is dataset dependent).
    train_df = train_df.sample(frac=0.5)

    column_descriptions = dict(MEDV='output', CHAS='categorical')

    ml_predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions=column_descriptions,
    )
    ml_predictor.train(
        train_df,
        optimize_final_model=True,
        model_names=model_name,
    )

    test_score = ml_predictor.score(test_df, test_df.MEDV)

    print('test_score')
    print(test_score)

    # Original author's note: the random seed gets a score of -3.21 on
    # python 3.5. There's a ton of noise here, due to small sample sizes, so
    # some models get a looser lower bound than the default.
    lower_bound = -3.4
    per_model_bounds = (
        ('DeepLearningRegressor', -24),
        ('LGBMRegressor', -16),
        ('GradientBoostingRegressor', -5.1),
        ('CatBoostRegressor', -4.5),
        ('XGBRegressor', -4.8),
    )
    for name, bound in per_model_bounds:
        if model_name == name:
            lower_bound = bound

    assert lower_bound < test_score < -2.75