Esempio n. 1
0
def test_input_df_unmodified():
    np.random.seed(42)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    df_shape = df_boston_train.shape
    ml_predictor.train(df_boston_train)

    training_shape = df_boston_train.shape
    assert training_shape[0] == df_shape[0]
    assert training_shape[1] == df_shape[1]


    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.35 < test_score < -2.8
def test_calibrate_final_model_missing_X_test_y_test_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    # Take a third of our test data (a tenth of our overall data) for calibration
    df_titanic_test, df_titanic_calibration = train_test_split(df_titanic_test,
                                                               test_size=0.33,
                                                               random_state=42)

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # This should still work, just with warning printed
    ml_predictor.train(df_titanic_train, calibrate_final_model=True)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Esempio n. 3
0
def optimize_final_model_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       optimize_final_model=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Small sample sizes mean there's a fair bit of noise here
    lower_bound = -0.215

    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.25

    assert lower_bound < test_score < -0.17
Esempio n. 4
0
def categorical_ensembling_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train_categorical_ensemble(df_boston_train,
                                            perform_feature_selection=True,
                                            model_names=model_name,
                                            categorical_column='CHAS')

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    # Bumping this up since without these features our score drops
    lower_bound = -4.0
    if model_name == 'DeepLearningRegressor':
        lower_bound = -19
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95

    assert lower_bound < test_score < -2.8
def test_perform_feature_scaling_false_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, perform_feature_scaling=False, model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    lower_bound = -0.215
    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.235

    assert lower_bound < test_score < -0.17
def test_include_bad_y_vals_predict_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    df_titanic_test.ix[1, 'survived'] = float('nan')
    df_titanic_test.ix[8, 'survived'] = float('inf')
    df_titanic_test.ix[26, 'survived'] = None

    ml_predictor.train(df_titanic_train, model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'), df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Esempio n. 7
0
def optimize_final_model_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train,
                       optimize_final_model=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    # the random seed gets a score of -3.21 on python 3.5
    # There's a ton of noise here, due to small sample sizes
    lower_bound = -3.4
    if model_name == 'DeepLearningRegressor':
        lower_bound = -20
    if model_name == 'LGBMRegressor':
        lower_bound = -5.5
    if model_name == 'GradientBoostingRegressor':
        lower_bound = -3.5

    assert lower_bound < test_score < -2.8
def test_categorical_ensemble_basic_classifier():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'pclass': 'categorical',
        'embarked': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train_categorical_ensemble(df_titanic_train,
                                            categorical_column='pclass',
                                            optimize_final_model=False)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Small sample sizes mean there's a fair bit of noise here
    assert -0.226 < test_score < -0.17
Esempio n. 9
0
def test_perform_feature_scaling_false_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_scaling=False, model_names=model_name)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    lower_bound = -3.2
    if model_name == 'DeepLearningRegressor':
        lower_bound = -8.8
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95

    assert lower_bound < test_score < -2.8
Esempio n. 10
0
def test_perform_feature_scaling_true_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_scaling=True)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.2 < test_score < -2.8
Esempio n. 11
0
def test_compare_all_models_regression():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, compare_all_models=True)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.35 < test_score < -2.8
Esempio n. 12
0
def test_all_algos_regression():
    # a random seed of 42 has ExtraTreesRegressor getting the best CV score, and that model doesn't generalize as well as GradientBoostingRegressor.
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, model_names=['LinearRegression', 'RandomForestRegressor', 'Ridge', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'AdaBoostRegressor', 'SGDRegressor', 'PassiveAggressiveRegressor', 'Lasso', 'LassoLars', 'ElasticNet', 'OrthogonalMatchingPursuit', 'BayesianRidge', 'ARDRegression', 'MiniBatchKMeans', 'DeepLearningRegressor'])

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.35 < test_score < -2.8
Esempio n. 13
0
def test_all_algos_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names=['LogisticRegression', 'RandomForestClassifier', 'RidgeClassifier', 'GradientBoostingClassifier', 'ExtraTreesClassifier', 'AdaBoostClassifier', 'SGDClassifier', 'Perceptron', 'PassiveAggressiveClassifier', 'DeepLearningClassifier', 'XGBClassifier', 'LGBMClassifier'])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
def test_list_of_single_model_name_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names=[model_name])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Esempio n. 15
0
def test_linear_model_analytics_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names='RidgeClassifier')

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Linear models aren't super great on this dataset...
    assert -0.37 < test_score < -0.17
Esempio n. 16
0
def categorical_ensembling_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train_categorical_ensemble(df_titanic_train,
                                            model_names=model_name,
                                            categorical_column='embarked')

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    lower_bound = -0.215

    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.24
    if model_name == 'XGBClassifier':
        lower_bound = -0.235
    if model_name == 'LGBMClassifier':
        lower_bound = -0.22
    if model_name == 'GradientBoostingClassifier':
        lower_bound = -0.23

    assert lower_bound < test_score < -0.17