def test_perform_feature_scaling_false_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, perform_feature_scaling=False, model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    lower_bound = -0.215
    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.235

    assert lower_bound < test_score < -0.17
Beispiel #2
0
def test_input_df_unmodified():
    np.random.seed(42)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    df_shape = df_boston_train.shape
    ml_predictor.train(df_boston_train)

    training_shape = df_boston_train.shape
    assert training_shape[0] == df_shape[0]
    assert training_shape[1] == df_shape[1]


    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.35 < test_score < -2.8
Beispiel #3
0
def test_perform_feature_scaling_false_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_scaling=False, model_names=model_name)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    lower_bound = -3.2
    if model_name == 'DeepLearningRegressor':
        lower_bound = -8.8
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95

    assert lower_bound < test_score < -2.8
def test_include_bad_y_vals_predict_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    df_titanic_test.ix[1, 'survived'] = float('nan')
    df_titanic_test.ix[8, 'survived'] = float('inf')
    df_titanic_test.ix[26, 'survived'] = None

    ml_predictor.train(df_titanic_train, model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'), df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Beispiel #5
0
def test_perform_feature_selection_true_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_selection=True, model_names=model_name)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    # Bumping this up since without these features our score drops
    lower_bound = -4.0
    if model_name == 'DeepLearningRegressor':
        lower_bound = -14.5
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95


    assert lower_bound < test_score < -2.8
Beispiel #6
0
def test_verify_features_finds_no_missing_features_when_none_are_missing():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
        'sex': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)

    missing_features = saved_ml_pipeline.named_steps[
        'final_model'].verify_features(df_titanic_test)
    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))
    assert len(missing_features['prediction_not_training']) == 0
    assert len(missing_features['training_not_prediction']) == 0
Beispiel #7
0
def optimize_final_model_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train,
                       optimize_final_model=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    # the random seed gets a score of -3.21 on python 3.5
    # There's a ton of noise here, due to small sample sizes
    lower_bound = -3.4
    if model_name == 'DeepLearningRegressor':
        lower_bound = -20
    if model_name == 'LGBMRegressor':
        lower_bound = -5.5
    if model_name == 'GradientBoostingRegressor':
        lower_bound = -3.5

    assert lower_bound < test_score < -2.8
Beispiel #8
0
def test_throws_warning_when_fl_data_equals_df_train():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    with warnings.catch_warnings(record=True) as w:

        try:
            ml_predictor.train(df_titanic_train,
                               feature_learning=True,
                               fl_data=df_titanic_train)
        except KeyError as e:
            pass
    # We should not be getting to this line- we should be throwing an error above
        for thing in w:
            print(thing)
        assert len(w) == 1
Beispiel #9
0
def optimize_final_model_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       optimize_final_model=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Small sample sizes mean there's a fair bit of noise here
    lower_bound = -0.215

    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.25

    assert lower_bound < test_score < -0.17
def test_calibrate_final_model_missing_X_test_y_test_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    # Take a third of our test data (a tenth of our overall data) for calibration
    df_titanic_test, df_titanic_calibration = train_test_split(df_titanic_test,
                                                               test_size=0.33,
                                                               random_state=42)

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # This should still work, just with warning printed
    ml_predictor.train(df_titanic_train, calibrate_final_model=True)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Beispiel #11
0
def train_basic_regressor(df_boston_train):
    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, verbose=False)
    return ml_predictor
Beispiel #12
0
def train_basic_binary_classifier(df_titanic_train):
    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train)

    return ml_predictor
Beispiel #13
0
def train_basic_multilabel_classifier(df_twitter_train):
    column_descriptions = {
        'airline_sentiment': 'output',
        'airline': 'categorical',
        'text': 'ignore',
        'tweet_location': 'categorical',
        'user_timezone': 'categorical',
        'tweet_created': 'date'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_twitter_train)

    return ml_predictor
Beispiel #14
0
def test_verify_features_finds_missing_training_features():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
        'sex': 'categorical'
    }

    # Remove the "sibsp" column from our training data
    df_titanic_train = df_titanic_train.drop('sibsp', axis=1)

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except:
        pass

    missing_features = saved_ml_pipeline.named_steps[
        'final_model'].verify_features(df_titanic_test)
    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))
    assert len(missing_features['prediction_not_training']) == 1
    assert len(missing_features['training_not_prediction']) == 0
Beispiel #15
0
def test_compare_all_models_regression():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, compare_all_models=True)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.35 < test_score < -2.8
Beispiel #16
0
def test_perform_feature_scaling_true_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_scaling=True)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.2 < test_score < -2.8
Beispiel #17
0
def test_all_algos_regression():
    # a random seed of 42 has ExtraTreesRegressor getting the best CV score, and that model doesn't generalize as well as GradientBoostingRegressor.
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, model_names=['LinearRegression', 'RandomForestRegressor', 'Ridge', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'AdaBoostRegressor', 'SGDRegressor', 'PassiveAggressiveRegressor', 'Lasso', 'LassoLars', 'ElasticNet', 'OrthogonalMatchingPursuit', 'BayesianRidge', 'ARDRegression', 'MiniBatchKMeans', 'DeepLearningRegressor'])

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.35 < test_score < -2.8
def test_list_of_single_model_name_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names=[model_name])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Beispiel #19
0
def test_all_algos_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names=['LogisticRegression', 'RandomForestClassifier', 'RidgeClassifier', 'GradientBoostingClassifier', 'ExtraTreesClassifier', 'AdaBoostClassifier', 'SGDClassifier', 'Perceptron', 'PassiveAggressiveClassifier', 'DeepLearningClassifier', 'XGBClassifier', 'LGBMClassifier'])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Beispiel #20
0
def test_linear_model_analytics_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names='RidgeClassifier')

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Linear models aren't super great on this dataset...
    assert -0.37 < test_score < -0.17
Beispiel #21
0
def test_unexpected_datetime_column_handled_without_errors():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train)

    test_dict = df_titanic_test.sample(frac=0.1).to_dict('records')[0]

    test_dict['unexpected_column'] = datetime.date.today()
    test_dict['anoter_unexpected_column'] = datetime.datetime.today()

    ml_predictor.predict(test_dict)

    # We want to make sure the above does not throw an error
    assert True
Beispiel #22
0
def feature_learning_getting_single_predictions_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    # NOTE: this is bad practice to pass in our same training set as our fl_data set, but we don't have enough data to do it any other way
    df_boston_train, fl_data = train_test_split(df_boston_train, test_size=0.2)
    ml_predictor.train(df_boston_train,
                       model_names=model_name,
                       feature_learning=True,
                       fl_data=fl_data)

    file_name = ml_predictor.save(str(random.random()))

    # from quantile_ml.utils_models import load_keras_model

    # saved_ml_pipeline = load_keras_model(file_name)

    saved_ml_pipeline = load_ml_model(file_name)

    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same

    predictions = []
    for row in df_boston_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)
    # Make sure our score is good, but not unreasonably good

    lower_bound = -3.2
    if model_name == 'DeepLearningRegressor':
        lower_bound = -23
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95
    if model_name == 'XGBRegressor':
        lower_bound = -3.3

    assert lower_bound < first_score < -2.8

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_boston_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions
    # That's about 1 millisecond per prediction
    # Assuming we might be running on a test box that's pretty weak, multiply by 3
    # Also make sure we're not running unreasonably quickly
    assert 0.2 < duration.total_seconds() / 1.0 < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time)

    predictions = []
    for row in df_boston_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)
    # Make sure our score is good, but not unreasonably good

    assert lower_bound < second_score < -2.8
    def test_getting_single_predictions_nlp_date_multilabel_classification(model_name=None):
        # quantile_ml does not support multilabel classification for deep learning at the moment
        if model_name == 'DeepLearningClassifier':
            return

        np.random.seed(0)

        df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset()

        column_descriptions = {
            'airline_sentiment': 'output'
            , 'airline': 'categorical'
            , 'text': 'nlp'
            , 'tweet_location': 'categorical'
            , 'user_timezone': 'categorical'
            , 'tweet_created': 'date'
        }

        ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)
        ml_predictor.train(df_twitter_train, model_names=model_name)

        file_name = ml_predictor.save(str(random.random()))

        # if model_name == 'DeepLearningClassifier':
        #     from quantile_ml.utils_models import load_keras_model

        #     saved_ml_pipeline = load_keras_model(file_name)
        # else:
        #     with open(file_name, 'rb') as read_file:
        #         saved_ml_pipeline = dill.load(read_file)
        saved_ml_pipeline = load_ml_model(file_name)

        os.remove(file_name)
        try:
            keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
            os.remove(keras_file_name)
        except:
            pass

        df_twitter_test_dictionaries = df_twitter_test.to_dict('records')

        # 1. make sure the accuracy is the same

        predictions = []
        for row in df_twitter_test_dictionaries:
            predictions.append(saved_ml_pipeline.predict(row))

        print('predictions')
        print(predictions)

        first_score = accuracy_score(df_twitter_test.airline_sentiment, predictions)
        print('first_score')
        print(first_score)
        # Make sure our score is good, but not unreasonably good
        lower_bound = 0.73
        # if model_name == 'LGBMClassifier':
        #     lower_bound = 0.655
        assert lower_bound < first_score < 0.79

        # 2. make sure the speed is reasonable (do it a few extra times)
        data_length = len(df_twitter_test_dictionaries)
        start_time = datetime.datetime.now()
        for idx in range(1000):
            row_num = idx % data_length
            saved_ml_pipeline.predict(df_twitter_test_dictionaries[row_num])
        end_time = datetime.datetime.now()
        duration = end_time - start_time

        print('duration.total_seconds()')
        print(duration.total_seconds())

        # It's very difficult to set a benchmark for speed that will work across all machines.
        # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions
        # That's about 1 millisecond per prediction
        # Assuming we might be running on a test box that's pretty weak, multiply by 3
        # Also make sure we're not running unreasonably quickly
        # time_upper_bound = 3
        # if model_name == 'XGBClassifier':
        #     time_upper_bound = 4
        assert 0.2 < duration.total_seconds() < 15


        # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time)

        predictions = []
        for row in df_twitter_test_dictionaries:
            predictions.append(saved_ml_pipeline.predict(row))

        print('predictions')
        print(predictions)
        print('df_twitter_test_dictionaries')
        print(df_twitter_test_dictionaries)
        second_score = accuracy_score(df_twitter_test.airline_sentiment, predictions)
        print('second_score')
        print(second_score)
        # Make sure our score is good, but not unreasonably good
        assert lower_bound < second_score < 0.79
Beispiel #24
0
def getting_single_predictions_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names=model_name)

    file_name = ml_predictor.save(str(random.random()))

    saved_ml_pipeline = load_ml_model(file_name)
    # if model_name == 'DeepLearningClassifier':
    #     from quantile_ml.utils_models import load_keras_model

    #     saved_ml_pipeline = load_keras_model(file_name)
    # else:
    #     with open(file_name, 'rb') as read_file:
    #         saved_ml_pipeline = dill.load(read_file)

    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except:
        pass

    df_titanic_test_dictionaries = df_titanic_test.to_dict('records')

    # 1. make sure the accuracy is the same

    predictions = []
    for row in df_titanic_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict_proba(row)[1])

    print('predictions')
    print(predictions)

    first_score = utils.calculate_brier_score_loss(df_titanic_test.survived,
                                                   predictions)
    print('first_score')
    print(first_score)
    # Make sure our score is good, but not unreasonably good

    lower_bound = -0.215
    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.25

    assert lower_bound < first_score < -0.17

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_titanic_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions
    # That's about 1 millisecond per prediction
    # Assuming we might be running on a test box that's pretty weak, multiply by 3
    # Also make sure we're not running unreasonably quickly
    assert 0.2 < duration.total_seconds() < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time)

    predictions = []
    for row in df_titanic_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict_proba(row)[1])

    print('predictions')
    print(predictions)
    print('df_titanic_test_dictionaries')
    print(df_titanic_test_dictionaries)
    second_score = utils.calculate_brier_score_loss(df_titanic_test.survived,
                                                    predictions)
    print('second_score')
    print(second_score)
    # Make sure our score is good, but not unreasonably good

    assert lower_bound < second_score < -0.17