def test_categorical_ensemble_basic_classifier():
    """Train a categorical ensemble keyed on 'pclass' and check its test score.

    Seeds numpy's RNG, trains with ``train_categorical_ensemble`` (final-model
    optimization disabled) and asserts the score on the test frame lies in
    the open interval (-0.226, -0.17).
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'pclass': 'categorical',
            'embarked': 'categorical',
        },
    )
    predictor.train_categorical_ensemble(
        train_df,
        categorical_column='pclass',
        optimize_final_model=False,
    )

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    # The sample is small, so the score is noisy; hence the loose window.
    assert -0.226 < score < -0.17
def test_perform_feature_scaling_false_regression(model_name=None):
    """Train a regressor with feature scaling turned off and bound its score.

    Args:
        model_name: Forwarded to ``train`` as ``model_names``; also selects a
            looser lower bound for 'DeepLearningRegressor' and 'LGBMRegressor'.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )
    predictor.train(train_df, perform_feature_scaling=False, model_names=model_name)

    score = predictor.score(test_df, test_df.MEDV)
    print('test_score')
    print(score)

    # Per-model tolerance: some models are allowed a worse score.
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95
    elif model_name == 'DeepLearningRegressor':
        lower_bound = -8.8
    else:
        lower_bound = -3.2

    assert lower_bound < score < -2.8
def test_calibrate_final_model_missing_X_test_y_test_classification():
    """Request calibration without passing calibration data to ``train``.

    The test frame is split as if a calibration set were going to be used, but
    only ``calibrate_final_model=True`` is passed to ``train``. Training is
    expected to complete anyway, and the score must lie in (-0.215, -0.17).
    """
    np.random.seed(0)

    train_df, full_test_df = utils.get_titanic_binary_classification_dataset()

    # Take a third of our test data (a tenth of our overall data) for
    # calibration. The calibration part is deliberately never handed to train().
    test_df, _calibration_df = train_test_split(
        full_test_df, test_size=0.33, random_state=42)

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )

    # This should still work, just with a warning printed.
    predictor.train(train_df, calibrate_final_model=True)

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    assert -0.215 < score < -0.17
def test_input_df_unmodified():
    """Check that ``train`` leaves the shape of the input DataFrame unchanged.

    Records the training frame's shape before training, compares both
    dimensions afterwards, and finally bounds the test score to (-3.35, -2.8).
    """
    np.random.seed(42)

    train_df, test_df = utils.get_boston_regression_dataset()

    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )

    shape_before = train_df.shape
    predictor.train(train_df)
    shape_after = train_df.shape

    # Row count and column count must both survive training.
    assert shape_after[0] == shape_before[0]
    assert shape_after[1] == shape_before[1]

    score = predictor.score(test_df, test_df.MEDV)
    print('test_score')
    print(score)

    assert -3.35 < score < -2.8
def optimize_final_model_classification(model_name=None):
    """Train a classifier with ``optimize_final_model=True`` and bound its score.

    Args:
        model_name: Forwarded to ``train`` as ``model_names``;
            'DeepLearningClassifier' gets a looser lower bound.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train(train_df, optimize_final_model=True, model_names=model_name)

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    # Small sample sizes mean there's a fair bit of noise here.
    lower_bound = -0.25 if model_name == 'DeepLearningClassifier' else -0.215

    assert lower_bound < score < -0.17
def test_perform_feature_scaling_false_classification(model_name=None):
    """Train a classifier with feature scaling turned off and bound its score.

    Args:
        model_name: Forwarded to ``train`` as ``model_names``;
            'DeepLearningClassifier' gets a looser lower bound.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train(train_df, perform_feature_scaling=False, model_names=model_name)

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    lower_bound = -0.235 if model_name == 'DeepLearningClassifier' else -0.215

    assert lower_bound < score < -0.17
def test_verify_features_finds_no_missing_features_when_none_are_missing():
    """Verify a saved model reports no feature mismatches on the test frame.

    Trains with ``verify_features=True``, saves the pipeline, reloads it with
    dill, deletes the saved file, and asserts that both lists returned by the
    final model's ``verify_features`` are empty.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
            'sex': 'categorical',
        },
    )
    predictor.train(train_df, verify_features=True)

    # Round-trip the trained pipeline through disk under a random file name.
    saved_path = predictor.save(str(random.random()))
    with open(saved_path, 'rb') as handle:
        pipeline = dill.load(handle)
    os.remove(saved_path)

    final_model = pipeline.named_steps['final_model']
    missing_features = final_model.verify_features(test_df)

    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    prediction_only_count = len(missing_features['prediction_not_training'])
    print(prediction_only_count)

    print("len(missing_features['training_not_prediction'])")
    training_only_count = len(missing_features['training_not_prediction'])
    print(training_only_count)

    assert prediction_only_count == 0
    assert training_only_count == 0
def categorical_ensembling_regression(model_name=None):
    """Train a categorical ensemble on 'CHAS' with feature selection enabled.

    Args:
        model_name: Forwarded to ``train_categorical_ensemble`` as
            ``model_names``; 'DeepLearningRegressor' and 'LGBMRegressor' get
            looser lower bounds on the score.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )
    predictor.train_categorical_ensemble(
        train_df,
        perform_feature_selection=True,
        model_names=model_name,
        categorical_column='CHAS',
    )

    score = predictor.score(test_df, test_df.MEDV)
    print('test_score')
    print(score)

    # Bumping this up since without these features our score drops.
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95
    elif model_name == 'DeepLearningRegressor':
        lower_bound = -19
    else:
        lower_bound = -4.0

    assert lower_bound < score < -2.8
def test_include_bad_y_vals_predict_classification(model_name=None):
    """Score a classifier on test data whose labels contain NaN, inf and None.

    Three 'survived' values in the test frame are overwritten with invalid
    labels before scoring. The test rows are passed to ``score`` as a list of
    dictionaries, and the resulting score must lie in (-0.215, -0.17).

    Args:
        model_name: Forwarded to ``train`` as ``model_names``.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # `DataFrame.ix` was deprecated and later removed from pandas; `.loc` is
    # the label-based replacement.
    # NOTE(review): assumes the test frame has an integer index, in which case
    # `.ix` was label-based too - confirm against the dataset loader.
    df_titanic_test.loc[1, 'survived'] = float('nan')
    df_titanic_test.loc[8, 'survived'] = float('inf')
    df_titanic_test.loc[26, 'survived'] = None

    ml_predictor.train(df_titanic_train, model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'),
                                    df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
def test_throws_warning_when_fl_data_equals_df_train():
    """Expect exactly one warning when ``fl_data`` is the training frame itself.

    Training is run with ``feature_learning=True`` and the training frame also
    passed as ``fl_data``, while warnings are being recorded.
    """
    train_df, _test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )

    with warnings.catch_warnings(record=True) as caught:
        try:
            predictor.train(train_df, feature_learning=True, fl_data=train_df)
        except KeyError:
            # A KeyError raised by training is tolerated; the assertion below
            # is only about the recorded warnings.
            pass

        for entry in caught:
            print(entry)

        assert len(caught) == 1
def optimize_final_model_regression(model_name=None):
    """Train a regressor with ``optimize_final_model=True`` and bound its score.

    Args:
        model_name: Forwarded to ``train`` as ``model_names``; several models
            get their own, looser lower bound on the score.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )
    predictor.train(train_df, optimize_final_model=True, model_names=model_name)

    score = predictor.score(test_df, test_df.MEDV)
    print('test_score')
    print(score)

    # The random seed gets a score of -3.21 on python 3.5.
    # There's a ton of noise here, due to small sample sizes.
    if model_name == 'GradientBoostingRegressor':
        lower_bound = -3.5
    elif model_name == 'LGBMRegressor':
        lower_bound = -5.5
    elif model_name == 'DeepLearningRegressor':
        lower_bound = -20
    else:
        lower_bound = -3.4

    assert lower_bound < score < -2.8
def train_basic_regressor(df_boston_train):
    """Build a regressor ``Predictor``, train it quietly, and return it.

    Args:
        df_boston_train: Training data passed straight to ``train``; 'MEDV' is
            declared as the output column and 'CHAS' as categorical.

    Returns:
        The trained ``Predictor`` instance.
    """
    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )
    predictor.train(df_boston_train, verbose=False)
    return predictor
def train_basic_binary_classifier(df_titanic_train):
    """Build a classifier ``Predictor``, train it, and return it.

    Args:
        df_titanic_train: Training data passed straight to ``train``;
            'survived' is the output column, 'embarked' and 'pclass' are
            declared categorical.

    Returns:
        The trained ``Predictor`` instance.
    """
    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train(df_titanic_train)
    return predictor
def train_basic_multilabel_classifier(df_twitter_train):
    """Build a classifier ``Predictor`` for the twitter data, train and return it.

    Args:
        df_twitter_train: Training data passed straight to ``train``;
            'airline_sentiment' is the output column and 'text' is ignored.

    Returns:
        The trained ``Predictor`` instance.
    """
    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'airline_sentiment': 'output',
            'airline': 'categorical',
            'text': 'ignore',
            'tweet_location': 'categorical',
            'user_timezone': 'categorical',
            'tweet_created': 'date',
        },
    )
    predictor.train(df_twitter_train)
    return predictor
def test_verify_features_finds_missing_training_features():
    """Verify a saved model flags a feature that was absent during training.

    The 'sibsp' column is dropped from the training frame only, so the
    reloaded model's ``verify_features`` should report exactly one entry under
    'prediction_not_training' and none under 'training_not_prediction'.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
        'sex': 'categorical',
    }

    # Remove the "sibsp" column from our training data
    df_titanic_train = df_titanic_train.drop('sibsp', axis=1)

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)

    # Best-effort cleanup of a companion keras file that may not exist. Only
    # filesystem errors are ignored; anything else (including
    # KeyboardInterrupt) must propagate.
    keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
    try:
        os.remove(keras_file_name)
    except OSError:
        pass

    missing_features = saved_ml_pipeline.named_steps['final_model'].verify_features(
        df_titanic_test)

    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))

    assert len(missing_features['prediction_not_training']) == 1
    assert len(missing_features['training_not_prediction']) == 0
def test_perform_feature_scaling_true_regression(model_name=None):
    """Train a regressor with feature scaling enabled and bound its score.

    Args:
        model_name: Accepted but not used; it is not forwarded to ``train``,
            so the default model selection always applies.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )
    predictor.train(train_df, perform_feature_scaling=True)

    score = predictor.score(test_df, test_df.MEDV)
    print('test_score')
    print(score)

    assert -3.2 < score < -2.8
def test_compare_all_models_regression():
    """Train a regressor with ``compare_all_models=True`` and bound its score.

    The score on the test frame must lie in (-3.35, -2.8).
    """
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )
    predictor.train(train_df, compare_all_models=True)

    score = predictor.score(test_df, test_df.MEDV)
    print('test_score')
    print(score)

    assert -3.35 < score < -2.8
def test_all_algos_classification(model_name=None):
    """Train with an explicit list of classifier names and bound the score.

    Args:
        model_name: Accepted but not used; the list of model names below is
            always passed to ``train``.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )

    classifier_names = [
        'LogisticRegression',
        'RandomForestClassifier',
        'RidgeClassifier',
        'GradientBoostingClassifier',
        'ExtraTreesClassifier',
        'AdaBoostClassifier',
        'SGDClassifier',
        'Perceptron',
        'PassiveAggressiveClassifier',
        'DeepLearningClassifier',
        'XGBClassifier',
        'LGBMClassifier',
    ]
    predictor.train(train_df, model_names=classifier_names)

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    assert -0.215 < score < -0.17
def test_all_algos_regression():
    """Train with an explicit list of regressor names and bound the score.

    The score on the test frame must lie in (-3.35, -2.8).
    """
    # A random seed of 42 has ExtraTreesRegressor getting the best CV score,
    # and that model doesn't generalize as well as GradientBoostingRegressor.
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()

    predictor = Predictor(
        type_of_estimator='regressor',
        column_descriptions={'MEDV': 'output', 'CHAS': 'categorical'},
    )

    regressor_names = [
        'LinearRegression',
        'RandomForestRegressor',
        'Ridge',
        'GradientBoostingRegressor',
        'ExtraTreesRegressor',
        'AdaBoostRegressor',
        'SGDRegressor',
        'PassiveAggressiveRegressor',
        'Lasso',
        'LassoLars',
        'ElasticNet',
        'OrthogonalMatchingPursuit',
        'BayesianRidge',
        'ARDRegression',
        'MiniBatchKMeans',
        'DeepLearningRegressor',
    ]
    predictor.train(train_df, model_names=regressor_names)

    score = predictor.score(test_df, test_df.MEDV)
    print('test_score')
    print(score)

    assert -3.35 < score < -2.8
def test_list_of_single_model_name_classification(model_name=None):
    """Pass ``model_names`` as a one-element list and bound the score.

    Args:
        model_name: Wrapped in a single-item list and passed to ``train`` as
            ``model_names``.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )

    single_name_list = [model_name]
    predictor.train(train_df, model_names=single_name_list)

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    assert -0.215 < score < -0.17
def test_linear_model_analytics_classification(model_name=None):
    """Train a 'RidgeClassifier' and bound its score with a wide window.

    Args:
        model_name: Accepted but not used; 'RidgeClassifier' is always the
            model passed to ``train``.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train(train_df, model_names='RidgeClassifier')

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    # Linear models aren't super great on this dataset...
    assert -0.37 < score < -0.17
def test_bad_val_for_type_of_estimator():
    """Construct a ``Predictor`` with an unrecognised ``type_of_estimator``.

    NOTE(review): the body contains no assertion; presumably the constructor
    is expected to raise and the expectation is declared outside this function
    (e.g. by a decorator) - confirm.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    # The 'survived': 'output' entry is intentionally left out here.
    descriptions = {
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(
        type_of_estimator='invalid_type_of_estimator',
        column_descriptions=descriptions,
    )
def test_missing_output_col_in_column_descriptions():
    """Construct a ``Predictor`` whose column descriptions name no output column.

    NOTE(review): the body contains no assertion; presumably the constructor
    is expected to raise and the expectation is declared outside this function
    (e.g. by a decorator) - confirm.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    # The 'survived': 'output' entry is intentionally left out here.
    descriptions = {
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=descriptions,
    )
def categorical_ensembling_classification(model_name=None):
    """Train a categorical ensemble keyed on 'embarked' and bound its score.

    Args:
        model_name: Forwarded to ``train_categorical_ensemble`` as
            ``model_names``; several models get their own lower bound.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train_categorical_ensemble(
        train_df,
        model_names=model_name,
        categorical_column='embarked',
    )

    score = predictor.score(test_df, test_df.survived)
    print('test_score')
    print(score)

    # Per-model tolerance on how low the score may go.
    if model_name == 'GradientBoostingClassifier':
        lower_bound = -0.23
    elif model_name == 'LGBMClassifier':
        lower_bound = -0.22
    elif model_name == 'XGBClassifier':
        lower_bound = -0.235
    elif model_name == 'DeepLearningClassifier':
        lower_bound = -0.24
    else:
        lower_bound = -0.215

    assert lower_bound < score < -0.17
def test_bad_val_in_column_descriptions():
    """Expect exactly one warning for an unrecognised column description value.

    'fare' is mapped to 'this_is_a_bad_value'; constructing the ``Predictor``
    while recording warnings must yield a single recorded warning.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
        'fare': 'this_is_a_bad_value',
    }

    with warnings.catch_warnings(record=True) as caught:
        ml_predictor = Predictor(
            type_of_estimator='classifier',
            column_descriptions=descriptions,
        )
        print('we should be throwing a warning for the user to give them useful feedback')

        assert len(caught) == 1
def test_unexpected_datetime_column_handled_without_errors():
    """Predict on a row carrying extra date/datetime keys unseen in training.

    The test passes as long as ``predict`` does not raise.
    """
    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train(train_df)

    # Take the first row of a 10% sample and add two keys the model never saw.
    sampled_rows = test_df.sample(frac=0.1).to_dict('records')
    row = sampled_rows[0]
    row['unexpected_column'] = datetime.date.today()
    row['anoter_unexpected_column'] = datetime.datetime.today()

    predictor.predict(row)

    # We want to make sure the above does not throw an error.
    assert True
def feature_learning_categorical_ensembling_getting_single_predictions_regression(
        model_name=None):
    """Check single-row predictions from a saved categorical-ensemble regressor.

    Trains a categorical ensemble on 'CHAS', saves and reloads it, then:
      1. scores row-by-row predictions against a model-specific window,
      2. times 1000 single-row predictions,
      3. re-scores to confirm the input dictionaries were not modified.

    Args:
        model_name: Forwarded to ``train_categorical_ensemble`` as
            ``model_names``; also selects the lower bound for the score.
    """
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    # NOTE: carving fl_data out of our training set is not good practice, but
    # we don't have enough data to do it any other way.
    df_boston_train, fl_data = train_test_split(df_boston_train, test_size=0.2)
    ml_predictor.train_categorical_ensemble(
        df_boston_train,
        model_names=model_name,
        feature_learning=False,
        fl_data=fl_data,
        categorical_column='CHAS',
    )

    file_name = ml_predictor.save(str(random.random()))

    from quantile_ml.utils_models import load_ml_model

    saved_ml_pipeline = load_ml_model(file_name)

    os.remove(file_name)
    # Best-effort cleanup of a companion keras file that may not exist. Only
    # filesystem errors are ignored; anything else must propagate.
    keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
    try:
        os.remove(keras_file_name)
    except OSError:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_boston_test_dictionaries]

    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    lower_bound = -3.2
    if model_name == 'DeepLearningRegressor':
        lower_bound = -21.5
    if model_name == 'LGBMRegressor':
        lower_bound = -5.1
    if model_name == 'XGBRegressor':
        lower_bound = -3.6
    if model_name == 'GradientBoostingRegressor':
        lower_bound = -3.6

    assert lower_bound < first_score < -2.8

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_boston_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across
    # all machines. On my 2013 bottom of the line 15" MacBook Pro, this runs
    # in about 0.8 seconds for 1000 predictions, i.e. about 1 millisecond per
    # prediction. The upper bound is kept generous for weak test boxes.
    # Also make sure we're not running unreasonably quickly.
    assert 0.2 < duration.total_seconds() < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_boston_test_dictionaries]

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < -2.8
def test_getting_single_predictions_nlp_date_multilabel_classification(model_name=None):
    """Check single-row predictions from a saved multilabel twitter classifier.

    Trains on the twitter sentiment data (with an 'nlp' and a 'date' column),
    saves and reloads the model, then:
      1. scores row-by-row predictions with ``accuracy_score``,
      2. times 1000 single-row predictions,
      3. re-scores to confirm the input dictionaries were not modified.

    Args:
        model_name: Forwarded to ``train`` as ``model_names``. The test
            returns immediately for 'DeepLearningClassifier'.
    """
    # quantile_ml does not support multilabel classification for deep learning
    # at the moment
    if model_name == 'DeepLearningClassifier':
        return

    np.random.seed(0)

    df_twitter_train, df_twitter_test = (
        utils.get_twitter_sentiment_multilabel_classification_dataset())

    column_descriptions = {
        'airline_sentiment': 'output',
        'airline': 'categorical',
        'text': 'nlp',
        'tweet_location': 'categorical',
        'user_timezone': 'categorical',
        'tweet_created': 'date',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_twitter_train, model_names=model_name)

    file_name = ml_predictor.save(str(random.random()))

    saved_ml_pipeline = load_ml_model(file_name)

    os.remove(file_name)
    # Best-effort cleanup of a companion keras file that may not exist. Only
    # filesystem errors are ignored; anything else must propagate.
    keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
    try:
        os.remove(keras_file_name)
    except OSError:
        pass

    df_twitter_test_dictionaries = df_twitter_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_twitter_test_dictionaries]

    print('predictions')
    print(predictions)

    first_score = accuracy_score(df_twitter_test.airline_sentiment, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    lower_bound = 0.73
    assert lower_bound < first_score < 0.79

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_twitter_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_twitter_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across
    # all machines. On my 2013 bottom of the line 15" MacBook Pro, this runs
    # in about 0.8 seconds for 1000 predictions, i.e. about 1 millisecond per
    # prediction. The upper bound is kept generous for weak test boxes.
    # Also make sure we're not running unreasonably quickly.
    assert 0.2 < duration.total_seconds() < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_twitter_test_dictionaries]

    print('predictions')
    print(predictions)
    print('df_twitter_test_dictionaries')
    print(df_twitter_test_dictionaries)

    second_score = accuracy_score(df_twitter_test.airline_sentiment, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < 0.79
def getting_single_predictions_classification(model_name=None):
    """Check single-row predictions from a saved binary titanic classifier.

    Trains a classifier, saves and reloads it, then:
      1. scores row-by-row ``predict_proba`` outputs (element 1 of each
         result) with ``utils.calculate_brier_score_loss``,
      2. times 1000 single-row ``predict`` calls,
      3. re-scores to confirm the input dictionaries were not modified.

    Args:
        model_name: Forwarded to ``train`` as ``model_names``;
            'DeepLearningClassifier' gets a looser lower bound on the score.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, model_names=model_name)

    file_name = ml_predictor.save(str(random.random()))

    saved_ml_pipeline = load_ml_model(file_name)

    os.remove(file_name)
    # Best-effort cleanup of a companion keras file that may not exist. Only
    # filesystem errors are ignored; anything else must propagate.
    keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
    try:
        os.remove(keras_file_name)
    except OSError:
        pass

    df_titanic_test_dictionaries = df_titanic_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)

    first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    lower_bound = -0.215
    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.25

    assert lower_bound < first_score < -0.17

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_titanic_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across
    # all machines. On my 2013 bottom of the line 15" MacBook Pro, this runs
    # in about 0.8 seconds for 1000 predictions, i.e. about 1 millisecond per
    # prediction. The upper bound is kept generous for weak test boxes.
    # Also make sure we're not running unreasonably quickly.
    assert 0.2 < duration.total_seconds() < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)
    print('df_titanic_test_dictionaries')
    print(df_titanic_test_dictionaries)

    second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < -0.17