def test_saving_basic_ensemble_classifier(): np.random.seed(0) df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset( ) ml_predictor = utils.make_titanic_ensemble(df_titanic_train) file_name = ml_predictor.save(str(random.random())) with open(file_name, 'rb') as read_file: saved_ml_pipeline = dill.load(read_file) os.remove(file_name) probas = saved_ml_pipeline.predict_proba(df_titanic_test) probas = [proba[1] for proba in probas] # print(probas) test_score = utils.calculate_brier_score_loss(df_titanic_test.survived, probas) print('test_score') print(test_score) # Very rough ensembles don't do as well on this problem as a standard GradientBoostingClassifier does # Right now we're getting a score of -.22 # Make sure our score is good, but not unreasonably good assert -0.225 < test_score < -0.17
def test_binary_classification_predict_proba_on_Predictor_instance(): np.random.seed(0) df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset() ml_predictor = utils.train_basic_binary_classifier(df_titanic_train) # predictions = ml_predictor.predict_proba(df_titanic_test) predictions = [pred[1] for pred in predictions] test_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) # Make sure our score is good, but not unreasonably good print(test_score) assert -0.16 < test_score < -0.135
def test_user_input_func_classification(model_name=None): np.random.seed(0) df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset( ) def age_bucketing(data): def define_buckets(age): if age <= 17: return 'youth' elif age <= 40: return 'adult' elif age <= 60: return 'adult2' else: return 'over_60' if isinstance(data, dict): data['age_bucket'] = define_buckets(data['age']) else: data['age_bucket'] = data.age.apply(define_buckets) return data column_descriptions = { 'survived': 'output', 'embarked': 'categorical', 'pclass': 'categorical', 'age_bucket': 'categorical' } ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions) ml_predictor.train(df_titanic_train, perform_feature_scaling=False, user_input_func=age_bucketing, model_names=model_name) file_name = ml_predictor.save(str(random.random())) # if model_name == 'DeepLearningClassifier': # from auto_ml.utils_models import load_keras_model # saved_ml_pipeline = load_keras_model(file_name) # else: # with open(file_name, 'rb') as read_file: # saved_ml_pipeline = dill.load(read_file) saved_ml_pipeline = load_ml_model(file_name) os.remove(file_name) try: keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5' os.remove(keras_file_name) except: pass df_titanic_test_dictionaries = df_titanic_test.to_dict('records') # 1. make sure the accuracy is the same predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('first_score') print(first_score) # Make sure our score is good, but not unreasonably good lower_bound = -0.215 if model_name == 'DeepLearningClassifier': lower_bound = -0.237 assert lower_bound < first_score < -0.17 # 2. make sure the speed is reasonable (do it a few extra times) data_length = len(df_titanic_test_dictionaries) start_time = datetime.datetime.now() for idx in range(1000): row_num = idx % data_length saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num]) end_time = datetime.datetime.now() duration = end_time - start_time print('duration.total_seconds()') print(duration.total_seconds()) # It's very difficult to set a benchmark for speed that will work across all machines. # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions # That's about 1 millisecond per prediction # Assuming we might be running on a test box that's pretty weak, multiply by 3 # Also make sure we're not running unreasonably quickly assert 0.2 < duration.total_seconds() < 15 # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time) predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) print('df_titanic_test_dictionaries') print(df_titanic_test_dictionaries) second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('second_score') print(second_score) # Make sure our score is good, but not unreasonably good assert lower_bound < second_score < -0.17
def test_feature_learning_getting_single_predictions_classification( model_name=None): np.random.seed(0) df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset( ) column_descriptions = { 'survived': 'output', 'sex': 'categorical', 'embarked': 'categorical', 'pclass': 'categorical' } ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions) # NOTE: this is bad practice to pass in our same training set as our fl_data set, but we don't have enough data to do it any other way df_titanic_train, fl_data = train_test_split(df_titanic_train, test_size=0.2) ml_predictor.train(df_titanic_train, model_names=model_name, feature_learning=True, fl_data=fl_data) file_name = ml_predictor.save(str(random.random())) saved_ml_pipeline = load_ml_model(file_name) os.remove(file_name) try: keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5' os.remove(keras_file_name) except: pass df_titanic_test_dictionaries = df_titanic_test.to_dict('records') # 1. make sure the accuracy is the same predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('first_score') print(first_score) # Make sure our score is good, but not unreasonably good lower_bound = -0.16 if model_name == 'DeepLearningClassifier': lower_bound = -0.187 assert lower_bound < first_score < -0.133 # 2. make sure the speed is reasonable (do it a few extra times) data_length = len(df_titanic_test_dictionaries) start_time = datetime.datetime.now() for idx in range(1000): row_num = idx % data_length saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num]) end_time = datetime.datetime.now() duration = end_time - start_time print('duration.total_seconds()') print(duration.total_seconds()) # It's very difficult to set a benchmark for speed that will work across all machines. # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions # That's about 1 millisecond per prediction # Assuming we might be running on a test box that's pretty weak, multiply by 3 # Also make sure we're not running unreasonably quickly assert 0.2 < duration.total_seconds() < 15 # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time) predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) print('df_titanic_test_dictionaries') print(df_titanic_test_dictionaries) second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('second_score') print(second_score) # Make sure our score is good, but not unreasonably good assert lower_bound < second_score < -0.133
def test_getting_single_predictions_classification(): df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset( ) ml_predictor = utils.train_basic_binary_classifier(df_titanic_train) file_name = ml_predictor.save() with open(file_name, 'rb') as read_file: saved_ml_pipeline = dill.load(read_file) df_titanic_test_dictionaries = df_titanic_test.to_dict('records') # 1. make sure the accuracy is the same predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('first_score') print(first_score) # Make sure our score is good, but not unreasonably good assert -0.215 < first_score < -0.17 # 2. make sure the speed is reasonable (do it a few extra times) data_length = len(df_titanic_test_dictionaries) start_time = datetime.datetime.now() for idx in range(1000): row_num = idx % data_length saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num]) end_time = datetime.datetime.now() duration = end_time - start_time print('duration.total_seconds()') print(duration.total_seconds()) # It's very difficult to set a benchmark for speed that will work across all machines. # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions # That's about 1 millisecond per prediction # Assuming we might be running on a test box that's pretty weak, multiply by 3 # Also make sure we're not running unreasonably quickly assert 0.2 < duration.total_seconds() < 3 # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time) predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) print('df_titanic_test_dictionaries') print(df_titanic_test_dictionaries) second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('second_score') print(second_score) # Make sure our score is good, but not unreasonably good assert -0.215 < second_score < -0.17
def test_get_basic_ensemble_predictions_one_at_a_time_classifier(): df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset( ) ml_predictor = utils.make_titanic_ensemble(df_titanic_train) file_name = ml_predictor.save() with open(file_name, 'rb') as read_file: saved_ml_pipeline = dill.load(read_file) df_titanic_test_dictionaries = df_titanic_test.to_dict('records') # These predictions take a while. So we'll cut out 80% of our data to make this run much faster df_titanic_test_dictionaries, df_titanic_test_dictionaries_ignored, df_titanic_test, df_titanic_test_ignored = train_test_split( df_titanic_test_dictionaries, df_titanic_test, train_size=0.05, random_state=0) # 1. make sure the accuracy is the same predictions = [] for row in df_titanic_test_dictionaries: prediction = saved_ml_pipeline.predict_proba(row) predictions.append(prediction) first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('first_score') print(first_score) # Make sure our score is good, but not unreasonably good assert -0.235 < first_score < -0.17 # 2. make sure the speed is reasonable (do it a few extra times) # data_length = len(df_titanic_test_dictionaries) # start_time = datetime.datetime.now() # for idx in range(1000): # row_num = idx % data_length # saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num]) # end_time = datetime.datetime.now() # duration = end_time - start_time # print('duration.total_seconds()') # print(duration.total_seconds()) # # It's very difficult to set a benchmark for speed that will work across all machines. # # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions # # That's about 1 millisecond per prediction # # Assuming we might be running on a test box that's pretty weak, multiply by 3 # # Also make sure we're not running unreasonably quickly # assert 0.4 < duration.total_seconds() < 3 # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time) predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)) second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) # Make sure our score is good, but not unreasonably good assert -0.235 < second_score < -0.17
def getting_single_predictions_classifier_test(): np.random.seed(0) df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset() column_descriptions = { 'survived': 'output' , 'sex': 'categorical' , 'embarked': 'categorical' , 'pclass': 'categorical' , 'age_bucket': 'categorical' } ensemble_config = [ { 'model_name': 'LGBMClassifier' } , { 'model_name': 'RandomForestClassifier' } ] ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions) ml_predictor.train(df_titanic_train, ensemble_config=ensemble_config) file_name = ml_predictor.save(str(random.random())) saved_ml_pipeline = load_ml_model(file_name) os.remove(file_name) try: keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5' os.remove(keras_file_name) except: pass df_titanic_test_dictionaries = df_titanic_test.to_dict('records') # 1. make sure the accuracy is the same predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('first_score') print(first_score) # Make sure our score is good, but not unreasonably good lower_bound = -0.16 assert -0.15 < first_score < -0.135 # 2. make sure the speed is reasonable (do it a few extra times) data_length = len(df_titanic_test_dictionaries) start_time = datetime.datetime.now() for idx in range(1000): row_num = idx % data_length saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num]) end_time = datetime.datetime.now() duration = end_time - start_time print('duration.total_seconds()') print(duration.total_seconds()) # It's very difficult to set a benchmark for speed that will work across all machines. # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions # That's about 1 millisecond per prediction # Assuming we might be running on a test box that's pretty weak, multiply by 3 # Also make sure we're not running unreasonably quickly assert 0.2 < duration.total_seconds() < 60 # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time) predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) print('df_titanic_test_dictionaries') print(df_titanic_test_dictionaries) second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('second_score') print(second_score) # Make sure our score is good, but not unreasonably good assert -0.15 < second_score < -0.135
def train_old_model(): print('auto_ml_version') print(auto_ml_version) if auto_ml_version > '2.1.6': raise(TypeError) np.random.seed(0) df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset() column_descriptions = { 'survived': 'output' , 'sex': 'categorical' , 'embarked': 'categorical' , 'pclass': 'categorical' } ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions) ml_predictor.train(df_titanic_train) file_name = ml_predictor.save('trained_ml_model_v_2_1_6.dill') saved_ml_pipeline = load_ml_model(file_name) df_titanic_test_dictionaries = df_titanic_test.to_dict('records') # 1. make sure the accuracy is the same predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) # Make sure our score is good, but not unreasonably good lower_bound = -0.16 assert -0.16 < first_score < -0.135 # 2. make sure the speed is reasonable (do it a few extra times) data_length = len(df_titanic_test_dictionaries) start_time = datetime.datetime.now() for idx in range(1000): row_num = idx % data_length saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num]) end_time = datetime.datetime.now() duration = end_time - start_time print('duration.total_seconds()') print(duration.total_seconds()) # It's very difficult to set a benchmark for speed that will work across all machines. # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions # That's about 1 millisecond per prediction # Assuming we might be running on a test box that's pretty weak, multiply by 3 # Also make sure we're not running unreasonably quickly assert 0.2 < duration.total_seconds() < 15 # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time) predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) # Make sure our score is good, but not unreasonably good assert -0.16 < second_score < -0.135
def test_backwards_compatibility_with_version_2_1_6(): np.random.seed(0) print('auto_ml_version') print(auto_ml_version) if auto_ml_version <= '2.9.0': raise(TypeError) df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset() saved_ml_pipeline = load_ml_model(os.path.join('tests', 'backwards_compatibility_tests', 'trained_ml_model_v_2_1_6.dill')) df_titanic_test_dictionaries = df_titanic_test.to_dict('records') # 1. make sure the accuracy is the same predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('first_score') print(first_score) # Make sure our score is good, but not unreasonably good lower_bound = -0.215 assert lower_bound < first_score < -0.17 # 2. make sure the speed is reasonable (do it a few extra times) data_length = len(df_titanic_test_dictionaries) start_time = datetime.datetime.now() for idx in range(1000): row_num = idx % data_length saved_ml_pipeline.predict(df_titanic_test_dictionaries[row_num]) end_time = datetime.datetime.now() duration = end_time - start_time print('duration.total_seconds()') print(duration.total_seconds()) # It's very difficult to set a benchmark for speed that will work across all machines. # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions # That's about 1 millisecond per prediction # Assuming we might be running on a test box that's pretty weak, multiply by 3 # Also make sure we're not running unreasonably quickly assert 0.2 < duration.total_seconds() < 15 # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time) predictions = [] for row in df_titanic_test_dictionaries: predictions.append(saved_ml_pipeline.predict_proba(row)[1]) print('predictions') print(predictions) print('df_titanic_test_dictionaries') print(df_titanic_test_dictionaries) second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions) print('second_score') print(second_score) # Make sure our score is good, but not unreasonably good assert lower_bound < second_score < -0.17