def print_training_summary(self, gs):
    """Print a report of a finished grid search to stdout.

    Always prints the best cross-validated score and the best parameter
    combination. When ``self.verbose`` is truthy, every parameter combination
    that was tried is printed as well, ordered from highest to lowest by the
    second element of each ``gs.grid_scores_`` entry.

    Args:
        gs: a fitted search object exposing ``best_score_``, ``best_params_``
            and (only read when verbose) ``grid_scores_``.
    """
    print('The best CV score from GridSearchCV (by default averaging across k-fold CV) for '
          + self.output_column + ' is:')
    if self.took_log_of_y:
        print(' Note that this score is calculated using the natural logs of the y values.')
    print(gs.best_score_)
    print('The best params were')

    # A model object is hard to scan in a printed dict, so show its name instead.
    best_params = gs.best_params_
    if 'model' in best_params:
        printable = {
            name: (utils_models.get_name_from_model(value) if name == 'model' else value)
            for name, value in best_params.items()
        }
    else:
        printable = best_params
    print(printable)

    if not self.verbose:
        return

    print('Here are all the hyperparameters that were tried:')
    ranked = list(gs.grid_scores_)
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    for entry in ranked:
        tried_params = entry[0]
        # NOTE: this swaps the model object for its name inside the search's
        # own parameter dict (in place), exactly as the report always has.
        if 'model' in tried_params:
            tried_params['model'] = utils_models.get_name_from_model(tried_params['model'])
        print(entry)
def fit(self, X, y):
    """Fit the wrapped model on ``X`` and ``y`` and return ``self``.

    Sparse input is converted to a dense matrix up front for DeepLearning
    models and for the listed estimators. If fitting raises ``TypeError``,
    the fit is retried once after densifying any still-sparse input.

    Args:
        X: feature matrix; may be a scipy sparse matrix.
        y: target values, passed straight through to the model.

    Returns:
        self, so calls can be chained.

    Raises:
        TypeError: if the retry after densifying fails as well.
    """
    self.model_name = get_name_from_model(self.model)

    X_fit = X
    needs_dense_input = (
        self.model_name[:12] == 'DeepLearning'
        or self.model_name in [
            'BayesianRidge',
            'LassoLars',
            'OrthogonalMatchingPursuit',
            'ARDRegression',
            'Perceptron',
            'PassiveAggressiveClassifier',
            'SGDClassifier',
            'RidgeClassifier',
            'LogisticRegression',
        ]
    )
    if needs_dense_input and scipy.sparse.issparse(X_fit):
        X_fit = X_fit.todense()

    try:
        self.model.fit(X_fit, y)
    except TypeError:
        # Fall back to a dense matrix and retry. The retry previously called
        # fit() with no arguments, which always failed and masked the
        # original error.
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()
        self.model.fit(X_fit, y)

    return self
def fit(self, X, y):
    """Train the wrapped model on ``X``/``y`` and return ``self``.

    Sparse input is densified up front for DeepLearning models and for the
    listed estimators. A DeepLearning model is re-wrapped in a Keras
    regressor/classifier wrapper that is told the number of input columns,
    then trained with early stopping on the training loss. A ``TypeError``
    during fitting triggers one plain retry on densified input; a
    ``KeyboardInterrupt`` during fitting is swallowed silently.

    Raises:
        TypeError: a DeepLearning model was requested while
            ``keras_installed`` is falsy.
    """
    self.model_name = get_name_from_model(self.model)
    is_deep_learning = self.model_name.startswith('DeepLearning')
    dense_only_models = (
        'BayesianRidge',
        'LassoLars',
        'OrthogonalMatchingPursuit',
        'ARDRegression',
        'Perceptron',
        'PassiveAggressiveClassifier',
        'SGDClassifier',
        'RidgeClassifier',
        'LogisticRegression',
    )

    X_fit = X
    if is_deep_learning or self.model_name in dense_only_models:
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()

    if is_deep_learning:
        if not keras_installed:
            print('WARNING: We did not detect that Keras was available.')
            raise TypeError(
                'A DeepLearning model was requested, but Keras was not available to import')

        # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
        num_cols = X_fit.shape[1]
        model_params = self.model.get_params()
        model_params.pop('build_fn')

        if self.type_of_estimator == 'regressor':
            self.model = KerasRegressor(
                build_fn=utils_models.make_deep_learning_model,
                num_cols=num_cols,
                **model_params)
        elif self.type_of_estimator == 'classifier':
            self.model = KerasClassifier(
                build_fn=utils_models.make_deep_learning_classifier,
                num_cols=num_cols,
                **model_params)

    try:
        if is_deep_learning:
            print('Stopping training early if we have not seen an improvement in training accuracy in 25 epochs')
            from keras.callbacks import EarlyStopping
            stopper = EarlyStopping(monitor='loss', patience=25, verbose=1)
            self.model.fit(X_fit, y, callbacks=[stopper])
        else:
            self.model.fit(X_fit, y)
    except TypeError:
        # Retry once on a dense matrix (without callbacks).
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()
        self.model.fit(X_fit, y)
    except KeyboardInterrupt:
        # Interrupting training is tolerated; whatever state the model is in is kept.
        pass

    return self
def fit(self, X, y):
    """Fit the wrapped model using a strategy chosen from its name.

    Strategies, selected by the prefix of ``self.model_name``:

    * ``DeepLearning``: lazily imports Keras, re-wraps the model in a Keras
      regressor/classifier wrapper, and trains with early stopping on
      ``val_loss`` against the holdout returned by ``self.get_X_test``.
      Outside of a hyperparameter search the best epoch is checkpointed to a
      temporary ``.h5`` file and reloaded into ``self.model`` afterwards.
    * ``LGBM``: trains with an eval set and 100 early-stopping rounds.
    * ``CatBoost``: trains on a dense array, switching to the ``MultiClass``
      loss when a classifier sees more than two distinct target values.
    * ``GradientBoosting``: grows the ensemble step by step with warm starts,
      keeps a copy of the best-scoring model on the holdout set, and stops
      after 20 rounds without sufficient improvement.
    * anything else: a plain ``self.model.fit``.

    Compared with the previous implementation this version:

    * picks the checkpoint file name before training starts, so the
      ``KeyboardInterrupt`` handler can never hit an unbound name;
    * only reloads the checkpoint if the file actually exists;
    * treats ``self.is_hp_search`` by truthiness everywhere (it used to be
      tested three different ways, so ``None`` wrote a checkpoint that was
      never loaded);
    * tolerates dense input where ``.toarray()`` used to be called
      unconditionally (LGBM, CatBoost, GradientBoosting).

    Args:
        X: feature matrix; may be sparse or dense.
        y: target values.

    Returns:
        self.
    """
    global keras_imported, KerasRegressor, KerasClassifier, EarlyStopping, ModelCheckpoint, TerminateOnNaN, keras_load_model

    def _to_dense_array(data):
        # Sparse matrices expose ``toarray``; anything else is returned untouched.
        try:
            return data.toarray()
        except AttributeError:
            return data

    self.model_name = get_name_from_model(self.model)
    is_deep_learning = self.model_name[:12] == 'DeepLearning'
    is_hp_search = bool(self.is_hp_search)

    X_fit = X
    if is_deep_learning or self.model_name in [
            'BayesianRidge',
            'LassoLars',
            'OrthogonalMatchingPursuit',
            'ARDRegression',
            'Perceptron',
            'PassiveAggressiveClassifier',
            'SGDClassifier',
            'RidgeClassifier',
            'LogisticRegression',
    ]:
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()

    if is_deep_learning:
        if not keras_imported:
            # Suppress some level of logs
            os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
            from keras.callbacks import EarlyStopping, ModelCheckpoint, TerminateOnNaN
            from keras.models import load_model as keras_load_model
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
        num_cols = X_fit.shape[1]

        model_params = self.model.get_params()
        del model_params['build_fn']
        # Both of these are passed explicitly below; drop stale copies if present.
        model_params.pop('feature_learning', None)
        model_params.pop('num_cols', None)

        if self.type_of_estimator == 'regressor':
            self.model = KerasRegressor(
                build_fn=utils_models.make_deep_learning_model,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)
        elif self.type_of_estimator == 'classifier':
            self.model = KerasClassifier(
                build_fn=utils_models.make_deep_learning_classifier,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)

        if is_hp_search:
            patience = 5
            verbose = 0
        else:
            patience = 25
            verbose = 2

        # Chosen before training starts so that the cleanup below can always
        # refer to it, even if training is interrupted very early.
        now_time = datetime.datetime.now()
        time_string = '_'.join(
            str(part) for part in (now_time.year, now_time.month, now_time.day,
                                   now_time.hour, now_time.minute))
        temp_file_name = ('tmp_dl_model_checkpoint_' + time_string
                          + str(random.random()) + '.h5')

        try:
            X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)
            X_test = _to_dense_array(X_test)

            if not is_hp_search:
                print('\nWe will stop training early if we have not seen an improvement in validation accuracy in {} epochs'
                      .format(patience))
                print('To measure validation accuracy, we will split off a random 10 percent of your training data set')

            early_stopping = EarlyStopping(monitor='val_loss', patience=patience, verbose=verbose)
            terminate_on_nan = TerminateOnNaN()
            callbacks = [early_stopping, terminate_on_nan]
            if not is_hp_search:
                model_checkpoint = ModelCheckpoint(
                    temp_file_name,
                    monitor='val_loss',
                    save_best_only=True,
                    mode='min',
                    period=1)
                callbacks.append(model_checkpoint)

            self.model.fit(
                X_fit,
                y,
                callbacks=callbacks,
                validation_data=(X_test, y_test),
                verbose=verbose)

            # TODO: give some kind of logging on how the model did here! best epoch, best accuracy, etc.
        except KeyboardInterrupt:
            print('Stopping training at this point because we heard a KeyboardInterrupt')
            print('If the deep learning model is functional at this point, we will output the model in its latest form')
            print('Note that this feature is an unofficial beta-release feature that is known to fail on occasion')

        # Restore the best checkpoint (if one was written) and clean up the
        # temporary file. Without a checkpoint the model is kept as trained.
        if not is_hp_search and os.path.exists(temp_file_name):
            self.model = keras_load_model(temp_file_name)
        try:
            os.remove(temp_file_name)
        except OSError:
            pass

    elif self.model_name[:4] == 'LGBM':
        X_fit = _to_dense_array(X_fit)

        X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)
        X_test = _to_dense_array(X_test)

        if self.type_of_estimator == 'regressor':
            eval_metric = 'rmse'
        elif self.type_of_estimator == 'classifier':
            if len(set(y_test)) > 2:
                eval_metric = 'multi_logloss'
            else:
                eval_metric = 'binary_logloss'

        if self.X_test is not None:
            eval_name = 'X_test_the_user_passed_in'
        else:
            eval_name = 'random_holdout_set_from_training_data'

        fit_kwargs = {
            'eval_set': [(X_test, y_test)],
            'early_stopping_rounds': 100,
            'eval_metric': eval_metric,
            'eval_names': [eval_name],
            # Stay quiet while searching hyperparameters.
            'verbose': not is_hp_search,
        }
        cat_feature_indices = self.get_categorical_feature_indices()
        if cat_feature_indices is not None:
            fit_kwargs['categorical_feature'] = cat_feature_indices

        self.model.fit(X_fit, y, **fit_kwargs)

    elif self.model_name[:8] == 'CatBoost':
        X_fit = _to_dense_array(X_fit)

        if self.type_of_estimator == 'classifier' and len(pd.Series(y).unique()) > 2:
            # TODO: we might have to modify the format of the y values, converting them all to ints, then back again (sklearn has a useful inverse_transform on some preprocessing classes)
            self.model.set_params(loss_function='MultiClass')

        cat_feature_indices = self.get_categorical_feature_indices()
        self.model.fit(X_fit, y, cat_features=cat_feature_indices)

    elif self.model_name[:16] == 'GradientBoosting':
        # NOTE(review): this is a lexicographic string comparison of version
        # numbers, kept as-is; confirm it matches the intended cut-off.
        if not sklearn_version > '0.18.1':
            X_fit = _to_dense_array(X_fit)

        patience = 20
        best_val_loss = -10000000000
        num_worse_rounds = 0
        best_model = deepcopy(self.model)
        X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)

        # Add a variable number of trees each time, depending how far into the process we are
        schedule = [(1, 50, 1), (50, 100, 2), (100, 250, 3)]
        if os.environ.get('is_test_suite', False) != 'True':
            schedule += [
                (250, 500, 5),
                (500, 1000, 10),
                (1000, 2000, 20),
                (2000, 10000, 100),
            ]
        num_iters = []
        for start, stop, step in schedule:
            num_iters.extend(range(start, stop, step))
        # TODO: get n_estimators from the model itself, and reduce this list to only those values that come under the value from the model

        try:
            for num_iter in num_iters:
                # The very first round trains from scratch; later rounds add trees.
                self.model.set_params(n_estimators=num_iter, warm_start=(num_iter != 1))
                self.model.fit(X_fit, y)

                if self.training_prediction_intervals == True:
                    val_loss = self.model.score(X_test, y_test)
                else:
                    try:
                        val_loss = self._scorer.score(self, X_test, y_test)
                    except Exception:
                        # Best effort: fall back to the model's own score.
                        val_loss = self.model.score(X_test, y_test)

                if val_loss - self.min_step_improvement > best_val_loss:
                    best_val_loss = val_loss
                    num_worse_rounds = 0
                    best_model = deepcopy(self.model)
                else:
                    num_worse_rounds += 1

                print('[' + str(num_iter) + "] random_holdout_set_from_training_data's score is: "
                      + str(round(val_loss, 3)))
                if num_worse_rounds >= patience:
                    break
        except KeyboardInterrupt:
            print('Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model')

        self.model = best_model
        print('The number of estimators that were the best for this training dataset: '
              + str(self.model.get_params()['n_estimators']))
        print('The best score on the holdout set: ' + str(best_val_loss))

    else:
        self.model.fit(X_fit, y)

    if self.X_test is not None:
        del self.X_test
        del self.y_test
    return self
def fit(self, X, y):
    """Train the wrapped model on ``X``/``y`` and return ``self``.

    * DeepLearning models are re-wrapped in a Keras regressor/classifier
      wrapper sized to the number of input columns and trained with early
      stopping on the training loss.
    * GradientBoosting models are grown incrementally with warm starts and
      scored on a 15 percent holdout split; the best-scoring copy is kept and
      training stops after 20 rounds without improvement.
    * Every other model gets a plain ``fit``.

    A ``TypeError`` anywhere in the above triggers one plain retry on
    densified input; a ``KeyboardInterrupt`` is reported and swallowed.
    """
    self.model_name = get_name_from_model(self.model)
    is_deep_learning = self.model_name.startswith('DeepLearning')
    dense_only_models = (
        'BayesianRidge',
        'LassoLars',
        'OrthogonalMatchingPursuit',
        'ARDRegression',
        'Perceptron',
        'PassiveAggressiveClassifier',
        'SGDClassifier',
        'RidgeClassifier',
        'LogisticRegression',
    )

    X_fit = X
    if is_deep_learning or self.model_name in dense_only_models:
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()

    if is_deep_learning:
        # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
        num_cols = X_fit.shape[1]
        model_params = self.model.get_params()
        model_params.pop('build_fn')

        if self.type_of_estimator == 'regressor':
            self.model = KerasRegressor(
                build_fn=utils_models.make_deep_learning_model,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)
        elif self.type_of_estimator == 'classifier':
            self.model = KerasClassifier(
                build_fn=utils_models.make_deep_learning_classifier,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)

    try:
        if is_deep_learning:
            print('\nWe will stop training early if we have not seen an improvement in training accuracy in 25 epochs')
            from keras.callbacks import EarlyStopping
            stopper = EarlyStopping(monitor='loss', patience=25, verbose=1)
            self.model.fit(X_fit, y, callbacks=[stopper])

        elif self.model_name.startswith('GradientBoosting'):
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            patience = 20
            top_score = -10000000000
            stale_rounds = 0
            top_model = deepcopy(self.model)

            X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)

            # Add a variable number of trees each time, depending how far into the process we are
            schedule = (
                (1, 50, 1),
                (50, 100, 2),
                (100, 250, 3),
                (250, 500, 5),
                (500, 1000, 10),
                (1000, 2000, 20),
                (2000, 10000, 100),
            )
            num_iters = []
            for start, stop, step in schedule:
                num_iters.extend(range(start, stop, step))

            try:
                for num_iter in num_iters:
                    # Only the first round (a single tree) trains from scratch.
                    self.model.set_params(n_estimators=num_iter, warm_start=(num_iter != 1))
                    self.model.fit(X_fit, y)

                    try:
                        val_loss = self._scorer.score(self, X_test, y_test)
                    except Exception:
                        val_loss = self.model.score(X_test, y_test)

                    if val_loss > top_score:
                        top_score = val_loss
                        stale_rounds = 0
                        top_model = deepcopy(self.model)
                        continue

                    stale_rounds += 1
                    if stale_rounds >= patience:
                        break
            except KeyboardInterrupt:
                print('Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model')

            self.model = top_model
            print('The number of estimators that were the best for this training dataset: '
                  + str(self.model.get_params()['n_estimators']))
            print('The best score on a random 15 percent holdout set of the training data: '
                  + str(top_score))

        else:
            self.model.fit(X_fit, y)

    except TypeError:
        # Retry once on a dense matrix.
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()
        self.model.fit(X_fit, y)
    except KeyboardInterrupt:
        print('Stopping training at this point because we heard a KeyboardInterrupt')
        print('If the model is functional at this point, we will output the model in its latest form')
        print('Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion')

    return self
def fit(self, X, y):
    """Train the wrapped model on ``X``/``y`` and return ``self``.

    Sparse input is densified up front for DeepLearning models and for the
    listed estimators. A DeepLearning model is re-wrapped in a Keras
    regressor/classifier wrapper (given the number of input columns and
    ``self.feature_learning``) and trained with early stopping on the
    training loss. A ``TypeError`` during fitting triggers one plain retry on
    densified input; a ``KeyboardInterrupt`` is reported and swallowed.

    Raises:
        TypeError: a DeepLearning model was requested while
            ``keras_installed`` is falsy.
    """
    self.model_name = get_name_from_model(self.model)
    is_deep_learning = self.model_name.startswith('DeepLearning')
    dense_only_models = (
        'BayesianRidge',
        'LassoLars',
        'OrthogonalMatchingPursuit',
        'ARDRegression',
        'Perceptron',
        'PassiveAggressiveClassifier',
        'SGDClassifier',
        'RidgeClassifier',
        'LogisticRegression',
    )

    X_fit = X
    if is_deep_learning or self.model_name in dense_only_models:
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()

    if is_deep_learning:
        if not keras_installed:
            print('WARNING: We did not detect that Keras was available.')
            raise TypeError(
                'A DeepLearning model was requested, but Keras was not available to import')

        # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
        num_cols = X_fit.shape[1]
        model_params = self.model.get_params()
        model_params.pop('build_fn')

        if self.type_of_estimator == 'regressor':
            self.model = KerasRegressor(
                build_fn=utils_models.make_deep_learning_model,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)
        elif self.type_of_estimator == 'classifier':
            self.model = KerasClassifier(
                build_fn=utils_models.make_deep_learning_classifier,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)

    try:
        if is_deep_learning:
            print('\nWe will stop training early if we have not seen an improvement in training accuracy in 25 epochs')
            from keras.callbacks import EarlyStopping
            stopper = EarlyStopping(monitor='loss', patience=25, verbose=1)
            self.model.fit(X_fit, y, callbacks=[stopper])
        else:
            self.model.fit(X_fit, y)
    except TypeError:
        # Retry once on a dense matrix (without callbacks).
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()
        self.model.fit(X_fit, y)
    except KeyboardInterrupt:
        print('Stopping training at this point because we heard a KeyboardInterrupt')
        print('If the model is functional at this point, we will output the model in its latest form')
        print('Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion')

    return self
def fit(self, X, y):
    """Train the wrapped model on ``X``/``y`` and return ``self``.

    The training strategy is chosen from the prefix of ``self.model_name``:

    * ``DeepLearning``: re-wrap in a Keras regressor/classifier wrapper and
      train with early stopping on the training loss.
    * ``LGBM``: train against a 15 percent holdout split with 50
      early-stopping rounds.
    * ``CatBoost``: train on a dense ``DataFrame``, switching to the
      ``MultiClass`` loss when a classifier sees more than two distinct
      target values.
    * ``GradientBoosting``: grow the ensemble incrementally with warm starts,
      keep the best-scoring copy on a 15 percent holdout split, and stop
      after 20 rounds without sufficient improvement.
    * anything else: a plain ``fit``.

    The CatBoost branch used to call ``X_fit.todense()`` unconditionally,
    which raised ``AttributeError`` for input that was already dense; it now
    only densifies sparse input, as the GradientBoosting branch already did.

    A ``TypeError`` anywhere in the above triggers one plain retry on
    densified input; a ``KeyboardInterrupt`` is reported and swallowed.
    """
    self.model_name = get_name_from_model(self.model)
    is_deep_learning = self.model_name[:12] == 'DeepLearning'

    X_fit = X
    if is_deep_learning or self.model_name in [
            'BayesianRidge',
            'LassoLars',
            'OrthogonalMatchingPursuit',
            'ARDRegression',
            'Perceptron',
            'PassiveAggressiveClassifier',
            'SGDClassifier',
            'RidgeClassifier',
            'LogisticRegression',
    ]:
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()

    if is_deep_learning:
        # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
        num_cols = X_fit.shape[1]
        model_params = self.model.get_params()
        del model_params['build_fn']

        if self.type_of_estimator == 'regressor':
            self.model = KerasRegressor(
                build_fn=utils_models.make_deep_learning_model,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)
        elif self.type_of_estimator == 'classifier':
            self.model = KerasClassifier(
                build_fn=utils_models.make_deep_learning_classifier,
                num_cols=num_cols,
                feature_learning=self.feature_learning,
                **model_params)

    try:
        if is_deep_learning:
            print('\nWe will stop training early if we have not seen an improvement in training accuracy in 25 epochs')
            from keras.callbacks import EarlyStopping
            early_stopping = EarlyStopping(monitor='loss', patience=25, verbose=1)
            self.model.fit(X_fit, y, callbacks=[early_stopping])

        elif self.model_name[:4] == 'LGBM':
            X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)

            if self.type_of_estimator == 'regressor':
                eval_metric = 'rmse'
            elif self.type_of_estimator == 'classifier':
                if len(set(y_test)) > 2:
                    eval_metric = 'multi_logloss'
                else:
                    eval_metric = 'binary_logloss'

            self.model.fit(
                X_fit,
                y,
                eval_set=[(X_test, y_test)],
                early_stopping_rounds=50,
                eval_metric=eval_metric,
                eval_names=['random_holdout_set_from_training_data'])

        elif self.model_name[:8] == 'CatBoost':
            # Only sparse matrices have (and need) todense(); dense input is
            # wrapped in a DataFrame directly.
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()
            X_fit = pd.DataFrame(X_fit)

            if self.type_of_estimator == 'classifier' and len(pd.Series(y).unique()) > 2:
                # TODO: we might have to modify the format of the y values, converting them all to ints, then back again somehow
                self.model.set_params(loss_function='MultiClass')

            self.model.fit(X_fit, y)

        elif self.model_name[:16] == 'GradientBoosting':
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            patience = 20
            best_val_loss = -10000000000
            num_worse_rounds = 0
            best_model = deepcopy(self.model)
            X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)

            # Add a variable number of trees each time, depending how far into the process we are
            schedule = [(1, 50, 1), (50, 100, 2), (100, 250, 3)]
            if os.environ.get('is_test_suite', False) != 'True':
                schedule += [
                    (250, 500, 5),
                    (500, 1000, 10),
                    (1000, 2000, 20),
                    (2000, 10000, 100),
                ]
            num_iters = []
            for start, stop, step in schedule:
                num_iters.extend(range(start, stop, step))

            try:
                for num_iter in num_iters:
                    # The very first round trains from scratch; later rounds add trees.
                    self.model.set_params(n_estimators=num_iter, warm_start=(num_iter != 1))
                    self.model.fit(X_fit, y)

                    if self.training_prediction_intervals == True:
                        val_loss = self.model.score(X_test, y_test)
                    else:
                        try:
                            val_loss = self._scorer.score(self, X_test, y_test)
                        except Exception:
                            # Best effort: fall back to the model's own score.
                            val_loss = self.model.score(X_test, y_test)

                    if val_loss - self.min_step_improvement > best_val_loss:
                        best_val_loss = val_loss
                        num_worse_rounds = 0
                        best_model = deepcopy(self.model)
                    else:
                        num_worse_rounds += 1

                    print('[' + str(num_iter) + "] random_holdout_set_from_training_data's score is: "
                          + str(round(val_loss, 3)))
                    if num_worse_rounds >= patience:
                        break
            except KeyboardInterrupt:
                print('Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model')

            self.model = best_model
            print('The number of estimators that were the best for this training dataset: '
                  + str(self.model.get_params()['n_estimators']))
            print('The best score on a random 15 percent holdout set of the training data: '
                  + str(best_val_loss))

        else:
            self.model.fit(X_fit, y)

    except TypeError:
        # Retry once on a dense matrix.
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()
        self.model.fit(X_fit, y)
    except KeyboardInterrupt:
        print('Stopping training at this point because we heard a KeyboardInterrupt')
        print('If the model is functional at this point, we will output the model in its latest form')
        print('Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion')

    return self
def fit_grid_search(self, X_df, y, gs_params):
    """Run a grid search over ``gs_params`` and return the fitted search.

    The search is run on the ``final_model`` step of the pipeline built by
    ``self._construct_pipeline``. The best estimator found is stored on
    ``self.trained_final_model``.

    Args:
        X_df: training features handed to the search's ``fit``.
        y: training targets.
        gs_params: parameter grid; must contain a ``'model'`` entry, which
            may be a single model or a list of models.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # Sometimes we're optimizing just one model, sometimes we're comparing a bunch of non-optimized models.
    candidate = gs_params['model']
    if isinstance(candidate, list):
        candidate = candidate[0]
    model_name = utils_models.get_name_from_model(candidate)

    full_pipeline = self._construct_pipeline(model_name=model_name)
    ppl = full_pipeline.named_steps['final_model']

    grid_search_verbose = 5 if self.verbose else 0

    search_options = {
        # Number of cross-validation splits is whatever self.cv holds.
        'cv': self.cv,
        'param_grid': gs_params,
        # Train across all cores.
        'n_jobs': -1,
        # Be verbose (lots of printing) only when asked to.
        'verbose': grid_search_verbose,
        # When a parameter combination fails to fit, do not raise; score it
        # with a very negative number so that it is never selected.
        'error_score': -1000000000,
        'scoring': self._scorer.score,
        # Don't allocate memory for all jobs upfront. Instead, only allocate
        # enough memory to handle the current jobs plus an additional 50%
        'pre_dispatch': '1.5*n_jobs',
    }
    gs = GridSearchCV(ppl, **search_options)

    if self.verbose:
        print('\n\n********************************************************************************************')
        if self.optimize_final_model == True:
            announcement = ('About to run GridSearchCV on the pipeline for the model '
                            + model_name + ' to predict ' + self.output_column)
        else:
            announcement = ('About to run GridSearchCV on the pipeline for several models to predict '
                            + self.output_column)
        print(announcement)

    # Note that we will only report analytics results on the final model that
    # ultimately gets selected, and trained on the entire dataset
    gs.fit(X_df, y)

    if self.write_gs_param_results_to_file:
        utils.write_gs_param_results_to_file(gs, self.gs_param_file_name)

    if self.verbose:
        self.print_training_summary(gs)

    self.trained_final_model = gs.best_estimator_

    if 'model' in gs.best_params_:
        # NOTE(review): from here on model_name holds the winning entry of
        # the 'model' grid itself, not the name string computed above.
        model_name = gs.best_params_['model']
        self.print_results(model_name)

    return gs