def _prep_keras_inputs(train, test):
    """Format train/test DataFrames for consumption by a Keras model.

    Splits each DataFrame into features and target, then one-hot encodes
    the targets (required because the network ends in a `softmax`, so the
    target must be two dimensional). Keras only accepts np.ndarray inputs,
    which `np_utils.to_categorical` produces.

    Args:
    ----
        train: pandas DataFrame
        test: pandas DataFrame

    Return:
    ------
        train_features: np.ndarray
        train_target: np.ndarray
        test_features: np.ndarray
        test_target: np.ndarray
    """
    def _one_hot(target):
        # Cast to int first so `to_categorical` builds the right classes.
        return np_utils.to_categorical(target.astype(int))

    train_target, train_features = get_target_features(train)
    test_target, test_features = get_target_features(test)

    return (train_features, _one_hot(train_target),
            test_features, _one_hot(test_target))
def run_sklearn_param_search(model, train, cv_fold_generator, model_name,
                             random=False, num_iterations=10):
    """Search over possible parameter values for the given model.

    Builds either a randomized or exhaustive sklearn parameter searcher
    (depending on `random`), scored by area under the precision-recall
    curve, and fits it over the supplied cross-validation folds.

    Args:
    ----
        model: varied
            Holds the model to perform the grid search over. Expected
            to implement the sklearn model interface.
        train: np.ndarray
        cv_fold_generator: SequentialTimeFold object
            An object that generates folds to perform cross-validation
            over.
        model_name: str
        random (optional): bool
            Holds whether or not to use RandomizedSearchCV or
            GridSearchCV.
        num_iterations (optional): int
            Number of iterations to use for random searching (if used).

    Returns:
    -------
        best_model: sklearn.<searcher>.best_estimator_
        best_mean_score: float
    """
    train_target, train_features = get_target_features(train)
    scorer = return_scorer('auc_precision_recall')

    if random:
        searcher = RandomizedSearchCV(
            estimator=model,
            param_distributions=_get_random_params(model_name),
            scoring=scorer,
            cv=cv_fold_generator,
            fit_params={},
            n_iter=num_iterations)
    else:
        searcher = GridSearchCV(
            estimator=model,
            param_grid=_get_grid_params(model_name),
            scoring=scorer,
            cv=cv_fold_generator,
            fit_params={})

    searcher.fit(train_features.values, train_target.values)
    return searcher.best_estimator_, searcher.best_score_
log_train_results(model_name, validation, best_fit_model, best_score, scores) # log_scores(best_fit_model, hold_out_features, hold_out_target, model_name, # date_parts, hold_out_feats_pre_norm) else: beg_dt, end_dt = sys.argv[3], sys.argv[4] beg_date, end_date = format_date(beg_dt), format_date(end_dt) model = get_model(model_name, {}) best_params = get_best_params(model_name) model.set_params(**best_params) dt_range = pd.date_range(beg_date, end_date) for dt in dt_range: validation, hold_out = get_train_test(input_df, 'date_fire', dt, train=False) validation, hold_out = prep_data(validation), prep_data(hold_out) Y_train, X_train = get_target_features(validation) Y_test, X_test = get_target_features(hold_out) # Don't run models if there are no obs if X_train.shape[0] and X_test.shape[0]: model.fit(X_train, Y_train) pred_probs = model.predict_proba(X_test)[:, 1] roc_auc, pr_auc = None, None # We can't get area under the curve if there are no fires :(. if Y_test.sum() != 0: roc_auc = return_score('auc_roc', pred_probs, Y_test) pr_auc = return_score('auc_precision_recall', pred_probs, Y_test) log_feat_importances(model, X_train, dt) log_test_results(dt, Y_test, pred_probs, roc_auc, pr_auc)
log_train_results(model_name, validation, best_fit_model, best_score, score_type) else: beg_dt, end_dt = sys.argv[3], sys.argv[4] beg_date, end_date = format_date(beg_dt), format_date(end_dt) model = get_model(model_name) best_params = get_best_params(model_name) model.set_params(**best_params) dt_range = pd.date_range(beg_date, end_date) for dt in dt_range: validation, hold_out = get_train_test(input_df, 'date_fire', dt) validation, hold_out = prep_data(validation), prep_data(hold_out) Y_train, X_train = get_target_features(validation) Y_test, X_test = get_target_features(hold_out) # Don't run models if there are no obs for a day. if X_train.shape[0] and X_test.shape[0]: model.fit(X_train, Y_train) pred_probs = model.predict_proba(X_test)[:, 1] roc_auc, pr_auc = None, None # We can't get area under the curve if there are no fires :(. if Y_test.sum() != 0: roc_auc = return_score('auc_roc', pred_probs, Y_test) pr_auc = return_score('auc_precision_recall', pred_probs, Y_test) log_feat_importances(model, X_train, dt) log_test_results(dt, geo_cols_df, Y_test, pred_probs, roc_auc, pr_auc)
def run_sklearn_param_search(model, train, cv_fold_generator, random=False,
                             num_iterations=10, model_name=None, test=None):
    """Perform a model grid search over the inputted parameters and folds.

    For the given model and the relevant grid parameters, perform a grid
    search with those grid parameters, and return the best model.

    Args:
    ----
        model: varied
            Holds the model to perform the grid search over. Expected
            to implement the sklearn model interface.
        train: np.ndarray
        cv_fold_generator: SequentialTimeFold/StratifiedTimeFold object
            An object that generates folds to perform cross-validation
            over.
        random (optional): bool
            Holds whether or not to use RandomizedSearchCV or
            GridSearchCV.
        num_iterations (optional): int
            Number of iterations to use for random searching (if used).
        model_name (optional): str
            Holds the model_name, to be used to determine if it is a
            boosting model, and whether or not to use early stopping.
            Must be passed in if `early_stopping_tolerance` is passed in.
        test (optional): np.ndarray
            To be used for early stopping if passed in.

    Returns:
    -------
        best_model: sklearn.<searcher>.best_estimator_
            The best model as obtained through the parameter search.
        best_mean_score: float
            The `mean_validation_score` from a sklearn.<searcher> object.
        scores: list
            The scores from each run of the paramateter search.
    """
    train_target, train_features = get_target_features(train)
    eval_metric = return_scorer('auc_precision_recall')

    fit_params = {}
    # BUGFIX: `if test and ...` raises "truth value of an array is
    # ambiguous" for any non-empty ndarray/DataFrame; compare to None.
    if test is not None and model_name in ('gboosting', 'xgboost'):
        test_target, test_features = get_target_features(test)
        # The monitor callback and xgboost use code under the hood
        # that requires float32, C-contiguous arrays.
        test_target = test_target.values.astype('float32').copy(order='C')
        test_features = test_features.values.astype('float32').copy(order='C')
        early_stopping_tolerance = 5
        fit_params = _prep_fit_params(model_name, fit_params,
                                      early_stopping_tolerance,
                                      test_features, test_target)

    if random:
        grid_search = RandomizedSearchCV(
            estimator=model,
            param_distributions=_get_random_params(model_name),
            scoring=eval_metric,
            cv=cv_fold_generator,
            fit_params=fit_params,
            n_iter=num_iterations)
    else:
        # BUGFIX: this branch hard-coded scoring='roc_auc', contradicting
        # both the random branch and the docstring; use the same PR-AUC
        # scorer so both search paths optimize the same metric.
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=_get_grid_params(model_name),
            scoring=eval_metric,
            cv=cv_fold_generator,
            fit_params=fit_params)

    grid_search.fit(train_features.values, train_target.values)

    return (grid_search.best_estimator_, grid_search.best_score_,
            grid_search.grid_scores_)