def bayesian_optimize(clf, param_grid, X_train, X_test, y_train, y_test,
                      beta=0.2, threshold=0.1, n_iter=20, verbose=0, n_jobs=1):
    """Wrapper around Bayesian optimization for hyperparameter tuning.

    Arguments:
        clf -- input classifier; any scikit-learn-compatible estimator.
        param_grid -- parameter search space, as a dictionary.
        X_train, X_test, y_train, y_test -- X and y data.
        beta -- float; beta < 1 weights NPV more heavily, while beta > 1 weights specificity more.
        threshold -- float; probability threshold for classifying positive vs. negative classes.
        n_iter -- int; number of Bayesian optimization iterations.
        verbose -- int; verbosity level.
        n_jobs -- int; number of processes/threads to use.

    Returns:
        opt_rf -- the optimized classifier (fitted BayesSearchCV instance).
    """
    # RandomForest + Bayesian optimization
    opt_rf = BayesSearchCV(
        clf,
        param_grid,
        n_iter=n_iter,
        verbose=verbose,
        scoring=make_scorer(adjusted_neg_Fbeta_score, needs_proba=True,
                            beta=beta, threshold=threshold),
        n_jobs=n_jobs)

    # Callback handler: stop the search early once the score is good enough.
    def on_step(optim_result):
        score = opt_rf.best_score_
        print("\t- best score: %s" % score)
        if score >= 0.99:
            print('* Interrupting...')
            return True

    opt_rf.fit(X_train, y_train.ravel(), callback=[on_step])

    # Evaluate the tuned model on the held-out test set.
    y_prob = np.asarray(opt_rf.predict_proba(X_test))
    y_test = np.asarray(y_test)
    summarize_res(opt_rf, y_prob, y_test)

    return opt_rf
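# A minimal usage sketch for the wrapper above. The estimator, search-space
# names, and data variables here are illustrative assumptions, not part of the
# original code; any scikit-learn classifier with a skopt search space works.
from sklearn.ensemble import RandomForestClassifier
from skopt.space import Integer, Real

rf_param_grid = {
    'n_estimators': Integer(50, 500),
    'max_depth': Integer(2, 20),
    'max_features': Real(0.1, 1.0, prior='uniform'),
}
opt_rf = bayesian_optimize(RandomForestClassifier(random_state=0),
                           rf_param_grid,
                           X_train, X_test, y_train, y_test,
                           beta=0.2, threshold=0.1, n_iter=20)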
def run_shallow(
        data_dir: str,
        results_dir: str,
        splits: List[str],
        metric: str,
        n_iter: int,
        n_points: int,
        n_folds: int,
        n_jobs: int
) -> None:
    """Evaluate shallow baselines on the scruples resource.

    Train shallow baseline models on the scruples resource, reading the
    dataset from DATA_DIR, and writing trained models, logs, and other
    results to RESULTS_DIR. Performance is reported for each split
    provided as an argument.
    """
    # Step 1: Manage and construct paths.

    logger.info('Creating the results directory.')

    os.makedirs(results_dir)
    model_paths = {}
    metrics_paths = collections.defaultdict(dict)
    predictions_paths = collections.defaultdict(dict)
    for baseline in baselines.resource.SHALLOW_BASELINES.keys():
        os.makedirs(os.path.join(results_dir, baseline))
        model_paths[baseline] = os.path.join(
            results_dir, baseline, 'model.pkl')
        for split in splits:
            os.makedirs(os.path.join(results_dir, baseline, split))
            metrics_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'metrics.json')
            predictions_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'predictions.jsonl')

    # Step 2: Load the data.

    logger.info(f'Loading the data from {data_dir}.')

    dataset = ScruplesResource(data_dir=data_dir)

    # Step 3: Run the baselines.

    logger.info('Running the baselines.')

    for baseline, (Model, hyper_parameter_space) in tqdm.tqdm(
            baselines.resource.SHALLOW_BASELINES.items(),
            **settings.TQDM_KWARGS):
        # tune the hyper-parameters and train the model
        ids, features, labels, label_scores = dataset.train
        if hyper_parameter_space:
            model = BayesSearchCV(
                Model,
                hyper_parameter_space,
                scoring=make_scorer(
                    score_func=METRICS[metric][1],
                    **METRICS[metric][2]),
                n_iter=n_iter,
                n_points=n_points,
                cv=n_folds,
                n_jobs=os.cpu_count() if n_jobs == 0 else n_jobs,
                refit=True)
        else:
            model = Model

        model.fit(features, labels)

        # Step 4: Save the model.

        with open(model_paths[baseline], 'wb') as model_file:
            dill.dump(model, model_file)

        # Step 5: Run evaluation on the splits.

        for split in splits:
            ids, features, labels, label_scores = getattr(dataset, split)

            predictions = model.predict(features)
            probabilities = model.predict_proba(features)

            with open(metrics_paths[baseline][split], 'w') as metrics_file:
                json.dump(
                    {
                        key: metric(
                            y_true=labels,
                            y_pred=probabilities
                                if scorer_kwargs['needs_proba']
                                else predictions)
                        for key, (_, metric, scorer_kwargs)
                        in METRICS.items()
                    },
                    metrics_file)

            with open(predictions_paths[baseline][split], 'w') \
                    as predictions_file:
                for id_, probs, prediction in zip(
                        ids, probabilities, predictions):
                    predictions_file.write(
                        json.dumps({
                            'id': id_,
                            'label': prediction.tolist(),
                            'label_scores': probs.tolist()
                        }) + '\n')
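# A hedged invocation sketch for run_shallow. The paths, split names, and
# metric key below are illustrative assumptions; the function only requires
# that DATA_DIR hold the scruples resource, that the metric be a key of
# METRICS, and that RESULTS_DIR not yet exist (os.makedirs raises otherwise).
run_shallow(
    data_dir='data/resource',        # assumed dataset location
    results_dir='results/shallow',   # created fresh by the function
    splits=['train', 'dev'],         # assumed split attributes on the dataset
    metric='log_loss',               # assumed key of METRICS
    n_iter=64,
    n_points=8,
    n_folds=4,
    n_jobs=0,                        # 0 => use os.cpu_count()
)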
test_df = as_category(test_df)
test_X = test_df.drop(['CONTACT_DATE', 'SNAP_DATE'], axis=1)
if clf_name != 'FeaturePredictor':
    # Fill numeric columns and encode categoricals as integer codes.
    cols = list(set(test_X.columns).difference(
        test_X.select_dtypes(include='category').columns))
    test_X.loc[:, cols] = test_X.loc[:, cols].fillna(0).replace(
        [np.inf, -np.inf], 0)
    for c in test_X.select_dtypes(include='category').columns:
        test_X.loc[:, c] = test_X.loc[:, c].cat.codes

# Adversarial validation: estimate how well the model transfers to test-like rows.
adv_auc = 0
adv_train_x, adv_train_y, adv_test_x, adv_test_y = adversial_train_test_split(
    train_X.loc[:, features], train_y, test_X.loc[:, features], topK=1000)
bayes_cv_tuner._fit_best_model(adv_train_x, adv_train_y)
adv_pred_y = bayes_cv_tuner.predict_proba(adv_test_x)[:, 1]
adv_auc = roc_auc_score(adv_test_y, adv_pred_y)
print(f'Adversarial AUC = {adv_auc} by {len(adv_test_y)} samples')

# Refit the best model on the full training data and write the submission.
bayes_cv_tuner._fit_best_model(train_X, train_y)
test_y = bayes_cv_tuner.predict_proba(test_X)
df = pd.DataFrame(test_y[:, 1])
df.to_csv(f"submits/"
          f"{best_estimator.__class__.__name__}"
          f"_{datetime.now().strftime('%d_%H_%M')}"
          f"_{bayes_cv_tuner.best_score_:0.4f}"
          f"_{adv_auc:0.4f}.csv",
          header=None, index=None)
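# The helper adversial_train_test_split is not shown above. A plausible
# reading (an assumption, not the original implementation): score each
# training row by how "test-like" it is with a train-vs-test discriminator,
# then hold out the topK most test-like rows as a validation set that mimics
# the test distribution. A minimal sketch of that idea:
import numpy as np
from sklearn.linear_model import LogisticRegression

def adversarial_split_sketch(train_X, train_y, test_X, topK=1000):
    train_X = np.asarray(train_X)
    train_y = np.asarray(train_y)
    test_X = np.asarray(test_X)
    # Label origin: 0 = train, 1 = test, and fit a discriminator.
    X_all = np.vstack([train_X, test_X])
    origin = np.concatenate([np.zeros(len(train_X)), np.ones(len(test_X))])
    disc = LogisticRegression(max_iter=1000).fit(X_all, origin)
    # "Test-likeness" of each training row.
    test_like = disc.predict_proba(train_X)[:, 1]
    hold_idx = np.argsort(test_like)[-topK:]                     # most test-like rows
    keep_idx = np.setdiff1d(np.arange(len(train_X)), hold_idx)   # the rest
    return (train_X[keep_idx], train_y[keep_idx],
            train_X[hold_idx], train_y[hold_idx])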
}

# scorer
metric = make_scorer(score_func=log_loss,
                     greater_is_better=False,
                     needs_proba=True,
                     labels=train['Category'].unique())

# cv
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# bayes search cv
bayes_tuned_pipeline = BayesSearchCV(estimator=estimator_pipeline,
                                     search_spaces=search_space,
                                     n_iter=10,
                                     scoring=metric,
                                     cv=kfold_cv,
                                     verbose=12,
                                     n_jobs=-1,
                                     refit=True)

bayes_tuned_pipeline.fit(X_train, y_train)

# Save the tuned pipeline with pickle.
pickle.dump(bayes_tuned_pipeline, open('logistic_tuned_pipeline.pkl', 'wb'))

# Evaluate on the validation set and write the submission file.
phat_val = bayes_tuned_pipeline.predict_proba(X_val)
log_loss(y_val, phat_val)

make_submission_file(bayes_tuned_pipeline, test, 'onsite_logistic.csv')
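# The pickled pipeline can later be reloaded and reused for prediction. This
# is a small sketch mirroring the dump above; `test` is assumed to hold the
# raw features the pipeline expects.
import pickle

with open('logistic_tuned_pipeline.pkl', 'rb') as f:
    reloaded_pipeline = pickle.load(f)
phat_test = reloaded_pipeline.predict_proba(test)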
###### CatBoost with Tuning
cb_param_grid = {'iterations': Integer(10, 1000),
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),
                 'bagging_temperature': Real(0.0, 1.0),
                 'border_count': Integer(1, 255),
                 'l2_leaf_reg': Integer(2, 30),
                 'scale_pos_weight': Real(0.01, 1.0, 'uniform')}

cb_bs = BayesSearchCV(cb,
                      cb_param_grid,
                      scoring='roc_auc',
                      n_iter=100,
                      n_jobs=1,
                      return_train_score=False,
                      refit=True,
                      optimizer_kwargs={'base_estimator': 'GP'},
                      random_state=123)

cb_bs.fit(x_train, y_train)

y_probs = cb_bs.predict_proba(x_test)
y_probs = y_probs[:, 1]
y_pred = cb_bs.predict(x_test)

print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_probs))  ### 0.903

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
plot_roc_curve(fpr, tpr)

# Find the best parameters
cb_bs.best_params_

# Use the parameters to re-run the model
cb_tuned = CatBoostClassifier(iterations=1000,
                              depth=8,
                              learning_rate=0.11574,
                              random_strength=1e-9,
                              bagging_temperature=1.0,
                       'min_samples_leaf': Integer(2, 6),
                       'bootstrap': Categorical([False]),
                       },
                      n_iter=300, random_state=42, cv=5, n_jobs=6, verbose=1)

# y_metales.values.reshape(-1,1)
# Fit the random search model
opt.fit(X_train, y_train)

opt.best_params_
opt.best_score_

opt_pred = opt.predict_proba(X_new)[:, 1]
Confusion_Matrix(y_new, opt_pred, pred_prob=True)

# Evaluate on the combined test + new data using predicted probabilities.
Confusion_Matrix(np.array(pd.concat([y_test, y_new], axis=0)),
                 opt.predict_proba(pd.concat([X_test, X_new], axis=0))[:, 1],
                 pred_prob=True)

'''
Best parameters
'''
best_parameters = {
    'GridSearchCV': {
        'bootstrap': False,
        'criterion': 'gini',
        'max_depth': 10,
        'max_features': 'auto',