def param_tuning(X_train, y_train):
    """Grid-search CatBoost hyper-parameters and return the refitted winner.

    The training data is shuffled with a fixed seed, then a 3-fold
    ``GridSearchCV`` explores ``depth``, ``learning_rate`` and
    ``l2_leaf_reg``. Candidates are scored with the ``au_prc`` function on
    predicted probabilities. The degree of parallelism is read from the
    module-level ``args.n_jobs``.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    features, labels = shuffle(X_train, y_train, random_state=0)

    print('Searching best parameters...')
    estimator = CatBoostClassifier(verbose=0)
    search_space = {
        'depth': [3, 6, 9],
        'learning_rate': [0.01, 0.02, 0.05],
        'l2_leaf_reg': [1, 3, 6],
    }
    prc_scorer = metrics.make_scorer(
        au_prc, needs_proba=True, greater_is_better=True)

    searcher = GridSearchCV(
        estimator,
        param_grid=search_space,
        cv=3,
        verbose=10,
        scoring=prc_scorer,
        n_jobs=args.n_jobs,
    )
    searcher.fit(features, labels)

    print('Best parameters:', searcher.best_params_)
    winner = searcher.best_estimator_
    print('Done.')
    return winner
def cv_train(X, y):
    """Cross-validate the configured gradient-boosting model and print scores.

    The data is shuffled with a fixed seed and evaluated with a 5-fold
    ``StratifiedKFold``. The estimator is an ``XGBClassifier`` when the
    module-level ``args.gb_tool`` equals ``'xgboost'``; otherwise it is a
    ``CatBoostClassifier`` configured with the module-level ``cat_features``.
    The raw ``cross_validate`` result is printed, followed by the mean and
    standard deviation (in percent) of the test AU-ROC and AU-PRC.

    Args:
        X: Feature matrix.
        y: Labels.

    Returns:
        None. Results are only printed.
    """
    data, target = shuffle(X, y, random_state=1)

    if args.gb_tool == 'xgboost':
        estimator = XGBClassifier(
            objective='binary:logistic', booster='gbtree')
    else:
        estimator = CatBoostClassifier(verbose=0, cat_features=cat_features)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    scorers = {
        'AU-ROC': metrics.make_scorer(
            metrics.roc_auc_score, needs_proba=True, greater_is_better=True),
        'AU-PRC': metrics.make_scorer(
            au_prc, needs_proba=True, greater_is_better=True),
    }

    cv_scores = cross_validate(
        estimator, data, target,
        cv=folds, scoring=scorers, return_train_score=True)
    print(cv_scores)

    # One summary line per metric: "<name>: <mean>% (<std>%)".
    for metric_name in ('AU-ROC', 'AU-PRC'):
        fold_scores = cv_scores['test_' + metric_name]
        print("%s: %.2f%% (%.2f%%)" % (
            metric_name, fold_scores.mean() * 100, fold_scores.std() * 100))
def train_gbtree(X_train, y_train):
    """Fit the configured gradient-boosting classifier and return it.

    The training data is shuffled with a fixed seed before fitting. The
    model type is chosen by the module-level ``args.gb_tool``:
    ``'xgboost'`` selects an ``XGBClassifier`` with fixed hyper-parameters,
    anything else selects a ``CatBoostClassifier`` that uses the module-level
    ``cat_features`` and ``args.rs_model`` as its random state. The fitted
    model's parameters are printed.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted classifier.
    """
    print('Training model...')
    features, labels = shuffle(X_train, y_train, random_state=0)

    use_xgboost = args.gb_tool == 'xgboost'
    if use_xgboost:
        model = XGBClassifier(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.05,
            n_estimators=200,
            max_depth=3,
            min_child_weight=6,
            verbosity=1,
        )
    else:
        # NOTE: scale_pos_weight=(1 - pos_rate) / pos_rate is left disabled.
        model = CatBoostClassifier(
            verbose=0,
            cat_features=cat_features,
            random_state=args.rs_model,
        )

    model.fit(features, labels)

    # The two libraries expose their effective parameters differently.
    if use_xgboost:
        params = model.get_params()
    else:
        params = model.get_all_params()

    print('Parameters:', params)
    print('Done.')
    return model
def param_tuning(X_train, y_train):
    """Grid-search hyper-parameters for the configured boosting library.

    The training data is shuffled with a fixed seed. The module-level
    ``args.gb_tool`` picks the estimator and its search grid (``'xgboost'``
    for ``XGBClassifier``, anything else for ``CatBoostClassifier``). A
    3-fold ``GridSearchCV`` scored with ``au_prc`` on predicted
    probabilities is then run with ``args.n_jobs`` parallel jobs.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    features, labels = shuffle(X_train, y_train, random_state=0)

    print('Searching best parameters...')
    if args.gb_tool == 'xgboost':
        estimator = XGBClassifier(
            objective='binary:logistic', booster='gbtree')
        search_space = {
            "max_depth": [3],
            "min_child_weight": [6],
            "n_estimators": [100, 200, 1000],
            "learning_rate": [0.05, 0.3],
        }
    else:
        estimator = CatBoostClassifier(verbose=0)
        search_space = {
            'depth': [3, 6, 9],
            'learning_rate': [0.01, 0.02, 0.05],
            'l2_leaf_reg': [1, 3, 6],
        }

    prc_scorer = metrics.make_scorer(
        au_prc, needs_proba=True, greater_is_better=True)
    searcher = GridSearchCV(
        estimator,
        param_grid=search_space,
        cv=3,
        verbose=10,
        scoring=prc_scorer,
        n_jobs=args.n_jobs,
    )
    searcher.fit(features, labels)

    print('Best parameters:', searcher.best_params_)
    winner = searcher.best_estimator_
    print('Done.')
    return winner
def train_gbtree(X_train, y_train, pos_rate, args):
    """Fit an XGBoost or CatBoost classifier on the given data and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        pos_rate: Not used by the active code; it only appears in a
            disabled ``scale_pos_weight`` setting.
        args: Options object. ``args.gb_tool == 'xgboost'`` selects an
            ``XGBClassifier`` with fixed hyper-parameters; otherwise a
            ``CatBoostClassifier`` is built from ``args.lr``, ``args.depth``
            and ``args.l2``.

    Returns:
        The fitted classifier.
    """
    print('Training model...')

    if args.gb_tool == 'xgboost':
        model = XGBClassifier(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.05,
            n_estimators=200,
            max_depth=3,
            min_child_weight=6,
            verbosity=1,
        )
    else:
        # NOTE: scale_pos_weight=(1 - pos_rate) / pos_rate is left disabled.
        model = CatBoostClassifier(
            verbose=0,
            learning_rate=args.lr,
            depth=args.depth,
            l2_leaf_reg=args.l2,
        )

    model.fit(X_train, y_train)

    # The two libraries expose their effective parameters differently.
    if args.gb_tool == 'xgboost':
        params = model.get_params()
    else:
        params = model.get_all_params()

    print('Parameters:', params)
    print('Done.')
    return model
def param_tuning(X, y):
    """Grid-search hyper-parameters for the configured boosting library.

    The data is shuffled with a fixed seed. The module-level
    ``args.gb_tool`` picks the estimator and its search grid (``'xgboost'``
    for ``XGBClassifier``, anything else for ``CatBoostClassifier``). A
    5-fold ``GridSearchCV`` scored with ``au_prc`` on predicted
    probabilities is run with ``n_jobs=-1``.

    Args:
        X: Feature matrix.
        y: Labels.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    features, labels = shuffle(X, y, random_state=0)

    print('Searching best parameters...')
    if args.gb_tool == 'xgboost':
        estimator = XGBClassifier(
            objective='binary:logistic', booster='gbtree')
        search_space = {
            "max_depth": [3],
            "min_child_weight": [6],
            "n_estimators": [100, 200, 1000],
            "learning_rate": [0.05, 0.3],
        }
        # No-op string literal kept as a reference note (never evaluated
        # for its value).
        """
        Best Parameters (Default):
        {'objective': 'binary:logistic', 'base_score': 0.5,
        'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1,
        'colsample_bytree': 1, 'gamma': 0, 'gpu_id': -1,
        'importance_type': 'gain', 'interaction_constraints': None,
        'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 3,
        'min_child_weight': 6, 'missing': nan, 'monotone_constraints': None,
        'n_estimators': 1000, 'n_jobs': 0, 'num_parallel_tree': 1,
        'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1,
        'scale_pos_weight': 1, 'subsample': 1, 'tree_method': None,
        'validate_parameters': False, 'verbosity': 1}
        """
    else:
        estimator = CatBoostClassifier(verbose=0)
        search_space = {
            'depth': [3, 6, 10],
            'learning_rate': [0.01, 0.05, 0.1],
            'l2_leaf_reg': [1, 3, 6],
        }
        # No-op string literal kept as a reference note (never evaluated
        # for its value).
        """
        Best Parameters (Default):
        {'nan_mode': 'Min', 'eval_metric': 'Logloss', 'iterations': 1000,
        'sampling_frequency': v 'PerTree',
        'leaf_estimation_method': 'Newton', 'grow_policy': 'SymmetricTree',
        'penalties_coefficient': 1, 'boosting_type': 'Plain',
        'model_shrink_mode': 'Constant',
        'feature_border_type': 'GreedyLogSum',
        'bayesian_matrix_reg': 0.10000000149011612, 'l2_leaf_reg': 3,
        'random_strength': 1, 'rsm': 1, 'boost_from_average': False,
        'model_size_reg': 0.5, 'subsample': 0.800000011920929,
        'use_best_model': False, 'class_names': [0, 1], 'random_seed': 0,
        'depth': 6, 'border_count': 254, 'classes_count': 0,
        'sparse_features_conflict_fraction': 0,
        'leaf_estimation_backtracking': 'AnyImprovement',
        'best_model_min_trees': 1, 'model_shrink_rate': 0,
        'min_data_in_leaf': 1, 'loss_function': 'Logloss',
        'learning_rate': 0.058687999844551086, 'score_function': 'Cosine',
        'task_type': 'CPU',
        'leaf_estimation_iterations': 10, 'bootstrap_type': 'MVS',
        'max_leaves': 64}
        """

    prc_scorer = metrics.make_scorer(
        au_prc, needs_proba=True, greater_is_better=True)
    searcher = GridSearchCV(
        estimator,
        param_grid=search_space,
        cv=5,
        verbose=10,
        scoring=prc_scorer,
        n_jobs=-1,
    )
    searcher.fit(features, labels)

    print('Best parameters:', searcher.best_params_)
    winner = searcher.best_estimator_
    print('Done.')
    return winner
def train_gbtree(X_train, y_train, pos_rate, args):
    """Train CatBoost, evaluate it on the test split, and pickle the results.

    The training data is shuffled with a fixed seed. For each random state
    (currently only ``0``) a ``CatBoostClassifier`` is fitted, scored on the
    module-level ``X_test`` / ``y_test``, and one result row is recorded.
    The collected rows are written as a pickled ``DataFrame`` to
    ``data/result/model_comparison/realtime_gbtree_random.pkl``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        pos_rate: Value stored verbatim in the ``pos_rate`` result column.
        args: Options object providing ``lr``, ``depth`` and ``l2`` for the
            CatBoost ``learning_rate``, ``depth`` and ``l2_leaf_reg``.

    Returns:
        None. The result table is only written to disk.
    """
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # Same column order the original produced: the declared columns first,
    # then the two extra keys that each appended row introduced.
    columns = [
        'random_state', 'model', 'fpr', 'tpr', 'roc',
        'prec', 'rec', 'prc', 'pos_rate', 'y_test', 'y_prob',
    ]

    # Rows are gathered in a plain list and turned into a DataFrame once.
    # DataFrame.append was removed in pandas 2.0, and it also copied the
    # whole frame on every call.
    rows = []
    for rs in range(1):
        classifiers = [
            CatBoostClassifier(
                verbose=0,
                # scale_pos_weight=(1 - pos_rate) / pos_rate is left disabled.
                learning_rate=args.lr,
                depth=args.depth,
                l2_leaf_reg=args.l2,
                random_state=rs,
            )
        ]
        for cls in classifiers:
            cls_name = cls.__class__.__name__
            print('Round', rs)
            print('Training:', cls_name)
            model = cls.fit(X_train, y_train)

            # Probability of the second class (column index 1).
            y_prob = model.predict_proba(X_test)[::, 1]

            # Evaluation
            fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
            prec, rec, _ = metrics.precision_recall_curve(y_test, y_prob)
            roc_auc = metrics.auc(fpr, tpr)
            prc_auc = metrics.auc(rec, prec)

            print('--------------------------------------------')
            print('Evaluation of test set:', cls_name)
            print("AU-ROC:", "%0.4f" % roc_auc, "AU-PRC:", "%0.4f" % prc_auc)
            print('--------------------------------------------')

            rows.append({
                'random_state': rs,
                'model': cls_name,
                'fpr': fpr,
                'tpr': tpr,
                'roc': roc_auc,
                'prec': prec,
                'rec': rec,
                'prc': prc_auc,
                'y_test': y_test,
                'y_prob': y_prob,
                'pos_rate': pos_rate,
            })

    result_table = pd.DataFrame(rows, columns=columns)

    # save results
    save_name = 'data/result/model_comparison/realtime_gbtree_random.pkl'
    result_table.to_pickle(save_name)
def train_model(index_pair):
    """Fit one of five classifiers and return its test metrics as one row.

    The classifier is selected by position from a fixed list (CatBoost,
    logistic regression, calibrated random forest, calibrated linear SVC,
    MLP), all seeded with the given random state. It is fitted on the
    module-level ``X_train`` / ``y_train`` and evaluated on the module-level
    ``X_test`` / ``y_test``. The module-level ``pos_rate`` is stored in the
    result unchanged.

    Args:
        index_pair: ``(model_idx, rs)`` where ``model_idx`` indexes the
            classifier list and ``rs`` is the random state.

    Returns:
        A single-row ``DataFrame`` with columns ``model``, ``random_state``,
        ``fpr``, ``tpr``, ``roc``, ``prec``, ``rec``, ``prc``, ``pos_rate``,
        ``y_test`` and ``y_prob``.

    Raises:
        IndexError: If ``model_idx`` is outside the classifier list.
    """
    model_idx, rs = index_pair

    classifiers = [
        CatBoostClassifier(random_state=rs, verbose=0, learning_rate=0.02,
                           depth=6, l2_leaf_reg=3),
        LogisticRegression(random_state=rs, penalty='l2', n_jobs=-1),
        CalibratedClassifierCV(
            RandomForestClassifier(random_state=rs, n_jobs=-1)),
        CalibratedClassifierCV(LinearSVC(random_state=rs, max_iter=3000)),
        MLPClassifier(random_state=rs, max_iter=10000),
    ]
    model_names = [
        "CatBoost",
        "LogisticRegression",
        "RandomForest",
        "LinearSVC",
        "MLP",
    ]
    cls = classifiers[model_idx]
    cls_name = cls.__class__.__name__

    print('Round', rs)
    print('Training:', cls_name)
    model = cls.fit(X_train, y_train)

    # Probability of the second class (column index 1).
    y_prob = model.predict_proba(X_test)[::, 1]

    # Evaluation
    fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
    prec, rec, _ = metrics.precision_recall_curve(y_test, y_prob)
    roc_auc = metrics.auc(fpr, tpr)
    prc_auc = metrics.auc(rec, prec)

    print('--------------------------------------------')
    print('Evaluation of test set:', cls_name)
    print("AU-ROC:", "%0.4f" % roc_auc, "AU-PRC:", "%0.4f" % prc_auc)
    print('--------------------------------------------')

    row = {
        'model': model_names[model_idx],
        'random_state': rs,
        'fpr': fpr,
        'tpr': tpr,
        'roc': roc_auc,
        'prec': prec,
        'rec': rec,
        'prc': prc_auc,
        'y_test': y_test,
        'y_prob': y_prob,
        'pos_rate': pos_rate,
    }

    # Build the one-row frame directly: DataFrame.append was removed in
    # pandas 2.0. The column list keeps the order the original produced
    # (declared columns first, then the two extra keys).
    columns = [
        'model', 'random_state', 'fpr', 'tpr', 'roc',
        'prec', 'rec', 'prc', 'pos_rate', 'y_test', 'y_prob',
    ]
    return pd.DataFrame([row], columns=columns)