def param_tuning(X_train, y_train):
    """Grid-search CatBoost hyper-parameters and return the best estimator.

    The training data is shuffled with a fixed seed, then a 3-fold
    ``GridSearchCV`` is run over ``depth``, ``learning_rate`` and
    ``l2_leaf_reg``. Candidates are scored with ``au_prc`` on predicted
    probabilities. The degree of parallelism is read from the module-level
    ``args.n_jobs``.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    # Fixed seed keeps the shuffled order reproducible between runs.
    X_shuffled, y_shuffled = shuffle(X_train, y_train, random_state=0)
    print('Searching best parameters...')

    estimator = CatBoostClassifier(verbose=0)
    search_space = dict(
        depth=[3, 6, 9],
        learning_rate=[0.01, 0.02, 0.05],
        l2_leaf_reg=[1, 3, 6],
    )
    # The scorer consumes probabilities; a larger au_prc is better.
    pr_scorer = metrics.make_scorer(au_prc,
                                    needs_proba=True,
                                    greater_is_better=True)

    search = GridSearchCV(estimator,
                          param_grid=search_space,
                          cv=3,
                          verbose=10,
                          scoring=pr_scorer,
                          n_jobs=args.n_jobs)
    search.fit(X_shuffled, y_shuffled)

    print('Best parameters:', search.best_params_)
    best_model = search.best_estimator_
    print('Done.')
    return best_model
def cv_train(X, y):
    """Cross-validate the configured gradient-boosting model and print scores.

    The data is shuffled with a fixed seed and evaluated with a 5-fold
    ``StratifiedKFold``. The model is an ``XGBClassifier`` when the
    module-level ``args.gb_tool`` equals ``'xgboost'``; otherwise it is a
    ``CatBoostClassifier`` built with the module-level ``cat_features``.
    The raw ``cross_validate`` result is printed, followed by mean and
    standard deviation (in percent) of the test AU-ROC and AU-PRC.

    Args:
        X: Feature matrix.
        y: Labels.
    """
    X, y = shuffle(X, y, random_state=1)

    if args.gb_tool == 'xgboost':
        estimator = XGBClassifier(objective='binary:logistic',
                                  booster='gbtree')
    else:
        estimator = CatBoostClassifier(verbose=0, cat_features=cat_features)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    def _proba_scorer(score_func):
        # Both metrics are computed on predicted probabilities.
        return metrics.make_scorer(score_func,
                                   needs_proba=True,
                                   greater_is_better=True)

    scorers = {
        'AU-ROC': _proba_scorer(metrics.roc_auc_score),
        'AU-PRC': _proba_scorer(au_prc),
    }

    cv_results = cross_validate(estimator,
                                X,
                                y,
                                cv=folds,
                                scoring=scorers,
                                return_train_score=True)
    print(cv_results)

    roc_scores = cv_results['test_AU-ROC']
    print("AU-ROC: %.2f%% (%.2f%%)" % (roc_scores.mean() * 100,
                                       roc_scores.std() * 100))
    prc_scores = cv_results['test_AU-PRC']
    print("AU-PRC: %.2f%% (%.2f%%)" % (prc_scores.mean() * 100,
                                       prc_scores.std() * 100))
def train_gbtree(X_train, y_train):
    """Fit the configured gradient-boosting classifier and return it.

    The module-level ``args.gb_tool`` selects the library: ``'xgboost'``
    builds an ``XGBClassifier`` with fixed hyper-parameters, anything else
    builds a ``CatBoostClassifier`` that uses the module-level
    ``cat_features`` and ``args.rs_model`` as its random state. The training
    data is shuffled with a fixed seed before fitting, and the fitted
    model's parameters are printed.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted classifier.
    """
    print('Training model...')
    X_shuffled, y_shuffled = shuffle(X_train, y_train, random_state=0)

    use_xgboost = args.gb_tool == 'xgboost'
    if use_xgboost:
        model = XGBClassifier(objective='binary:logistic',
                              booster='gbtree',
                              learning_rate=0.05,
                              n_estimators=200,
                              max_depth=3,
                              min_child_weight=6,
                              verbosity=1)
    else:
        # NOTE: class re-weighting through
        # scale_pos_weight=(1 - pos_rate) / pos_rate is not enabled here.
        model = CatBoostClassifier(verbose=0,
                                   cat_features=cat_features,
                                   random_state=args.rs_model)

    model.fit(X_shuffled, y_shuffled)

    # The two libraries expose the effective parameters under different names.
    if use_xgboost:
        params = model.get_params()
    else:
        params = model.get_all_params()

    print('Parameters:', params)
    print('Done.')
    return model
def param_tuning(X_train, y_train):
    """Grid-search hyper-parameters for the configured boosting library.

    The module-level ``args.gb_tool`` chooses between an ``XGBClassifier``
    (value ``'xgboost'``) and a ``CatBoostClassifier`` (any other value),
    each with its own parameter grid. The data is shuffled with a fixed
    seed and searched with a 3-fold ``GridSearchCV`` scored by ``au_prc``
    on predicted probabilities, using ``args.n_jobs`` parallel jobs.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    X_shuffled, y_shuffled = shuffle(X_train, y_train, random_state=0)
    print('Searching best parameters...')

    if args.gb_tool != 'xgboost':
        estimator = CatBoostClassifier(verbose=0)
        search_space = {
            'depth': [3, 6, 9],
            'learning_rate': [0.01, 0.02, 0.05],
            'l2_leaf_reg': [1, 3, 6],
        }
    else:
        estimator = XGBClassifier(objective='binary:logistic',
                                  booster='gbtree')
        search_space = {
            "max_depth": [3],
            "min_child_weight": [6],
            "n_estimators": [100, 200, 1000],
            "learning_rate": [0.05, 0.3],
        }

    # Candidates are ranked by au_prc computed on predicted probabilities.
    pr_scorer = metrics.make_scorer(au_prc,
                                    needs_proba=True,
                                    greater_is_better=True)
    search = GridSearchCV(estimator,
                          param_grid=search_space,
                          cv=3,
                          verbose=10,
                          scoring=pr_scorer,
                          n_jobs=args.n_jobs)
    search.fit(X_shuffled, y_shuffled)

    print('Best parameters:', search.best_params_)
    best_model = search.best_estimator_
    print('Done.')
    return best_model
def train_gbtree(X_train, y_train, pos_rate, args):
    """Fit a gradient-boosting classifier chosen by ``args.gb_tool``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        pos_rate: Not used by the current implementation (it only appears
            in a disabled ``scale_pos_weight`` setting).
        args: Options object. ``args.gb_tool`` selects the library; for the
            CatBoost branch ``args.lr``, ``args.depth`` and ``args.l2`` set
            the learning rate, tree depth and L2 leaf regularisation.

    Returns:
        The fitted classifier.
    """
    print('Training model...')

    use_xgboost = args.gb_tool == 'xgboost'
    if use_xgboost:
        xgb_settings = dict(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.05,
            n_estimators=200,
            max_depth=3,
            min_child_weight=6,
            verbosity=1,
        )
        model = XGBClassifier(**xgb_settings)
    else:
        # NOTE: scale_pos_weight=(1 - pos_rate) / pos_rate is not enabled.
        model = CatBoostClassifier(verbose=0,
                                   learning_rate=args.lr,
                                   depth=args.depth,
                                   l2_leaf_reg=args.l2)

    model.fit(X_train, y_train)

    # XGBoost and CatBoost report their parameters through different methods.
    if use_xgboost:
        params = model.get_params()
    else:
        params = model.get_all_params()

    print('Parameters:', params)
    print('Done.')
    return model
def param_tuning(X, y):
    """Grid-search hyper-parameters with 5-fold CV and return the best model.

    The module-level ``args.gb_tool`` chooses between an ``XGBClassifier``
    (value ``'xgboost'``) and a ``CatBoostClassifier`` (any other value).
    The data is shuffled with a fixed seed; candidates are scored by
    ``au_prc`` on predicted probabilities and the search uses all available
    cores (``n_jobs=-1``).

    Args:
        X: Feature matrix.
        y: Labels.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    X_shuffled, y_shuffled = shuffle(X, y, random_state=0)
    print('Searching best parameters...')

    if args.gb_tool == 'xgboost':
        estimator = XGBClassifier(objective='binary:logistic',
                                  booster='gbtree')
        search_space = dict(
            max_depth=[3],
            min_child_weight=[6],
            n_estimators=[100, 200, 1000],
            learning_rate=[0.05, 0.3],
        )
        # Best Parameters (Default):
        #   {'objective': 'binary:logistic', 'base_score': 0.5,
        #    'booster': 'gbtree', 'colsample_bylevel': 1,
        #    'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0,
        #    'gpu_id': -1, 'importance_type': 'gain',
        #    'interaction_constraints': None, 'learning_rate': 0.05,
        #    'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 6,
        #    'missing': nan, 'monotone_constraints': None,
        #    'n_estimators': 1000, 'n_jobs': 0, 'num_parallel_tree': 1,
        #    'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1,
        #    'scale_pos_weight': 1, 'subsample': 1, 'tree_method': None,
        #    'validate_parameters': False, 'verbosity': 1}
    else:
        estimator = CatBoostClassifier(verbose=0)
        search_space = dict(
            depth=[3, 6, 10],
            learning_rate=[0.01, 0.05, 0.1],
            l2_leaf_reg=[1, 3, 6],
        )
        # Best Parameters (Default):
        #   {'nan_mode': 'Min', 'eval_metric': 'Logloss', 'iterations': 1000,
        #    'sampling_frequency': 'PerTree',
        #    'leaf_estimation_method': 'Newton',
        #    'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1,
        #    'boosting_type': 'Plain', 'model_shrink_mode': 'Constant',
        #    'feature_border_type': 'GreedyLogSum',
        #    'bayesian_matrix_reg': 0.10000000149011612, 'l2_leaf_reg': 3,
        #    'random_strength': 1, 'rsm': 1, 'boost_from_average': False,
        #    'model_size_reg': 0.5, 'subsample': 0.800000011920929,
        #    'use_best_model': False, 'class_names': [0, 1],
        #    'random_seed': 0, 'depth': 6, 'border_count': 254,
        #    'classes_count': 0, 'sparse_features_conflict_fraction': 0,
        #    'leaf_estimation_backtracking': 'AnyImprovement',
        #    'best_model_min_trees': 1, 'model_shrink_rate': 0,
        #    'min_data_in_leaf': 1, 'loss_function': 'Logloss',
        #    'learning_rate': 0.058687999844551086,
        #    'score_function': 'Cosine', 'task_type': 'CPU',
        #    'leaf_estimation_iterations': 10, 'bootstrap_type': 'MVS',
        #    'max_leaves': 64}

    # Candidates are ranked by au_prc computed on predicted probabilities.
    pr_scorer = metrics.make_scorer(au_prc,
                                    needs_proba=True,
                                    greater_is_better=True)
    search = GridSearchCV(estimator,
                          param_grid=search_space,
                          cv=5,
                          verbose=10,
                          scoring=pr_scorer,
                          n_jobs=-1)
    search.fit(X_shuffled, y_shuffled)

    print('Best parameters:', search.best_params_)
    best_model = search.best_estimator_
    print('Done.')
    return best_model
# Example no. 7
# 0
def train_gbtree(X_train, y_train, pos_rate, args):
    """Train CatBoost, evaluate it on the test set and pickle the results.

    The training data is shuffled with a fixed seed. For each random state
    (currently only ``0``) a ``CatBoostClassifier`` is fitted and scored on
    ``X_test`` / ``y_test``, which are read from module scope rather than
    passed in. One result row per fitted model is collected and the table
    is written to ``data/result/model_comparison/realtime_gbtree_random.pkl``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        pos_rate: Value stored verbatim in the ``pos_rate`` column of the
            result table; it does not influence training.
        args: Options object providing ``lr``, ``depth`` and ``l2`` for the
            CatBoost learning rate, tree depth and L2 leaf regularisation.

    Returns:
        None. The result table is only persisted to disk.
    """
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # Same column order the table had before: the declared columns first,
    # followed by the two extra keys carried by each row.
    columns = [
        'random_state', 'model', 'fpr', 'tpr', 'roc', 'prec', 'rec', 'prc',
        'pos_rate', 'y_test', 'y_prob'
    ]
    # Rows are gathered in a list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []

    for rs in range(1):
        classifiers = [
            CatBoostClassifier(
                verbose=0,
                # scale_pos_weight=(1 - pos_rate) / pos_rate,
                learning_rate=args.lr,
                depth=args.depth,
                l2_leaf_reg=args.l2,
                random_state=rs)
        ]
        for cls in classifiers:
            model_name = cls.__class__.__name__

            print('Round', rs)
            print('Training:', model_name)
            model = cls.fit(X_train, y_train)
            # Second column of predict_proba is used as the score.
            y_prob = model.predict_proba(X_test)[:, 1]

            # Evaluation
            fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
            prec, rec, _ = metrics.precision_recall_curve(y_test, y_prob)
            roc_auc = metrics.auc(fpr, tpr)
            prc_auc = metrics.auc(rec, prec)

            print('--------------------------------------------')
            print('Evaluation of test set:', model_name)
            print("AU-ROC:", "%0.4f" % roc_auc, "AU-PRC:",
                  "%0.4f" % prc_auc)
            print('--------------------------------------------')

            rows.append({
                'random_state': rs,
                'model': model_name,
                'fpr': fpr,
                'tpr': tpr,
                'roc': roc_auc,
                'prec': prec,
                'rec': rec,
                'prc': prc_auc,
                'y_test': y_test,
                'y_prob': y_prob,
                'pos_rate': pos_rate
            })

    result_table = pd.DataFrame(rows, columns=columns)

    save_name = 'data/result/model_comparison/realtime_gbtree_random.pkl'
    # save results
    result_table.to_pickle(save_name)
def train_model(index_pair):
    """Train one classifier with one random state and return its test metrics.

    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``pos_rate`` are
    read from module scope.

    Args:
        index_pair: ``(model_idx, rs)`` tuple. ``model_idx`` indexes the
            classifier list below (0 = CatBoost, 1 = LogisticRegression,
            2 = RandomForest, 3 = LinearSVC, 4 = MLP); ``rs`` is passed as
            ``random_state`` to the classifier.

    Returns:
        A single-row ``pandas.DataFrame`` with the model name, random state,
        ROC and precision-recall curves and their areas, ``pos_rate``,
        ``y_test`` and the predicted probabilities.
    """
    (model_idx, rs) = index_pair

    classifiers = [
        CatBoostClassifier(random_state=rs, verbose=0, learning_rate=0.02, depth=6, l2_leaf_reg=3),
        LogisticRegression(random_state=rs, penalty='l2', n_jobs=-1),
        # RandomForest and LinearSVC are wrapped in CalibratedClassifierCV.
        CalibratedClassifierCV(RandomForestClassifier(random_state=rs, n_jobs=-1)),
        CalibratedClassifierCV(LinearSVC(random_state=rs, max_iter=3000)),
        MLPClassifier(random_state=rs, max_iter=10000)
    ]
    model_names = [
        "CatBoost",
        "LogisticRegression",
        "RandomForest",
        "LinearSVC",
        "MLP",
    ]

    cls = classifiers[model_idx]
    print('Round', rs)
    print('Training:', cls.__class__.__name__)
    model = cls.fit(X_train, y_train)
    # Second column of predict_proba is used as the score.
    y_prob = model.predict_proba(X_test)[:, 1]

    # Evaluation
    fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
    prec, rec, _ = metrics.precision_recall_curve(y_test, y_prob)
    roc_auc = metrics.auc(fpr, tpr)
    prc_auc = metrics.auc(rec, prec)
    print('--------------------------------------------')
    print('Evaluation of test set:', cls.__class__.__name__)
    print("AU-ROC:", "%0.4f" % roc_auc,
          "AU-PRC:", "%0.4f" % prc_auc)
    print('--------------------------------------------')

    # Build the one-row table directly: DataFrame.append was removed in
    # pandas 2.0. Column order is the same as before (declared columns
    # first, then the extra 'y_test' and 'y_prob' keys).
    columns = ['model', 'random_state', 'fpr', 'tpr', 'roc', 'prec', 'rec',
               'prc', 'pos_rate', 'y_test', 'y_prob']
    row = {
        'model': model_names[model_idx],
        'random_state': rs,
        'fpr': fpr,
        'tpr': tpr,
        'roc': roc_auc,
        'prec': prec,
        'rec': rec,
        'prc': prc_auc,
        'y_test': y_test,
        'y_prob': y_prob,
        'pos_rate': pos_rate
    }
    result_table = pd.DataFrame([row], columns=columns)

    return result_table