Example #1
        # Rank features by importance (descending)
        indices = np.argsort(importances)[::-1]
        print('{} feature importance distribution: '.format(model_name))
        for f in range(X_train.shape[1]):
            print("%d. feature %d [%s] (%f)" %
                  (f + 1, indices[f], features_to_train[indices[f]],
                   importances[indices[f]]))
            features_distribution.append(
                (f + 1, indices[f], features_to_train[indices[f]],
                 importances[indices[f]]))
            important_features.append(features_to_train[indices[f]])
        print(important_features)
    except AttributeError:
        print('{} has no feature_importances_'.format(model_name))

    try:
        # Prefer the decision function as the score (1-D for binary classifiers)
        y_score = clf.decision_function(X_test)
    except AttributeError:
        print('{} has no decision_function, falling back to predict_proba.'.format(
            model_name))
        y_score = clf.predict_proba(X_test)[:, 1]

    # Compute ROC AUC on the held-out test set
    roc_auc = roc_auc_score(y_test, y_score, sample_weight=None)

    # Report the ROC AUC
    print('{} ROC curve (area = {})'.format(model_name, roc_auc))

    print('Saving model %s to %s......' % (model_name, model_file))
    joblib.dump(clf, model_file)
    model_metainfo = {
        'sub_file': sub_file,
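Example #1 is a fragment of a larger training routine: `importances`, `features_to_train`, `clf`, `model_name`, `model_file` and the metadata dict all come from code that is not shown here. Below is a minimal, self-contained sketch of the same rank-features / score / save pattern on synthetic data; every name in it is a stand-in chosen for illustration, not the original project's.

import joblib
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Synthetic stand-ins for the variables the fragment above assumes
features_to_train = ['f{}'.format(i) for i in range(10)]
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model_name, model_file = 'rf', 'rf_model.joblib'

clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Rank features by importance, as in the fragment above
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%d. feature %d [%s] (%f)" % (f + 1, indices[f],
                                        features_to_train[indices[f]],
                                        importances[indices[f]]))

# Score with decision_function when available, otherwise predict_proba
try:
    y_score = clf.decision_function(X_test)
except AttributeError:
    y_score = clf.predict_proba(X_test)[:, 1]

print('{} ROC AUC = {}'.format(model_name, roc_auc_score(y_test, y_score)))
joblib.dump(clf, model_file)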
Example #2
from catboost import CatBoostClassifier
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC

# PARAMS (per-model default params and search grids) is defined elsewhere
# in the source module.


def cvgrid_search(X, y, X_test, model_type):
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      stratify=y,
                                                      shuffle=True,
                                                      test_size=0.2,
                                                      random_state=42)

    if model_type == 'cat':
        model = CatBoostClassifier(**PARAMS['cat_def'])
        grid_search_result = model.grid_search(PARAMS['cat_cv'],
                                               X=X_train,
                                               y=y_train,
                                               plot=False,
                                               cv=5,
                                               stratified=True,
                                               verbose=0)
        # Note: PARAMS['cat_def'] and the tuned params must not share keys,
        # or the double unpacking below raises a TypeError.
        best_model = CatBoostClassifier(**PARAMS['cat_def'],
                                        **grid_search_result['params'])

        best_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=0)
        get_preds = lambda model, x: model.predict(
            x, prediction_type='Probability')[:, 1]

    elif model_type == 'rf':
        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=PARAMS['rf_cv'],
                                       n_iter=50,
                                       cv=5,
                                       verbose=0,
                                       random_state=42,
                                       n_jobs=-1)
        # Tune hyperparams
        rf_random.fit(X_train, y_train)
        best_model = rf_random.best_estimator_
        # Refit on the training split only; the validation split stays held out
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.predict_proba(x)[:, 1]

    elif model_type == 'lr':
        lr = linear_model.LogisticRegression()
        lr_random = RandomizedSearchCV(estimator=lr,
                                       param_distributions=PARAMS['lr_cv'],
                                       n_iter=50,
                                       cv=5,
                                       verbose=0,
                                       random_state=42,
                                       n_jobs=-1)
        # Tune hyperparams
        lr_random.fit(X_train, y_train)
        best_model = lr_random.best_estimator_
        # Refit on the training split only; the validation split stays held out
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.predict_proba(x)[:, 1]

    elif model_type == 'svm':
        svc = SVC()
        svc_random = RandomizedSearchCV(estimator=svc,
                                        param_distributions=PARAMS['svm_cv'],
                                        n_iter=50,
                                        cv=5,
                                        verbose=0,
                                        random_state=42,
                                        n_jobs=-1)
        # Tune hyperparams
        svc_random.fit(X_train, y_train)
        best_model = svc_random.best_estimator_
        # Refit on the training split only; the validation split stays held out
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.decision_function(x)
    else:
        raise ValueError('Unknown model_type: {}'.format(model_type))

    y_pred_train = get_preds(best_model, X_train)
    y_pred_val = get_preds(best_model, X_val)
    train_auc = roc_auc_score(y_train, y_pred_train)
    val_auc = roc_auc_score(y_val, y_pred_val)
    train_ap = average_precision_score(y_train, y_pred_train)
    val_ap = average_precision_score(y_val, y_pred_val)

    # Final fit on all labelled data before scoring X and X_test
    best_model.fit(X, y)

    return ((get_preds(best_model, X), get_preds(best_model, X_test)),
            (train_auc, val_auc),
            (train_ap, val_ap))
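`cvgrid_search` depends on a module-level `PARAMS` dict that this example does not show. The sketch below illustrates one plausible shape for it and a call on synthetic data; the keys mirror the ones referenced above ('cat_def', 'cat_cv', 'rf_cv', 'lr_cv', 'svm_cv'), but the concrete grids and values are assumptions, not the original project's.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Hypothetical PARAMS layout; the real grids live in the source module.
PARAMS = {
    'cat_def': {'iterations': 200, 'random_seed': 42},
    'cat_cv': {'depth': [4, 6, 8], 'learning_rate': [0.03, 0.1]},
    'rf_cv': {'n_estimators': [100, 200, 300, 500],
              'max_depth': [None, 5, 10, 20, 30],
              'min_samples_split': [2, 5, 10]},
    'lr_cv': {'C': np.logspace(-3, 2, 60)},
    'svm_cv': {'C': np.logspace(-2, 2, 20), 'gamma': ['scale', 'auto']},
}

X_all, y_all = make_classification(n_samples=1000, n_features=20, random_state=42)
X, X_test, y, _ = train_test_split(X_all, y_all, stratify=y_all,
                                   test_size=0.2, random_state=42)

(preds_full, preds_test), (train_auc, val_auc), (train_ap, val_ap) = \
    cvgrid_search(X, y, X_test, model_type='rf')
print('AUC train/val: {:.3f}/{:.3f}, AP train/val: {:.3f}/{:.3f}'.format(
    train_auc, val_auc, train_ap, val_ap))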