indices = np.argsort(importances)[::-1] print('{}特征权值分布为: '.format(model_name)) for f in range(X_train.shape[1]): print("%d. feature %d [%s] (%f)" % (f + 1, indices[f], features_to_train[indices[f]], importances[indices[f]])) features_distribution.append( (f + 1, indices[f], features_to_train[indices[f]], importances[indices[f]])) important_features.append(features_to_train[indices[f]]) print(important_features) except AttributeError: print('{} has no feture_importances_'.format(model_name)) try: y_score = clf.decision_function(X_test)[:, 1] except AttributeError: print('{} has no decision_function, use predict func.'.format( model_name)) y_score = clf.predict_proba(X_test)[:, 1] # Compute ROC curve and ROC area for each class roc_auc = roc_auc_score(y_test, y_score, sample_weight=None) # Plot ROC curve print('{} ROC curve (area = {})'.format(model_name, roc_auc)) print('Saving model %s to %s......' % (model_name, model_file)) joblib.dump(clf, model_file) model_metainfo = { 'sub_file': sub_file,
def _catboost_positive_scores(model, x):
    """Return column 1 of CatBoost's 'Probability' predictions for ``x``."""
    return model.predict(x, prediction_type='Probability')[:, 1]


def _proba_positive_scores(model, x):
    """Return column 1 of ``model.predict_proba(x)``."""
    return model.predict_proba(x)[:, 1]


def _decision_scores(model, x):
    """Return the raw ``decision_function`` scores of ``model`` for ``x``."""
    return model.decision_function(x)


def _fit_randomized_search(estimator, param_distributions, X_train, y_train):
    """Tune ``estimator`` with a 50-draw, 5-fold randomized search.

    Returns the search's ``best_estimator_`` after fitting it on
    ``(X_train, y_train)``.
    """
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=50,
        cv=5,
        verbose=0,
        random_state=42,
        n_jobs=-1,
    )
    # Tune hyperparameters on the training split only.
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    # NOTE(review): with RandomizedSearchCV's default refit behaviour the best
    # estimator should already be fitted on this split; the explicit fit is
    # kept so the sequence of fit calls matches the previous implementation.
    best_model.fit(X_train, y_train)
    return best_model


def cvgrid_search(X, y, X_test, model_type):
    """Tune, evaluate and refit a binary classifier of the requested type.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``) into a
    training and a validation part. Hyperparameters are tuned on the training
    part, ROC-AUC and average precision are computed on both parts, and the
    tuned model is finally refitted on the full ``(X, y)``.

    Args:
        X: Feature matrix used for tuning and the final fit.
        y: Binary target aligned with ``X``.
        X_test: Feature matrix to score with the final model.
        model_type: One of ``'cat'`` (CatBoost grid search), ``'rf'``
            (random forest), ``'lr'`` (logistic regression) or ``'svm'``
            (SVC, scored through ``decision_function``). Search spaces are
            read from the module-level ``PARAMS`` mapping.

    Returns:
        A 3-tuple ``((scores_X, scores_X_test), (train_auc, val_auc),
        (train_ap, val_ap))``. The scores come from the model refitted on
        all of ``X``; the metrics come from the model fitted on the training
        split only.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    supported_types = ('cat', 'rf', 'lr', 'svm')
    # Fail fast: previously an unknown type fell through every branch and
    # crashed later with an UnboundLocalError on ``best_model``/``get_preds``.
    if model_type not in supported_types:
        raise ValueError(
            'Unsupported model_type {!r}; expected one of {}'.format(
                model_type, ', '.join(supported_types)))

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, shuffle=True, test_size=0.2, random_state=42)

    if model_type == 'cat':
        model = CatBoostClassifier(**PARAMS['cat_def'])
        grid_search_result = model.grid_search(
            PARAMS['cat_cv'],
            X=X_train,
            y=y_train,
            plot=False,
            cv=5,
            stratified=True,
            verbose=0,
        )
        # Rebuild the classifier from the defaults plus the winning grid
        # point, then fit it on the training split with the validation split
        # as eval set.
        best_model = CatBoostClassifier(**PARAMS['cat_def'],
                                        **grid_search_result['params'])
        best_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=0)
        get_preds = _catboost_positive_scores
    elif model_type == 'rf':
        best_model = _fit_randomized_search(
            RandomForestClassifier(), PARAMS['rf_cv'], X_train, y_train)
        get_preds = _proba_positive_scores
    elif model_type == 'lr':
        best_model = _fit_randomized_search(
            linear_model.LogisticRegression(), PARAMS['lr_cv'],
            X_train, y_train)
        get_preds = _proba_positive_scores
    else:  # model_type == 'svm'
        best_model = _fit_randomized_search(
            SVC(), PARAMS['svm_cv'], X_train, y_train)
        # SVC is scored with its decision function rather than probabilities.
        get_preds = _decision_scores

    y_pred_train = get_preds(best_model, X_train)
    y_pred_val = get_preds(best_model, X_val)

    train_auc = roc_auc_score(y_train, y_pred_train)
    val_auc = roc_auc_score(y_val, y_pred_val)
    train_ap = average_precision_score(y_train, y_pred_train)
    val_ap = average_precision_score(y_val, y_pred_val)

    # Final fit on all available labelled data before scoring X and X_test.
    best_model.fit(X, y)
    return ((get_preds(best_model, X), get_preds(best_model, X_test)),
            (train_auc, val_auc),
            (train_ap, val_ap))