# Example 1
def fit_logistic_GLM(X, y, 
                     C_value = (-4, 5), 
                     num_cv = 5, 
                     verbose = False, 
                     intercept_scaling = 10,
                     penalty = 'l1',
                     reg_strength = None,
                     plot_results = False
                     ):
    """Fit a regularized logistic regression (GLM) with cross-validation.

    Parameters
    ----------
    X : array-like, (n_samples, n_features)
        Feature matrix; standardized with ``pp.scale`` before fitting.
    y : array-like, (n_samples,)
        Class labels, used to build stratified folds.
    C_value : pair (lo, hi)
        log10 bounds of the grid searched for C (default 1e-4 .. 1e5).
        A tuple default replaces the original mutable ``np.array`` default.
    num_cv : int
        Number of stratified CV folds (outer and inner).
    verbose : bool
        Print grid-search scores while fitting.
    intercept_scaling : float
        Passed through to LogisticRegression.
    penalty : str
        'l1' or 'l2'.
    reg_strength : float or None
        C to use. When None, a nested grid search runs per fold and C is
        set to the geometric mean of the per-fold winners.
    plot_results : bool
        If True, call ``plot_logistic_fit`` on the fitted models.

    Returns
    -------
    (models per fold, accuracy per fold, loss per fold, C used, CV folds)
    """
    scores_to_return = []
    loss_score = []
    X = pp.scale(X)
    # If regularization strength isn't specified, CV to find it
    if reg_strength is None:  # 'is None' — '==' is ambiguous if an array is passed
        kf = SKFold(y=y, n_folds=num_cv)
        C_values = np.logspace(C_value[0], C_value[1], 10)
        C_dict = {"C": C_values}
        best_param = []
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            # Grid search for the regularization parameter on this fold
            clf_search = GSCV(
                        LR(C=1, penalty=penalty, dual=False, intercept_scaling=intercept_scaling),
                        C_dict,
                        cv=num_cv
                        )
            clf_search.fit(X_train, y_train)
            best_param.append(clf_search.best_params_['C'])
            if verbose:
                for params, mean_score, scores in clf_search.grid_scores_:
                    print("%0.3f (+/-%0.03f) for %r"
                          % (mean_score, scores.std() / 2, params))
        if verbose:
            # NOTE(review): 'scores' here is only the last grid cell of the
            # last fold (kept from the original) — confirm this is intended.
            print(np.mean(np.asarray(scores)))
        # C lives on a log scale, so combine fold winners with a geometric mean
        reg_strength = gmean(best_param)
    #------------------------------------------------------------------------------
    kf2 = SKFold(y=y, n_folds=num_cv)
    clf = []
    for train, test in kf2:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        # Build a fresh estimator per fold: the original appended one shared
        # object, so every list entry referenced only the LAST fold's fit.
        clf_temp = LR(
                 penalty=penalty, 
                 dual=False,
                 C=reg_strength,
                 intercept_scaling=intercept_scaling
                 )
        clf_temp.fit(X_train, y_train)
        scores_to_return.append(clf_temp.score(X_test, y_test))
        clf.append(clf_temp)
        pred = clf_temp.predict_proba(X_test)[:, 1]
        loss_score.append(lossFx(y_test, pred))
    #------------------------------------------------------------------------------
    # Plot results
    if plot_results:
        plot_logistic_fit(clf, X, kf2)
    # Returns models, CV accuracies, CV losses, C parameter used, CV fold indices
    return clf, scores_to_return, loss_score, reg_strength, kf2
# Example 2
def fit_RF(X, y,
           num_estimators = None, 
           verbose = False,
           plot_importance = False,
           num_cv = 5):
    """Fit a random forest classifier with cross-validation.

    Parameters
    ----------
    X : array-like, (n_samples, n_features)
        Feature matrix; standardized with ``pp.scale`` before fitting.
    y : array-like, (n_samples,)
        Class labels, used to build stratified folds.
    num_estimators : int or None
        Number of trees. When None, a nested grid search runs per fold and
        the (rounded) geometric mean of the per-fold winners is used.
    verbose : bool
        Print grid-search scores and top-5 feature rankings.
    plot_importance : bool
        Plot the top-10 feature importances per fold.
    num_cv : int
        Number of stratified CV folds (outer and inner).

    Returns
    -------
    (models per fold, accuracy per fold, loss per fold, n_estimators used, CV folds)
    """
    X = pp.scale(X)
    # If num_estimators not provided, CV to estimate it
    if num_estimators is None:  # 'is None' — '==' is ambiguous if an array is passed
        kf = SKFold(y=y, n_folds=num_cv)
        # NOTE(review): this grid tops out at 10**n_features trees, which is
        # astronomically large for wide X — verify the intended upper bound
        # (kept as in the original).
        estimator_nums = np.logspace(0, np.size(X, axis=1), 10)
        estimator_nums = estimator_nums.astype(int)
        est_dict = {"n_estimators": estimator_nums}
        best_param = []
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            # Grid search for the n_estimators parameter on this fold
            clf_search = GSCV(
                        RF(n_estimators=1),
                        est_dict,
                        cv=num_cv
                        )
            clf_search.fit(X_train, y_train)
            best_param.append(clf_search.best_params_['n_estimators'])
            if verbose:
                for params, mean_score, scores in clf_search.grid_scores_:
                    print("%0.3f (+/-%0.03f) for %r"
                          % (mean_score, scores.std() / 2, params))
        if verbose:
            # NOTE(review): 'scores' here is only the last grid cell of the
            # last fold (kept from the original) — confirm this is intended.
            print(np.mean(np.asarray(scores)))
        # gmean returns a float; n_estimators must be an integer tree count
        num_estimators = int(round(gmean(best_param)))
    #------------------------------------------------------------------------------
    # Measure accuracy
    kf2 = SKFold(y=y, n_folds=num_cv)
    clf = []
    accuracy = []
    importances = []
    loss_score = []
    for train, test in kf2:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        # Build a fresh forest per fold: the original appended one shared
        # object, so every list entry referenced only the LAST fold's fit.
        clf_temp = RF(n_estimators=num_estimators,
                      n_jobs=1,
                      verbose=verbose)
        clf_temp.fit(X_train, y_train)
        clf.append(clf_temp)
        pred = clf_temp.predict_proba(X_test)[:, 1]
        loss_score.append(lossFx(y_test, pred))
        accuracy.append(clf_temp.score(X_test, y_test))
        # Rank THIS fold's importances; the original argsorted the whole
        # list of per-fold arrays and then indexed a list with an array.
        fold_importances = clf_temp.feature_importances_
        importances.append(fold_importances)
        std = np.std([tree.feature_importances_ for tree in clf_temp.estimators_], axis=0)
        indices = np.argsort(fold_importances)[::-1]
    #------------------------------------------------------------------------------
        if verbose:
            print("Feature ranking:")
            for f in range(5):
                print("%d. feature %d (%f)" % (f + 1, indices[f], fold_importances[indices[f]]))
    #------------------------------------------------------------------------------
        if plot_importance:
            # Top-10 features only; slicing fixes the original's length
            # mismatch between range(10) and the full importance vector.
            top = indices[:10]
            plot.figure()
            plot.title("Feature importances")
            plot.bar(range(10), fold_importances[top],
                     color="r", yerr=std[top], align="center")
            plot.xticks(range(10), top)
            plot.xlim([-1, 10])
            plot.show()
    return clf, accuracy, loss_score, num_estimators, kf2