Example #1
def CV_Binary_stats(X, y, model, n=10):
    '''
    Cross-validated accuracy/precision/recall for a BINARY classifier.
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    Note that some of the metrics here ONLY work for BINARY tasks.
    This will be VERY slow compared to the built-in, multicore CV implementation
    (unless used with a classifier that is parallelized anyway, such as RF).
    By default, balances sample weights when fitting.

    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''
    from sklearn.metrics import accuracy_score, precision_score, recall_score
    # Pre-0.18 scikit-learn API: the splitter takes the labels and n_iter directly.
    from sklearn.cross_validation import StratifiedShuffleSplit
    # `balance_weights` is assumed to be defined elsewhere in the original module;
    # in current scikit-learn the equivalent is
    # sklearn.utils.class_weight.compute_sample_weight('balanced', y_train).

    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0

    sss = StratifiedShuffleSplit(y, n_iter=n, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train on the balanced-weight training fold, then score the held-out fold.
        model.fit(X_train, y_train, sample_weight=balance_weights(y_train))
        preds = model.predict(X_test)

        # ROC AUC (binary only) could also be accumulated here, e.g. via
        # metrics.roc_curve(y_test, preds) and metrics.auc(fpr, tpr).
        mean_accuracy += accuracy_score(y_test, preds)
        mean_precision += precision_score(y_test, preds)
        mean_recall += recall_score(y_test, preds)

    mean_accuracy /= n
    mean_precision /= n
    mean_recall /= n
    print('mean_accuracy:  %s' % round(mean_accuracy, 3))
    print('mean_precision: %s' % round(mean_precision, 3))
    print('mean_recall:    %s' % round(mean_recall, 3))
    return (mean_accuracy, mean_precision, mean_recall)
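
A minimal usage sketch (not part of the original listing), assuming scikit-learn's make_classification for a synthetic imbalanced binary dataset and a RandomForestClassifier, whose fit method accepts sample_weight so the balanced-weight call inside CV_Binary_stats works; balance_weights must be available in the module (or swapped for compute_sample_weight('balanced', y_train)).

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Illustrative data and model only: a mildly imbalanced synthetic binary task.
X, y = make_classification(n_samples=500, n_features=20, weights=[0.8, 0.2],
                           random_state=0)
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

# Averages accuracy/precision/recall over 5 stratified shuffle splits.
mean_acc, mean_prec, mean_rec = CV_Binary_stats(X, y, clf, n=5)

For the faster built-in route the docstring contrasts with, current scikit-learn's sklearn.model_selection.cross_validate(clf, X, y, scoring=['accuracy', 'precision', 'recall'], n_jobs=-1) computes the same averaged metrics in parallel.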
Example #2
def ModelParam_GridSearch(X_train, y_train, cv=3, scoreParam='precision'):
    '''
    Basic GridSearchCV over several classifiers' performance and parameters.
    This is limited as currently implemented, but still computationally expensive.
    Not guaranteed to reach even a local optimum, but good for getting a
    rough idea of reasonable parameters for each classifier. (Does not address
    pre-processing.) More classifiers can be added as desired, and the parameter
    grids expanded.

    Later: add options for RBM + Logit; PCA; ICA; LDA.
    (Further) feature selection should be implemented within the CV pipeline
    to avoid overfitting.
    See also
    http://scikit-learn-laboratory.readthedocs.org/en/latest/_modules/skll/learner.html

    Possible scoreParam values: 'f1', 'accuracy', 'precision', 'roc_auc', ...
    '''
    # Pre-0.18 scikit-learn API (grid_scores_, iid, fit_params, n_folds).
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    # `balance_weights` and the `report` grid-score helper are assumed to be
    # defined elsewhere in the original module.

    pipeline1 = RandomForestClassifier(n_jobs=-1)
    pipeline2 = SVC(cache_size=1900)
    pipeline3 = GradientBoostingClassifier()
    pipeline4 = LogisticRegression()

    # RandomForestClassifier:
    parameters1 = {
        'n_estimators': [120],
        'criterion': ['gini'],
        'max_features': ['auto', 0.4],
        'min_samples_leaf': [1, 2],
        'min_samples_split': [2, 3],
        'n_jobs': [-1],
        'max_depth': [8, None]
    }

    # SVC:
    parameters2 = {
        'C': [0.2, 1, 10, 50, 100, 1000],
        'kernel': ['rbf'],  # could also try 'linear', 'poly', 'sigmoid'
        'gamma': [0.1, 0.0, 1.0],
        'cache_size': [1900],
        'class_weight': ['auto', None],
    }

    # GradientBoostingClassifier:
    parameters3 = {
        'max_depth': [5, 7],
        'n_estimators': [80],
        # 'min_samples_leaf': [1, 2],
        # 'learning_rate': [0.1, 0.05],
        'max_features': ['auto', 0.4]
    }

    # LogisticRegression:
    parameters4 = {
        'C': [1.0, 10, 100],
        'penalty': ['l1', 'l2'],
        'class_weight': ['auto', None]
    }

    pars = [parameters1, parameters2, parameters3, parameters4]
    pips = [pipeline1, pipeline2, pipeline3, pipeline4]

    # Store and return the best estimator found (and its score).
    bestEst = None
    bestScore = 0

    print("Starting gridsearch to find best model hyperparameters.")

    # Grid search is done "in bits" because some classifiers do not support sample_weight.
    def gs_fit(gs):
        nonlocal bestEst
        nonlocal bestScore
        gs.fit(X_train, y_train)
        report(gs.grid_scores_)
        # http://stackoverflow.com/questions/18210799/scikit-learn-sample-try-out-with-my-classifier-and-data
        if gs.best_score_ > bestScore:
            bestEst = gs.best_estimator_
            bestScore = gs.best_score_
            print("Updated best estimator, new best score:", bestScore)

    # Originally: for i in range(len(pars)).  Here only the first two grids
    # (RF, SVC) are searched with balanced sample weights; LogisticRegression
    # is handled separately below without fit_params.
    for i in range(2):
        clf_name = str(pips[i])
        print(clf_name[0:clf_name.index("(")])
        gs = GridSearchCV(estimator=pips[i], param_grid=pars[i],
                          verbose=1, refit=True, n_jobs=-1, iid=False,
                          fit_params={'sample_weight': balance_weights(y_train)},
                          pre_dispatch='1.5*n_jobs', scoring=scoreParam,
                          cv=StratifiedKFold(y_train, n_folds=cv, shuffle=True))
        # Valid scoring options include: 'accuracy', 'average_precision', 'f1',
        # 'precision', 'recall', 'roc_auc'.
        gs_fit(gs)

    i = 3  # LogisticRegression (no sample_weight passed via fit_params)
    gs = GridSearchCV(estimator=pips[i], param_grid=pars[i],
                      verbose=0, refit=True, n_jobs=-1, iid=True,
                      pre_dispatch='1.5*n_jobs', scoring=scoreParam,
                      cv=StratifiedKFold(y_train, n_folds=cv, shuffle=True))
    gs_fit(gs)

    # http://stackoverflow.com/questions/13051706/scikit-learn-using-sample-weight-in-grid-search?rq=1
    # http://stackoverflow.com/questions/20082674/unbalanced-classification-using-randomforestclassifier-in-sklearn
    # Class weights -> sample weights: https://github.com/scikit-learn/scikit-learn/blob/8dab222cfe894126dfb67832da2f4e871b87bce7/sklearn/utils/class_weight.py

    print("Best Predictor:", bestEst, "Score:", bestScore)
    return (bestEst, bestScore)
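
A hedged usage sketch (not in the original source): it assumes the same pre-0.18 scikit-learn environment the function targets, with balance_weights and a report helper defined in the module, and simply runs the search on a small synthetic training set from make_classification.

from sklearn.datasets import make_classification

# Illustrative call only; scoreParam can be any valid scoring string
# ('f1', 'accuracy', 'precision', 'roc_auc', ...).
X_train, y_train = make_classification(n_samples=300, n_features=15,
                                        weights=[0.7, 0.3], random_state=1)
best_model, best_score = ModelParam_GridSearch(X_train, y_train, cv=3,
                                               scoreParam='roc_auc')
print("Selected model:", best_model, "with CV score", round(best_score, 3))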