def CV_Binary_stats(X, y, model, n=10):
    '''
    Cross-validated accuracy, precision, and recall for a BINARY classifier.
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    Note that some of the metrics here ONLY work for BINARY tasks.
    This will be VERY slow compared to the built-in, multicore CV
    implementation (unless used with a classifier that is parallelized
    anyway, such as RF).
    By default, balances class weights when fitting.
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''
    from sklearn.metrics import precision_score, accuracy_score, recall_score
    from sklearn.cross_validation import StratifiedShuffleSplit  # pre-0.18 CV API

    mean_auc = 0.0
    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0
    sss = StratifiedShuffleSplit(y, n_iter=n, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Alternative: for each iteration, randomly hold out a fraction of the
        # data as the CV set (non-stratified):
        # X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        #     X, y, test_size=0.15, random_state=i)

        # Train the model with balanced sample weights, then predict on the
        # held-out fold.
        model.fit(X_train, y_train, sample_weight=balance_weights(y_train))
        preds = model.predict(X_test)
        '''
        # ROC_AUC - restricted to the binary (not multiclass) case.
        fpr, tpr, thresholds = metrics.roc_curve(y_test, preds)
        roc_auc = metrics.auc(fpr, tpr)
        mean_auc += roc_auc
        '''
        accuracy = accuracy_score(y_test, preds)
        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)
        mean_accuracy += accuracy
        mean_precision += precision
        mean_recall += recall

    mean_accuracy /= n
    mean_precision /= n
    mean_recall /= n
    # mean_auc /= n
    print('mean_accuracy: %s' % round(mean_accuracy, 3))
    print('mean_precision: %s' % round(mean_precision, 3))
    print('mean_recall: %s' % round(mean_recall, 3))
    # print('mean_auc: %s' % round(mean_auc, 3))
    return (mean_accuracy, mean_precision, mean_recall)
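# `balance_weights` (used by both functions in this module) came from the old
# sklearn.preprocessing module; it was deprecated around 0.16 and later removed.
# A minimal compatibility sketch, assuming a scikit-learn release (>= 0.17)
# where compute_sample_weight('balanced', y) is the documented replacement:
try:
    from sklearn.preprocessing import balance_weights
except ImportError:
    from sklearn.utils.class_weight import compute_sample_weight

    def balance_weights(y):
        # Inverse-class-frequency sample weights, matching the removed helper.
        return compute_sample_weight('balanced', y)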
def ModelParam_GridSearch(X_train, y_train, cv=3, scoreParam='precision'):
    '''
    Basic GridSearchCV over multiple classifiers' performance & parameters.
    This is limited as currently implemented, yet still computationally
    expensive. It is not guaranteed to reach even a local optimum, but it is
    good for getting a rough idea of suitable parameters for the classifiers.
    (It does not address pre-processing.)
    More classifiers can be added as desired, and the parameter grids expanded.
    Later: add options for RBM + Logit; PCA; ICA; LDA.
    (Further) feature selection should be implemented within the CV pipeline,
    if you wish to avoid overfitting.
    See also:
    http://scikit-learn-laboratory.readthedocs.org/en/latest/_modules/skll/learner.html
    Possible values for scoreParam: 'f1', 'accuracy', 'precision', 'roc_auc', ...
    '''
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.grid_search import GridSearchCV          # pre-0.18 API location
    from sklearn.cross_validation import StratifiedKFold  # pre-0.18 API location

    # pipeline1 = Pipeline([('clf', RandomForestClassifier())])
    pipeline1 = RandomForestClassifier(n_jobs=-1)
    pipeline2 = SVC(cache_size=1900)
    pipeline3 = GradientBoostingClassifier()
    pipeline4 = LogisticRegression()

    # RandomForestClassifier:
    parameters1 = {
        'n_estimators': [120],
        'criterion': ['gini'],
        'max_features': ['auto', 0.4],
        'min_samples_leaf': [1, 2],
        'min_samples_split': [2, 3],
        'n_jobs': [-1],
        'max_depth': [8, None]
    }

    # SVC:
    parameters2 = {
        'C': [0.2, 1, 10, 50, 100, 1000],
        # 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.0, 1.0],
        'cache_size': [1900],
        'class_weight': ['auto', None],
    }

    # GradientBoostingClassifier:
    parameters3 = {
        'max_depth': [5, 7],
        'n_estimators': [80],
        # 'min_samples_leaf': [1, 2],
        # 'learning_rate': [0.1, 0.05],
        'max_features': ['auto', 0.4]
    }

    # LogisticRegression:
    parameters4 = {
        'C': [1.0, 10, 100],
        'penalty': ['l1', 'l2'],
        'class_weight': ['auto', None]
    }

    pars = [parameters1, parameters2, parameters3, parameters4]
    pips = [pipeline1, pipeline2, pipeline3, pipeline4]

    # Store and return the best estimator found (and its score).
    bestEst = None
    bestScore = 0
    print("Starting gridsearch to find best model hyperparameters.")

    # The grid search is done "in bits" because some classifiers do not
    # support sample_weight.
    def gs_fit(gs):
        nonlocal bestEst
        nonlocal bestScore
        gs.fit(X_train, y_train)
        report(gs.grid_scores_)
        # http://stackoverflow.com/questions/18210799/scikit-learn-sample-try-out-with-my-classifier-and-data
        if gs.best_score_ > bestScore:
            bestEst = gs.best_estimator_
            bestScore = gs.best_score_
            print("Updated best estimator; new best score:", bestScore)

    # for i in range(len(pars)):  # Orig: search every classifier in `pips`.
    for i in range(2):  # Currently only RF and SVC; GBC (i=2) is skipped.
        clf_name = str(pips[i])
        print(clf_name[0:clf_name.index("(")])
        gs = GridSearchCV(estimator=pips[i], param_grid=pars[i], verbose=1,
                          refit=True, n_jobs=-1, iid=False,
                          fit_params={'sample_weight': balance_weights(y_train)},
                          pre_dispatch='1.5*n_jobs', scoring=scoreParam,
                          cv=StratifiedKFold(y_train, n_folds=cv, shuffle=True))
        # Valid scoring options: ['accuracy', 'average_precision', 'f1',
        # 'precision', 'recall', 'roc_auc']
        gs_fit(gs)

    i = 3  # LogisticRegression (fitted without sample_weight)
    gs = GridSearchCV(estimator=pips[i], param_grid=pars[i], verbose=0,
                      refit=True, n_jobs=-1, iid=True,
                      pre_dispatch='1.5*n_jobs', scoring=scoreParam,
                      cv=StratifiedKFold(y_train, n_folds=cv, shuffle=True))
    gs_fit(gs)

    # http://stackoverflow.com/questions/13051706/scikit-learn-using-sample-weight-in-grid-search?rq=1
    # http://stackoverflow.com/questions/20082674/unbalanced-classification-using-randomforestclassifier-in-sklearn
    # Set class weights (then into sample weights):
    # https://github.com/scikit-learn/scikit-learn/blob/8dab222cfe894126dfb67832da2f4e871b87bce7/sklearn/utils/class_weight.py

    print("Best predictor:", bestEst, "Score:", bestScore)
    return (bestEst, bestScore)