def main():
    data = proc.read_data()
    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, yfill, test_size=0.20, random_state=42, stratify=yfill)
    # Oversample the minority class in the training set (ratio r=0.3)
    X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
    # plot_roc(X_train, y_train, 'LogisticRegression', LogisticRegression(C=1e5, penalty='l2'))
    '''
    model_over = runLR(X_train_over, X_test, y_train_over, y_test)
    test_results(model_over, X_test, y_test)
    '''
    # Fit on the plain (non-oversampled) training data and report test metrics
    model = runLR(X_train.values, X_test, y_train.values, y_test)
    test_results(model, X_test, y_test)
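
# Illustrative sketch only: runLR and test_results are defined elsewhere in
# this repo. Assuming scikit-learn, a minimal fit/evaluate pair could look
# like the functions below; the names run_lr_sketch/test_results_sketch and
# the hyperparameters are placeholders, not the repo's actual implementation.
def run_lr_sketch(X_train, y_train, C=1e5, penalty='l2'):
    """Fit a plain scikit-learn logistic regression (illustrative only)."""
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(C=C, penalty=penalty, max_iter=1000)
    model.fit(X_train, y_train)
    return model

def test_results_sketch(model, X_test, y_test):
    """Print standard classification metrics for a fitted model."""
    from sklearn.metrics import classification_report
    print(classification_report(y_test, model.predict(X_test)))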
def do_grid_search(data):
    # Get the data from our function above
    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, yfill, test_size=0.20, random_state=42, stratify=yfill)

    # Initialize our model here
    # original: est = RandomForestClassifier()
    est = RandomForestClassifier(bootstrap=True, criterion="gini",
                                 class_weight="balanced_subsample")

    # These are the params we are tuning, i.e. the knobs the scikit-learn
    # docs expose on RandomForestClassifier.
    param_grid = {
        "max_depth": [3, 5, 10, 30, 50, 100],
        "max_features": [1, 3, 10, 30],
        "min_samples_split": [2, 3, 10],
        "min_samples_leaf": [2, 3, 10]
    }
    # Best params from a previous run:
    # {'max_depth': 10, 'max_features': 30, 'min_samples_leaf': 2, 'min_samples_split': 2}
    '''
    Full param grid:
    param_grid = {"max_depth": [3, 5, 10, 30],
                  "max_features": [1, 3, 10, 30],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [2, 3, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"],
                  "class_weight": [None, "balanced_subsample"]}
    '''
    # Plug in our model, params dict, and the number of jobs, then .fit()
    gs_cv = GridSearchCV(est, param_grid, n_jobs=2).fit(X_train, y_train)

    # Return the best score and the best params
    return gs_cv.best_score_, gs_cv.best_params_
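
# Illustrative usage of do_grid_search (a sketch, not part of the original
# script): proc.read_data is the same loader used in main above. Left
# commented so importing this module never kicks off a grid search.
# data = proc.read_data()
# best_score, best_params = do_grid_search(data)
# print('Best CV score: {:.3f}'.format(best_score))
# print('Best params:', best_params)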
    plt.plot(num_trees, recall)
    # plt.ylim((0.8, 1))
    plt.savefig('recall_vs_numtrees_{}.png'.format(graphid))
    plt.close()


if __name__ == '__main__':
    data = proc.read_data()
    # bits, yfill = bits_yfill(data)
    # X_train, X_test, y_train, y_test = train_test_split(
    #     bits, yfill, test_size=0.20, random_state=42, stratify=yfill)
    # for num in range(10):
    #     rffit = RandomForestClass(X_train, X_test, y_train, y_test)
    #     feature_importance(bits, rffit)
    #     plot_features(bits, rffit, 20, 'bits', num)
    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, yfill, test_size=0.20, random_state=1, stratify=yfill)
    # Oversample the minority class in the training set (ratio r=0.3)
    X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
    rffit, y_predict = randomforest(X_train_over, X_test, y_train_over, y_test,
                                    num_est=50, cls_w='balanced_subsample')
    precision, recall, median_recall_index, medianrecall_threshold = set_threshold(
        rffit, X_train, X_test, y_train, y_test)
    print_threshold(rffit, X_train, X_test, y_train, y_test, medianrecall_threshold)
    feature_importance(features, rffit)
    '''
    Manual thresholding experiment:
    pprob = rffit.predict_proba(X_test)
    pdf = pd.DataFrame(pprob)
    print(pdf)
    pdf['myH'] = pdf[1].map(lambda x: 1 if x > 0.35 else 0)
    my_pred = pdf['myH'].values
    '''
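
# Hypothetical helper illustrating the manual-thresholding idea sketched in
# the commented block above: predict class 1 whenever its probability clears
# a custom cutoff instead of the default 0.5. The name and the 0.35 default
# are assumptions taken from that experiment, not part of the original code.
def predict_with_threshold(model, X, threshold=0.35):
    """Return 0/1 labels from predict_proba using a custom cutoff."""
    prob_pos = model.predict_proba(X)[:, 1]  # P(class == 1) for each row
    return (prob_pos >= threshold).astype(int)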
def uploaded_file(filename):
    if request.method == 'POST':
        # Check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # If the user does not select a file, the browser may also
        # submit an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            print(filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return redirect(url_for('uploaded_HTS', filename=filename))
    elif request.method == 'GET':
        # Make a pd.DataFrame of the training data
        df = pd.read_csv(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        # Use all features and yfill (no NaNs; missing values filled with 0)
        features, yfill = proc.features_yfill(df)
        # Train/test split at 20%
        X_train, X_test, y_train, y_test = rf.train_test_split(
            features, yfill, test_size=0.20, random_state=1, stratify=yfill)
        # Optional: oversampling of the minority class for training purposes
        # X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
        # rffit, y_predict = rf.randomforest(X_train_over, X_test, y_train_over,
        #                                    y_test, num_est=50, cls_w='balanced_subsample')
        # Fit the Random Forest classifier; would like to add in a grid search
        rffit, y_predict = rf.randomforest(X_train.values, X_test, y_train.values, y_test)
        # Use below to run a grid search ... takes too long to work right now
        # rffit, y_predict = rf.randomforest(X_train.values, X_test, y_train.values,
        #                                    y_test, grid_search='small')
        # Pickle the fit model for use with test data
        proc._pickle(rffit, 'RFC_fit.pkl')
        # set_threshold_recall determines the threshold that optimizes recall:
        # the median of the available thresholds that return the second-best
        # recall (i.e. not 1.0)
        precision_list, recall_list, median_recall_index, medianrecall_threshold = \
            rf.set_threshold_recall(rffit, X_train, X_test, y_train, y_test)
        # print_threshold uses the trained model and the selected threshold
        # (here recall-optimized) to return the statistics listed below
        precision, recall, fpr, fpr_test, tpr_test, cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, medianrecall_threshold)
        r_cm = pd.DataFrame(cm)
        proc._pickle(medianrecall_threshold, 'medianrecall_threshold.pkl')
        # Make a pd.DataFrame of the stats for display
        recall_opt_stats = pd.DataFrame(
            [[format(medianrecall_threshold, '.2f'),
              format(recall, '.2f'),
              format(fpr, '.2f'),
              format(precision, '.2f')]],
            columns=['Suggested Threshold',
                     'True Positive Rate (Recall)',
                     'False Positive Rate (Fall-out)',
                     'Precision'])
        # Repeat the threshold selection process for precision optimization
        p_precision, p_recall, p_median_precision, threshold_precision = \
            rf.set_threshold_precision(rffit, X_train, X_test, y_train, y_test)
        p_precision, p_recall, p_fpr, p_fpr_test, p_tpr_test, p_cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, threshold_precision)
        p_cm = pd.DataFrame(p_cm)
        precision_opt_stats = pd.DataFrame(
            [[format(threshold_precision, '.2f'),
              format(p_recall, '.2f'),
              format(p_fpr, '.2f'),
              format(p_precision, '.2f')]],
            columns=['Suggested Threshold',
                     'True Positive Rate (Recall)',
                     'False Positive Rate (Fall-out)',
                     'Precision'])
        # Produce a ROC plot
        test_prob = rffit.predict_proba(X_test)
        roc.plot_roc(X_train.values, y_train.values, y_test, test_prob, 'Test',
                     RandomForestClassifier, max_depth=10, max_features=30,
                     min_samples_leaf=2, min_samples_split=2)
        feature_description = rf.plot_features(
            features, rffit, 'Identifier', n=10)
        # Option for oversampled training data:
        # roc.plot_roc(X_train_over, y_train_over, y_test, test_prob, 'Test',
        #              RandomForestClassifier, max_depth=10, max_features=30,
        #              min_samples_leaf=2, min_samples_split=2)
        # roc.simple_roc(y_test, test_prob, 'ROC_RFC')
        # Do not truncate cell contents in the rendered HTML tables
        # (None replaces the deprecated -1 sentinel)
        pd.set_option('display.max_colwidth', None)
        return render_template(
            "rock.html",
            data_recall_opt=recall_opt_stats.to_html(
                index=False, classes="data_recall_opt"),
            data_precision_opt=precision_opt_stats.to_html(
                index=False, classes="data_precision_opt"),
            rocname='Test',
            f_descrip=feature_description.to_html(
                index=False, classes="f_descrip"),
            recall_cm=r_cm.to_html(classes="cm"),
            precision_cm=p_cm.to_html(classes="cm"))
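
# Sketch of the allowed_file helper referenced in uploaded_file above; the
# real implementation lives elsewhere in this app. This follows the standard
# Flask upload pattern, and ALLOWED_EXTENSIONS = {'csv'} is an assumption.
ALLOWED_EXTENSIONS = {'csv'}

def allowed_file(filename):
    """Accept only filenames whose extension is in ALLOWED_EXTENSIONS."""
    return ('.' in filename and
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS)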