# Scale the input values with both scalers and look at the distribution of
# the scaled features.
hist_columns = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave_points_mean', 'radius_se', 'texture_se', 'perimeter_se',
    'area_se', 'smoothness_se', 'compactness_se',
]

# Each scaler is fitted once; the same result feeds the array and the frame.
scaledXMM = scalerMM.fit_transform(x)
scaledXDFMM[x.columns] = scaledXMM
scaledXDFMM.hist(column=hist_columns)

scaledXS = scalerS.fit_transform(x)
scaledXDFS[x.columns] = scaledXS
scaledXDFS.hist(column=hist_columns)

# Split into train and test sets.
# NOTE(review): the split uses the unscaled `x`; the scaled data above is only
# used for the histograms -- confirm this is intended.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)

# Create the multi-layer perceptron model and train it.
model = MLPClassifier(alpha=1, max_iter=1000)
model.fit(xtrain, ytrain)

# Score the model on both splits.
print('Train: ', model.score(xtrain, ytrain))
print('Test: ', model.score(xtest, ytest))

# Predict on the test split and print the classification report.
ytestpred = model.predict(xtest)
print('Classification report: \n', classification_report(ytest, ytestpred))

class_names = [0, 1]
disp = plot_confusion_matrix(model, xtest, ytest,
                             display_labels=class_names,
                             cmap=plt.cm.Blues)
# The returned display exposes its axes as `ax_`; it has no `ax` attribute.
disp.ax_.set_title("Confusion matrix , without normalization")
plt.show()
confusion_matrix(ytest, ytestpred)
# Record the mean and variance of the cross-validation errors.
CV_err_arr = np.append(CV_err_arr, np.mean(err_arr))
CV_var_arr = np.append(CV_var_arr, np.var(err_arr))

print(np.round(CV_err_arr, 2))
print(np.round(np.sqrt(CV_var_arr), 2))

# apply model to test data using hyperparameter k=1 (which was found to be the
# best; this is probably because images are "far" away from each other in
# space and thus there's no noise to be reduced by increasing k):

# create and train the kNN Classifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_nontest, d_nontest.ravel())

# plot the confusion matrix (row-normalized)
matrix = plot_confusion_matrix(knn, X_test, d_test,
                               cmap=plt.cm.Blues,
                               normalize='true')
plt.title('Confusion matrix for OvR classifier')
# plt.show() takes no figure/display argument; a single call shows the plot.
plt.show()

# test model on the test data and print the error rate in percent
d_hat = knn.predict(X_test)
err = 100 * (1 - metrics.accuracy_score(d_test.ravel(), d_hat))
print(np.round(err, 2))
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Ground truth: 194 positive labels followed by 194 negative labels.
y_true = np.array([1] * 194 + [0] * 194)

# Threshold the probabilities at 0.5 to obtain hard 0/1 predictions.
x_pred = probabilities > 0.5
x_pred = x_pred.astype(int)
print (y_test.shape)
print (x_pred.shape)

# Column vectors: one row per sample.
x_pred = x_pred.reshape(-1, 1)
y_test = y_true.reshape(-1, 1)
print (y_test.shape)
print (x_pred.shape)

# plot_confusion_matrix needs an estimator, so a linear SVC is fitted on the
# thresholded predictions. The plotted matrix therefore compares that SVC's
# output with y_test. (A previous SVC(random_state=0) instance was overwritten
# before use and has been dropped.)
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(x_pred, y_test.ravel())
plot_confusion_matrix(clf, x_pred, y_test, normalize='all')
plt2.show()

# F-score, precision and recall of the thresholded predictions. Only default
# keyword values were passed before, so they are omitted here.
fScore = f1_score(y_test, x_pred)
precison = precision_score(y_test, x_pred)
recall = recall_score(y_test, x_pred)
print("fScore = :",fScore)
print("Precison = :",precison)
print("Recall = :",recall)
#metrics.accuracy_score(y_true,y_pred)
#metrics.multilabel_confusion_matrix(y_true,y_pred)
y_test = test_df["Class"].values
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
# The context manager closes the file handle once the model is loaded.
with open('svm_color_classifier_poly.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

result = loaded_model.score(X_test, y_test)
print('X_test index 0 is {}'.format(X_test[0]))
print('result is {}'.format(result))
# pred = loaded_model.predict(np.array([80945, 115532, 228628, 284049, 246331, 234232, 193999, 149803, 176310]).reshape(1, -1))
# # ([3, 0, 0, 0, 1, 0, 0, 2, 0])
# print('pred is {}'.format(pred))

# Plot the confusion matrix twice: raw counts, then row-normalized.
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(loaded_model, X_test, y_test,
                                 display_labels=labels,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

# svm_color_classifier_poly.pkl Running to save the best model for bin = 24 score=62.5%
# svm_color_classifier_sigmoid.pkl Running to save the best model for bin = 7 score = 40%
# svm_color_classifier_rbf.pkl Running to save the best model for bin = 24 score = 50%
# svm_color_classifier_poly_gamma01.pkl[{'bins': 14, 'score': 0.55}] Running to save the best model for bin = 14 g=0.1
# C=100, gamma = 1, poly score = 50% Running to save the best model for bin = 9
# followed from https://levelup.gitconnected.com/scikit-learn-python-6-useful-tricks-for-data-scientists-1a0a502a6aa3
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Synthetic two-class problem with four features.
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    n_classes=2,
    random_state=123,
)
y.shape
X.shape

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

clf = LogisticRegression()
clf.fit(X_train, y_train)

confmat = plot_confusion_matrix(clf, X_test, y_test, cmap="Blues")
plt.show()

# True positive : predicted 1 (positive), correct (true)
# True negative : predicted 0 (negative), correct (true)
# False positive: predicted 1 (positive), wrong (false); actually negative
# False negative: predicted 0 (negative), wrong (false); actually positive
#                 => the important case in a hospital setting
def test_error_on_invalid_option(pyplot, fitted_clf, data):
    """An unsupported ``normalize`` value must raise a ``ValueError``."""
    features, target = data
    # Braces are escaped because ``match`` treats the message as a regex.
    expected = r"normalize must be one of \{'true', 'pred', 'all', None\}"

    with pytest.raises(ValueError, match=expected):
        plot_confusion_matrix(fitted_clf, features, target,
                              normalize="invalid")
def prediction():
    """Render the prediction page for the logged-in user.

    Visitors without a ``'user'`` entry in the session are redirected to the
    login view. On a POST request the ``pred_btn`` form field selects what is
    rendered: ``'conf_matrix'`` embeds the confusion-matrix plot as a
    base64-encoded PNG, ``'pred_table'`` renders a table comparing the true
    targets with the model predictions. Any other request renders the plain
    prediction page. If anything fails while preparing the page, a warning is
    flashed and the empty prediction page is rendered.
    """
    # Manage the user connection
    if 'user' not in session:
        return redirect(url_for('login'))

    # idToken expires after 1 hour, so we refresh the token to avoid stale token.
    session['user'] = auth.refresh(session['user']['refreshToken'])

    try:
        filename = session['filename']
        y_predict = best_model.predict(X_test)

        if request.method == 'POST':
            # .get() lets a POST without the button fall through to the plain
            # page instead of being reported as a missing dataframe.
            action = request.form.get('pred_btn')

            if action == 'conf_matrix':
                # Confusion Matrix Plot
                cm_plot = plot_confusion_matrix(best_model, X_test, y_test,
                                                display_labels=y,
                                                cmap=plt.cm.Blues)
                # Save as a PNG held in memory, then release the figure so
                # repeated requests do not accumulate open figures.
                cm_buff = io.BytesIO()
                cm_plot.figure_.savefig(cm_buff, format='png')
                plt.close(cm_plot.figure_)
                cm = base64.b64encode(cm_buff.getvalue()).decode('utf-8')

                return render_template('prediction.html',
                                       df_name=filename,
                                       best_model=best_model_name,
                                       cm_plot=cm)

            if action == 'pred_table':
                # Prediction table comparing the true target with the
                # prediction (the two columns used to be swapped).
                df_predictions = pd.DataFrame({
                    "target": y_test,
                    "prediction": y_predict
                })
                return render_template(
                    'prediction.html',
                    df_name=filename,
                    best_model=best_model_name,
                    df_prediction=[df_predictions.to_html(classes='data')])

        return render_template('prediction.html',
                               df_name=filename,
                               best_model=best_model_name)
    except Exception:
        # Best-effort page: any failure (e.g. no dataset in the session yet)
        # falls back to the empty page with a hint for the user.
        flash(
            'There is no dataframe uploaded. PLease visit DATASET page first',
            'warning')
        return render_template('prediction.html')
def _report_classifier(title, clf, X_test, y_test, pred):
    """Print accuracy, kappa and classification report; show the confusion matrix."""
    print(title)
    print("The accuracy of the model on test set is: %4.2f " %
          accuracy_score(y_test, pred))
    print("The Kapa of your model is: ",
          round(cohen_kappa_score(y_test, pred), 3))

    plot_confusion_matrix(clf, X_test, y_test)
    plt.show()

    print(classification_report(y_test, pred))


def _save_model(clf, path):
    """Pickle the fitted estimator *clf* to *path*."""
    with open(path, 'wb') as file:
        pickle.dump(clf, file)


def generate_models(X_train, X_test, y_train, y_test, show_graphs=True):
    """Fit three classifiers, optionally report on them, and pickle each one.

    A logistic regression, a support vector classifier and a random forest
    are fitted on the training data and used to predict the test data. The
    fitted models are written to ``lin_reg.pkl``, ``svc.pkl`` and
    ``forest.pkl`` in the current working directory.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param show_graphs: when true, print the metrics and show the confusion
        matrix of every model
    :return: tuple ``(reg_pred, svc_pred, RanFor_pred)`` with the test-set
        predictions of the three models
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    # Logistic regression
    reg = LogisticRegression(penalty='none', tol=0.5, random_state=1)
    reg.fit(X_train, y_train)
    reg_pred = reg.predict(X_test)
    if show_graphs:
        _report_classifier("Logistic Regression", reg, X_test, y_test,
                           reg_pred)
    # File name kept as-is so existing loaders keep working.
    _save_model(reg, 'lin_reg.pkl')

    # Support vector classifier
    svc = SVC(random_state=1, probability=True)
    svc.fit(X_train, y_train)
    # Predict with the SVC itself; previously `reg` was used here and for the
    # printed SVC metrics.
    svc_pred = svc.predict(X_test)
    if show_graphs:
        _report_classifier("Support Vector Classifier", svc, X_test, y_test,
                           svc_pred)
    _save_model(svc, 'svc.pkl')

    # Random forest
    RanFor = RandomForestClassifier(max_depth=25,
                                    n_estimators=1200,
                                    min_samples_split=2,
                                    min_samples_leaf=1)
    RanFor.fit(X_train, y_train)
    RanFor_pred = RanFor.predict(X_test)
    if show_graphs:
        _report_classifier("Random Forest", RanFor, X_test, y_test,
                           RanFor_pred)
    _save_model(RanFor, 'forest.pkl')

    return reg_pred, svc_pred, RanFor_pred
# Build a numeric pipeline: median imputation followed by standardization.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

# Build a categorical pipeline: most-frequent imputation followed by one-hot
# encoding. It is defined once; an identical earlier definition was redundant.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Build a column transformer routing each feature group to its pipeline.
col_transformer = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features))

# Build a main pipeline
lr_pipe = make_pipeline(
    col_transformer,
    LogisticRegression())

# Fit your pipeline on the training set
lr_pipe.fit(X_train, y_train)

# Plot your confusion matrix on your test set
plot_confusion_matrix(lr_pipe, X_test, y_test, cmap="PuRd");
# Most-frequent-class baseline, fitted on the training split.
baseline_pipeline = make_pipeline(StandardScaler(),
                                  DummyClassifier(strategy="most_frequent"))
baseline_pipeline.fit(x_train, np.array(y_train).ravel())

# ROC curve of the baseline, from the second predict_proba column.
fpr, tpr, _ = roc_curve(y_test,
                        baseline_pipeline.predict_proba(x_test)[:, 1])
roc_auc = auc(fpr, tpr)
pyplot.plot(fpr, tpr, color="red", label='Baseline AUC = %0.8f' % roc_auc)
pyplot.legend(loc='lower right')
pyplot.show()

best_pipeline = logistic_pipeline

# make confusion matrix for logistic regression model
plot_confusion_matrix(best_pipeline,
                      test_x_input_features,
                      test_y_output_data)
pyplot.title("Logistic Regression")
pyplot.show()

# Refit the baseline on the input/output data before evaluating it.
baseline_pipeline.fit(x_input_features, y_output_data)

# make confusion matrix for most_frequent model
plot_confusion_matrix(baseline_pipeline,
                      test_x_input_features,
                      test_y_output_data)
pyplot.title("Most Frequent Baseline")
pyplot.show()

baseline_accuracy = accuracy_score(
    y_true=test_y_output_data,
    y_pred=baseline_pipeline.predict(test_x_input_features),
)
print(f"Baseline Accuracy: {baseline_accuracy}")
# In[20]:

# kNN classifier: fit one model per n in 1..20 and print its test accuracy.
# After the loop `knn` and `y_pred` belong to the last model (n = 20).
for n in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print('KNeighborsClassifier: n = {} , Accuracy is: {}'.format(
        n, knn.score(X_test, y_test)))

# In[21]:

# Confusion matrix of the kNN model
plot_confusion_matrix(
    knn,
    X_test,
    y_test,
    display_labels=['Edible', 'Poisonous'],
    cmap="summer",
    normalize=None,
)
plt.title('Confusion Matrix KNN')
plt.show()

# In[22]:

# Accuracy computed from the stored predictions
print('Confusion matrix Accuracy is: {}'.format(
    metrics.accuracy_score(y_test, y_pred)))

# In[23]:

# Classification report of the kNN model
KNN_REPORT = classification_report(y_test, knn.predict(X_test))
# Prints the mean of `scores`, draws a bar chart of `clf.feature_importances_`
# with the nine feature names below as x tick labels, and -- when `Confusion`
# is truthy -- plots the confusion matrix of `clf` on (X_test, y_test) both
# without normalization and normalized with 'true', saving to 'graph.png'.
# NOTE(review): this snippet's line breaks and indentation were lost, so the
# extent of the `if Confusion:` and `for` bodies (in particular whether
# savefig/show run inside the loop) must be confirmed against the original.
print("\nAccuracy:", np.mean(scores)) plt.figure() plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_) plt.title('Feature importance, sums to 1') labels = [ 'Median right pupil diameter', 'RCPD right', 'Median left pupil diameter', 'RCPD left', 'median Light Intensity', 'RCLI', 'PERCLOS', 'Median gaze x coordinate', 'Median gaze y coordinate' ] plt.xticks(range(len(RFmatrix_X.columns)), labels) plt.xticks(fontsize=8, rotation=90) if Confusion: # Plot non-normalized confusion matrix plt.figure() titles_options = [("Confusion matrix, without normalization", None), ("Normalized confusion matrix", 'true')] for title, normalize in titles_options: disp = plot_confusion_matrix( clf, X_test, y_test, display_labels=['Baseline', '1-back', '2-back'], cmap=plt.cm.Blues, normalize=normalize) plt.xticks(rotation=90) disp.ax_.set_title(title) plt.savefig('graph.png', dpi=300, bbox_inches='tight') plt.show()
print(f"Labels size: {len(labels)}")

# Split data into training and testing
train_x, test_x, train_y, test_y = train_test_split(data, labels,
                                                    test_size=0.2,
                                                    shuffle=True)

# Train the model
nb = Classifier.fit(train_x, train_y)

# Export the vectorizer and the model for use in other programs.
export(vec, nb)

# Display a % prediction for the first 10 test samples. The loop starts at 0;
# the previous range(1, 10) skipped the first sample and showed only nine.
for index in range(10):
    res = nb.predict_proba(test_x[index])
    # Second predict_proba column, as a whole percentage.
    res = int(res[0][1] * 100)
    print(f"{res}%")

# Display the accuracy of the model
print(f"Accuracy is: {nb.score(test_x,test_y)*100}%")

# Predict the test set again so it can be used in the confusion matrix.
#%%
pred_y = nb.predict(test_x)
plot_confusion_matrix(nb, test_x, test_y)
plt.show()
def test_model(final_search, X_test_df, out_folder, file_h, scaler, threshold,
               display_labels):
    """Evaluate a fitted binary classifier on a held-out frame and log results.

    Columns 5 onward of ``X_test_df`` are used as features and its
    ``true_label`` column as ground truth. The function writes
    ``pred_result.csv``, ``roc_curve.pdf``, ``precision_recall_curve.pdf``,
    ``default_confusion_matrix.pdf`` and ``rescored_confusion_matrix.pdf``
    into ``out_folder`` and appends a textual report through ``write_line``.

    :param final_search: fitted estimator providing ``predict``,
        ``predict_proba``, ``predict_log_proba`` and ``decision_function``
    :param X_test_df: frame whose first five columns are carried over to the
        prediction table unchanged
    :param out_folder: directory receiving the CSV and PDF outputs
    :param file_h: report handle threaded through ``write_line``
    :param scaler: unused by this function; kept for interface compatibility
    :param threshold: passed to ``rescore`` together with the positive-class
        probabilities to obtain the rescored labels
    :param display_labels: class names for the confusion matrices and the
        classification report
    :return: the report handle returned by the last ``write_line`` call
    """
    separator = '-----------------------------------------------------'

    # Opens the figure that the ROC curve below is drawn on.
    fig, ax = plt.subplots()

    X_test = X_test_df.iloc[:, 5:].values
    y_test = X_test_df.true_label.values

    # Every prediction flavour the estimator offers, plus the rescored labels.
    y_pred = final_search.predict_proba(X_test)[:, 1]
    y_pred_default = final_search.predict(X_test)
    y_pred_decision = final_search.decision_function(X_test)
    y_pred_fixed = rescore(y_pred, threshold)
    y_pred_log = final_search.predict_log_proba(X_test)[:, 1]

    average_precision = average_precision_score(y_test, y_pred)
    ap_line = 'Average precision-recall score: {0:0.2f}'.format(
        average_precision)
    print(ap_line)
    file_h = write_line(file_h, ap_line)

    log_fpr, log_tpr, log_threshold = roc_curve(y_test, y_pred)

    # Per-sample prediction table: metadata columns plus one column per
    # prediction flavour, all sharing the index of the input frame.
    pred_result = X_test_df.iloc[:, :5]
    idx = pred_result.index
    pd.concat([
        pred_result,
        pd.Series(y_pred, index=idx, name="prediction_proba"),
        pd.Series(y_pred_log, index=idx, name="prediction_log_proba"),
        pd.Series(y_pred_default, index=idx, name="predicted_label"),
        pd.Series(y_pred_fixed, index=idx, name="predicted_label_rescored"),
        pd.Series(y_pred_decision, index=idx, name="decision_function"),
    ], axis=1).to_csv(os.path.join(out_folder, "pred_result.csv"))

    file_h = write_line(file_h, separator)
    file_h = write_line(file_h, 'Prediction Result...')

    log_roc_auc = auc(log_fpr, log_tpr)
    file_h = write_line(file_h, separator)
    file_h = write_line(file_h, 'Test Results.......')
    file_h = write_line(file_h,
                        'Log_Thres\tLog_TPR\tLog_FPR\tLog_TPR-Log_FPR')
    # One row per ROC threshold: threshold, TPR, FPR and their difference.
    for thres, tpr, fpr in zip(log_threshold, log_tpr, log_fpr):
        print(thres, tpr, fpr, tpr - fpr)
        file_h = write_line(
            file_h,
            "%0.5f\t%0.5f\t%0.5f\t%0.5f" % (thres, tpr, fpr, tpr - fpr))

    # ROC curve with the chance diagonal.
    plt.plot(log_fpr,
             log_tpr,
             color='orangered',
             linestyle='--',
             label='ROC curve (area = %0.3f)' % log_roc_auc,
             lw=3)
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k')
    plt.xlim([-0.03, 1.03])
    plt.ylim([-0.03, 1.03])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Performance on validation set (%s)' % y_test.shape[0])
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(out_folder, "roc_curve.pdf"))
    plt.close()

    # Precision-recall curve.
    disp = plot_precision_recall_curve(final_search, X_test, y_test)
    disp.ax_.set_title('2-class Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))
    plt.savefig(os.path.join(out_folder, "precision_recall_curve.pdf"))
    plt.close()

    # Confusion matrix of the rescored labels; unpacking four values requires
    # a 2x2 matrix, i.e. a binary problem.
    cm = confusion_matrix(y_test, y_pred_fixed)
    tn, fp, fn, tp = cm.ravel()
    print("tn\tfp\tfn\ttp")
    print(tn, "\t", fp, "\t", fn, "\t", tp)
    file_h = write_line(file_h, "---------Confusion_matrix-------")
    file_h = write_line(file_h, "tn\tfp\tfn\ttp")
    file_h = write_line(
        file_h,
        str(tn) + "\t" + str(fp) + "\t" + str(fn) + "\t" + str(tp))
    file_h = write_line(file_h, "----------------")

    # Confusion matrix of the estimator's own predict() output.
    disp = plot_confusion_matrix(final_search,
                                 X_test,
                                 y_test,
                                 display_labels=display_labels,
                                 cmap=plt.cm.Blues)
    disp.ax_.set_title("Confusion Matrix")
    plt.savefig(os.path.join(out_folder, "default_confusion_matrix.pdf"))
    plt.close()

    # Confusion matrix of the threshold-rescored labels.
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=display_labels)
    disp.plot(cmap=plt.cm.Reds)
    disp.ax_.set_title("Confusion Matrix")
    plt.savefig(os.path.join(out_folder, "rescored_confusion_matrix.pdf"))
    plt.close()

    # The report is computed once and both printed and logged.
    report = classification_report(y_test,
                                   y_pred_fixed,
                                   target_names=display_labels)
    print(report)
    file_h = write_line(file_h, "----Precision Recall F1-score Support------")
    file_h = write_line(file_h, report)
    file_h = write_line(file_h, "----------------")
    return file_h
# # Gradient Boosting algorithms

# # 1. XGBoost

# In[29]:

from xgboost import XGBClassifier

# `min_samples_leaf` is a scikit-learn tree parameter, not an XGBoost one, so
# it is no longer passed.
# NOTE(review): XGBoost's closest equivalent is `min_child_weight`; add it
# explicitly if leaf-size regularization is wanted.
xgb = XGBClassifier(n_jobs=-1,
                    random_state=42,
                    n_estimators=120,
                    max_depth=5)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

from sklearn.metrics import plot_confusion_matrix, accuracy_score

plot_confusion_matrix(xgb, X_test, y_test, cmap=plt.cm.Blues)
print(confusion_matrix(y_test, y_pred))

# In[36]:

print("Train accuracy",xgb.score(X_train, y_train))
print("Test accuracy",xgb.score(X_test, y_test))

# # 2. LightGBM

# In[ ]:
#cm = np.around(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], decimals=2) # Use white text if squares are dark; otherwise black. threshold = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): color = "white" if cm[i, j] > threshold else "black" plt.text(j, i, cm[i, j], horizontalalignment="center", color=color) plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') plt.show() return figure plot_confusion_matrix(cm, CLASS_NAMES) def plot_image(i, predictions_array, true_label, img): predictions_array, true_label, img = predictions_array, true_label[i], img[ i] plt.grid(False) plt.xticks([]) plt.yticks([]) plt.imshow(img, cmap=plt.cm.binary) predicted_label = np.argmax(predictions_array) if predicted_label == true_label: color = 'blue' else:
def test_plot_confusion_matrix_deprecation_warning(pyplot, fitted_clf, data):
    """Calling ``plot_confusion_matrix`` must emit a ``FutureWarning``."""
    positional_args = (fitted_clf, *data)
    with pytest.warns(FutureWarning):
        plot_confusion_matrix(*positional_args)
def _read_reviews(csv_path):
    """Read a review CSV, print its head, then drop incomplete rows in place."""
    frame = pd.read_csv(csv_path,
                        header=None,
                        skiprows=[0],
                        names=["text", "Sentiment"],
                        dtype=type_dict)
    print(frame.head())
    frame.dropna(inplace=True)
    return frame


classifier = LinearSVC()
classifier.fit(train_text, train_df["Sentiment"])

# test data
test_df = _read_reviews("reviews_test.csv")
# eval data
eval_df = _read_reviews("reviews_eval.csv")

# Vectorize both sets with the already-fitted vectorizer.
print("transforming test")
test_text = vectorizer.transform(test_df["text"].values)
print("transforming eval")
eval_text = vectorizer.transform(eval_df["text"].values)

# Mean accuracy on each set.
print("scoring test")
print(classifier.score(test_text, test_df["Sentiment"]))
print("scoring eval")
print(classifier.score(eval_text, eval_df["Sentiment"]))

# Row-normalized confusion matrix and ROC curve on the eval set.
eval_labels = eval_df["Sentiment"]
plot_confusion_matrix(classifier, eval_text, eval_labels, normalize="true")
plot_roc_curve(classifier, eval_text, eval_labels)
plt.show()
# Accuracy and macro-averaged recall (UAR) of the SVM on both splits.
train_acc = svm_clf.score(X_train, y_train)
test_acc = svm_clf.score(X_test, y_test)
train_uar = recall_score(y_train, pred_train, average='macro')
test_uar = recall_score(y_test, pred_dev, average='macro')

print(f"train_acc = {train_acc:.2f}, test_acc = {test_acc:.2f}")
print(f"train_uar = {train_uar:.2f}, test_uar = {test_uar:.3f}")
"""
train_acc = 0.91, test_acc = 0.51
train_uar = 0.91, test_uar = 0.51
"""

# Confusion matrix of the SVM on the test split, saved as a PNG.
plt.figure()
disp = plot_confusion_matrix(
    svm_clf,
    X_test,
    y_test,
    display_labels=encoder.classes_,
    cmap=plt.cm.Blues,
)
disp.ax_.set_title('Confusion matrix')
plt.savefig('confusion_mat_test_SVM.png', dpi=300)

# %% Fit MLP with best hyperparameters
device = torch.device("cuda:0")

# Convert the arrays to tensors before training the MLP.
X_train = torch.Tensor(X_train)
y_train = torch.from_numpy(y_train)
X_test = torch.Tensor(X_test)
y_test = torch.from_numpy(y_test)

model, train_uar, test_uar = train_model(
    X_train,
    y_train,
    X_test,
    y_test,
    l2_lambda=0.001,
    lr=0.001,
)
# Hold-out split: rows from 38000 on; column 0 is the digit label.
x_test = train_data[38000:, 1:]
y_test_digit = train_data[38000:, 0]

rfc.fit(x_train, y_train_digit)
predicted = rfc.predict(x_test)
print("Accuracy: ", accuracy_score(y_test_digit, predicted))

''' Error matrices '''
np.set_printoptions(precision=2)

# (title, normalize) pairs: raw counts first, then row-normalized.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", 'true'),
]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(
        rfc,
        x_test,
        y_test_digit,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)
plt.show()

''' Kaggle results '''
# Retrain on every training row and predict the Kaggle test set.
test_data = pd.read_csv("digit-recognizer/test.csv").to_numpy()
x_test = test_data[0:, 0:]
x_train = train_data[0:, 1:]
y_train_digit = train_data[0:, 0]

start = time.time()
rfc.fit(x_train, y_train_digit)
predicted = rfc.predict(x_test)
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html
# Flatten every image into one feature vector per sample.
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Split the data into train and test sets. The first half is the training
# data, the second half the test data.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    digits.target,
    test_size=0.5,
    shuffle=False,
)

classifier = svm.SVC()
classifier.fit(X_train, y_train)

# Predict the values of the second half (the test data).
predicted = classifier.predict(X_test)

# Show the tested images next to the classifier's prediction.
fig2, axes2 = plt.subplots(1, 8)
second_half_images = digits.images[n_samples // 2:]
images_and_predictions = list(zip(second_half_images, predicted))

# Plot the first 8 images that were predicted by the classifier.
for ax, (image, prediction) in zip(axes2, images_and_predictions[:8]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Prediction: %i' % prediction)

# Print the result of the classifier.
report = metrics.classification_report(y_test, predicted)
print("Classification report for classifier %s:\n%s\n" % (classifier, report))

disp = metrics.plot_confusion_matrix(classifier, X_test, y_test)
disp.figure_.suptitle("Confusion Matrix")
print("Confusion matrix:\n%s" % disp.confusion_matrix)

plt.show()
# Fit the logistic regression model with the training data
logreg.fit(x_train, y_train)

# Testing
# predicted probabilities
y_predicted = logreg.predict_proba(x_test)
# predicted class labels
k_predicted = logreg.predict(x_test)

# Display 10 selected images from the test set, as gray-scale images, each
# with a different class label.
import matplotlib.pyplot as plt

for i in range(10):
    for j, label in enumerate(y_test):
        # Compare by value: `is` tests object identity and does not match an
        # array element against a Python int.
        if label == i:
            plt.imshow(x_testo[j, :, :], cmap='gray', vmin=0, vmax=255)
            plt.show()
            break  # only the first image of each class is shown

# Give the recognition accuracy rate for the whole test set, and show the
# confusion matrix.
lenset = len(y_test)
numCorrect = sum(
    1 for truth, guess in zip(y_test, k_predicted) if truth == guess)
print("Accuracy rate: ", (numCorrect/lenset))

from sklearn.metrics import plot_confusion_matrix

plot_confusion_matrix(logreg, x_test, y_test, values_format = '.5g')
plt.show()
X_train, y_train = vectorize_data(train_data, train_labels, channels_to_select) # Modell-Schätzung lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True) model = lda.fit(X_train[:,:2], y_train) # Vorhersage der Klassen-Zuordnung aufgrund der Trainings-Daten und dem Modell y_pred = model.predict(X_train[:,:2]) # Visualisierung der richtig und falsch klassifizierten Epochen und der # linearen Trennlinie (Funktion abgeleitet von https://scikit-learn.org/stable/auto_examples/classification/plot_lda_qda.html#sphx-glr-auto-examples-classification-plot-lda-qda-py) plot_model_fit(model, X_train[:,:2], y_train, y_pred) plt.title('LDA Trainings-Daten') # Confusion Matrix anzeigen conf_mat = plot_confusion_matrix(model, X_train[:,:2], y_train, cmap=plt.cm.Blues) #, normalize = 'all') plt.title('Performance Trainings-Daten ') # Scores berechnen def print_scores(y_true, y_pred): TP = sum(np.logical_and(y_true == 1, y_pred == 1)) # True Positives FP = # False Positives TN = # True Negatives FN = # False Negatives sens = # Sensitivität spec = # Spezifizität prec = # Precision acc = # Accuracy print('Sensitivität:', sens) print('Spezifizität:',spec) print('Precision:', prec)
DecisionTreeClassifier(criterion='entropy')

# Size of the fitted tree.
print(dtree.get_n_leaves())
print(dtree.get_depth())

#%%
# Evaluate model performance
test_features = test.loc[:, pred_vars]
test_target = test['Class']

pred_labels = dtree.predict(test_features)
pred_labels[0:4]

# Confusion matrix
metrics.plot_confusion_matrix(dtree, test_features, test_target)

# Classification report
print(metrics.classification_report(test_target, pred_labels, digits=5))

# Probabilistic evaluation
pred_probs = dtree.predict_proba(test_features)
pred_probs[0:5, :]

#%%
# Area under the ROC curve, from the second predict_proba column.
metrics.roc_auc_score(test_target, pred_probs[:, 1])
plt.xlabel('False Positive Rate')
#plt.savefig(AUC-ROC.png')

# Precision-Recall Curve
# NOTE(review): the score uses `y_test_` while the plots use `y_test`;
# confirm both names are intended.
average_precision = average_precision_score(y_test_, y_score)
ap_text = '{0:0.2f}'.format(average_precision)
print('Average precision-recall score: ' + ap_text)

disp = plot_precision_recall_curve(ClassifierSVM, X_test, y_test)
disp.ax_.set_title('2-class Precision-Recall curve: ' 'AP=' + ap_text)
plt.savefig('../DeNovo/ClassifiersFiles/AUC-PR.png')

# Row-normalized confusion matrix of the SVM.
matrix = plot_confusion_matrix(
    ClassifierSVM,
    X_test,
    y_test,
    cmap=plt.cm.Blues,
    normalize='true',
)
plt.title('Confusion matrix for RBF SVM')
#plt.savefig('CM_RBF_SVM.png')

## Path to the trained classifier for later use.
print(
    "\n\n#############################################################################################################"
)
print('\nPaths to models:\n\n')
print(f"Classifier = joblib.load('{filename_svm}')")
print(f"Vectorizer = joblib.load('{filename_vec}')")
print(f"Variance Treshold = joblib.load('{filename_variance}')")
print(f"Percentile Best Features= joblib.load('{filename_percentile}')")
print('\n\n\n')
def confusion(modelo, nombre,
              out_dir="/home/bleon/Documents/TESIS_FILES/Codigos/DATOS/ALL_STARS_ALL"):
    """Plot and save the row-normalized confusion matrix of *modelo*.

    The matrix is computed on the module-level ``X_test`` / ``y_test`` and
    saved as ``conf_matrix_All_<nombre>_2.jpg`` inside ``out_dir``.

    :param modelo: fitted classifier passed to ``plot_confusion_matrix``
    :param nombre: model name used in the plot title and the file name
    :param out_dir: directory receiving the image; defaults to the location
        that used to be hard-coded, so existing callers are unaffected
    """
    import os

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title(nombre + " confusion matrix")
    plot_confusion_matrix(modelo, X_test, y_test,
                          normalize='true', ax=ax, cmap="Reds")
    # Save through the figure object so the right figure is written even if
    # another one has become current. The figure is left open so a later
    # plt.show() by the caller still displays it.
    fig.savefig(os.path.join(out_dir, "conf_matrix_All_" + nombre + "_2.jpg"))
# Train the network and keep the accuracy reported by the helper.
network, acc = train_and_evaluate(
    input_data_train,
    input_data_test,
    labels_train,
    labels_test,
)

""" ## Wat is een confusion matrix? (zoek op) ## Hoe ziet een confusion matrix eruit wanneer de predicties 100% accuraat zijn? We gebruiken de `plot_confusion_matrix` functie uit [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html) om de confusion matrix te plotten """

from sklearn.metrics import plot_confusion_matrix

# Confusion matrix on the test data with whole-number cell values.
plot_confusion_matrix(
    network,
    input_data_test,
    labels_test,
    display_labels=['wit', 'rood'],
    cmap=plt.cm.Blues,
    values_format = '.0f',
)

""" ## Class imbalance Neurale netwerken werken het best wanneer de input data 'gebalanceerd' is. Dit betekend dat er per klasse evenveel data beschikbaar is. Bijvoorbeeld 500 witte en 500 rode wijnen. Je gaat nu experimenteren wat er gebeurt als we de data ongebalanceerd aanleveren. Nu koppelen we de functies die je hierboven hebt gemaakt aan elkaar. De enige input is nu nog: * Het totaal aantal rode wijnen * De ratio rode tot witte wijnen (0.8 zou dan betekenen dat 80% van de wijnen rood zijn) Deze functie maakt dan een dataset aan zoals hierboven, daarna wordt het genormaliseerd, gesplitst, getrained en geëvalueerd eveneens zoals je hierboven hebt gedaan. Ook wordt de accuracy en confusion matrix geplot """
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, accuracy_score

# Last column is the target, everything before it the features.
dataset = pd.read_csv("data.csv")
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
y = y.reshape(-1, 1)

# Standardize the features.
# NOTE(review): the scaler is fitted on all rows before the split, so test
# rows influence the scaling statistics -- confirm this is acceptable.
x_scaler = StandardScaler()
X = x_scaler.fit_transform(X)

x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)

classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train.flatten())

# Scale the query with the already-fitted scaler. Re-fitting on this single
# sample would standardize it to all zeros.
print(classifier.predict(x_scaler.transform([[30, 87000]])))

y_hat = classifier.predict(x_test)

# For a binary problem ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
print(f"tn: {tn}, fn: {fn}, fp: {fp}, tp: {tp}")

accuracy = accuracy_score(y_test, y_hat)
print("accuracy = ", accuracy)

plot_confusion_matrix(classifier, x_test, y_test)
plt.show()
x.head()
y = df_no_missing['hd'].copy()

# One-hot encode the categorical columns.
x['cp'].unique()
pd.get_dummies(x, columns=['cp']).head()
x_encoded = pd.get_dummies(x, columns=['cp', 'restecg', 'slope', 'thal'])
x_encoded.head()

# Collapse every positive value of the target into a single class 1.
y.unique()
y[y > 0] = 1
y.unique()

# Split the one-hot encoded features. Previously the un-encoded `x` was
# split, so `x_encoded` was never used by the model.
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y,
                                                    random_state=42)
x_test_scaled = scale(x_test)
x_train_scaled = scale(x_train)

# Baseline SVM with default hyper-parameters.
clf_svm = SVC(random_state=42)
clf_svm.fit(x_train_scaled, y_train)
plot_confusion_matrix(clf_svm, x_test_scaled, y_test,
                      display_labels=["No", "Has HD"])
#plt.show()

# Grid search over C and gamma for the RBF kernel.
param_grid = [
    {
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    },
]
optimal_params = GridSearchCV(SVC(), param_grid, cv=5, verbose=0)
optimal_params.fit(x_train_scaled, y_train)
#plt.show()
optimal_params.best_params_

# Refit with fixed hyper-parameters.
# NOTE(review): C=10, gamma=0.001 are hard-coded instead of being read from
# optimal_params.best_params_ -- confirm they still match the search result.
clf_svm = SVC(random_state=42, C=10, gamma=0.001)
clf_svm.fit(x_train_scaled, y_train)
def process_classifier(self, model, df: pd.DataFrame, features: list, label: str, pred_col: str,
                       verbose: bool = False, conf_matrix: bool = False, test_recs: float = 0.25,
                       split_mode: str = 'pctg', metric_round_dig: int = 2, **params):
    """
    Run a full cycle of split-train-predict for any classifier model that
    respects the sklearn interface.

    :param model: Classifier class (not an instance); instantiated as ``model(**params)``
    :param df: Data frame holding the feature columns and the label column
    :param features: Names of the feature columns
    :param label: Name of the label column
    :param pred_col: Name of the column that receives the predictions in the returned frame
    :param verbose: True for printing the train metrics to console
    :param conf_matrix: True to plot the confusion matrix of the predictions
    :param test_recs: Proportion of records used for prediction; must lie strictly between 0 and 1
    :param split_mode: Not implemented; currently ignored
    :param metric_round_dig: Number of digits the train metrics are rounded to
        (the prediction metrics are always rounded to 3 digits)
    :param params: ML model hyper-parameters
    :return: Tuple ``(prediction frame including pred_col, dict of metrics)`` where the dict has the keys
        ``acc_train``, ``prec_train``, ``rec_train``, ``acc_pred``, ``prec_pred`` and ``rec_pred``
    """
    # VALIDATE AND PROCESS PARAMETERS
    assert (test_recs > 0) and (test_recs < 1), 'test_recs need to be in the range )0,1('

    # PREPARE DATA
    _df_train, _df_pred = DataSetSplit.sep_predict_percentage(df, test_recs)
    _X_t = _df_train[features]
    _y_t = _df_train[label]

    # TRAIN
    _clf = model(**params)
    _clf.fit(_X_t, _y_t)

    # calculate train metrics
    _train_pred_col = "y_hat_train"
    _df_train[_train_pred_col] = _clf.predict(_X_t)
    _acc_train = accuracy_score(_df_train[label], _df_train[_train_pred_col]).round(metric_round_dig)
    _prec_train = precision_score(_df_train[label], _df_train[_train_pred_col]).round(metric_round_dig)
    _rec_train = recall_score(_df_train[label], _df_train[_train_pred_col]).round(metric_round_dig)

    if verbose:
        print('')
        print('Train accuracy ', _acc_train)
        print('Train precision ', _prec_train)
        print('Train recall ', _rec_train)

    # PREDICT
    _X_p = _df_pred[features]
    _y_true = _df_pred[label]
    _df_pred[pred_col] = _clf.predict(_X_p)
    _acc = accuracy_score(_df_pred[label], _df_pred[pred_col]).round(3)
    _prec = precision_score(_df_pred[label], _df_pred[pred_col]).round(3)
    _rec = recall_score(_df_pred[label], _df_pred[pred_col]).round(3)

    if conf_matrix:
        plot_confusion_matrix(_clf, _X_p, _y_true, normalize='pred', cmap='plasma')
        # Call grid(); assigning `plt.grid = False` would replace the pyplot
        # function with a bool and break every later plt.grid(...) call.
        plt.grid(False)
        plt.show()

    _metrics = {
        'acc_train': _acc_train,
        'prec_train': _prec_train,
        'rec_train': _rec_train,
        'acc_pred': _acc,
        'prec_pred': _prec,
        'rec_pred': _rec,
    }

    return _df_pred, _metrics