def k_nearest_neighbors(json_data):
    """Train and report a 5-nearest-neighbours classifier on ``json_data``.

    The data is loaded into a DataFrame; the ``'type'`` column is the target
    and every other column is a feature. 5% of the rows are held out for
    testing (``random_state=0``). Features are standardised with a scaler
    fitted on the training split only.

    Prints a classification report and calls ``plot_confusion_matrix`` for
    the held-out rows. Returns nothing.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts that yields a
            ``'type'`` column.
    """
    frame = pd.DataFrame(data=json_data)
    target = frame['type']
    features = frame.drop('type', axis=1)

    split = train_test_split(features, target, test_size=0.05, random_state=0)
    train_features, test_features, train_target, test_target = split

    # The scaler only ever sees the training rows, then is applied to both splits.
    scaler = StandardScaler()
    scaler.fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(train_features, train_target)
    predicted = model.predict(test_features)

    print('K-nearest neighbors classifier:')
    print(classification_report(test_target, predicted))
    plot_confusion_matrix(
        y_true=test_target,
        y_pred=predicted,
        classes=target.unique(),
        title='K-nearest neighbors classifier',
    )
def linear_svc_classifier(json_data):
    """Train and report a ``LinearSVC`` classifier on ``json_data``.

    The ``'type'`` column is the target, all remaining columns are features,
    and 5% of the rows are held out for testing (``random_state=0``).

    Prints a classification report and calls ``plot_confusion_matrix`` for
    the held-out rows. Returns nothing.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts that yields a
            ``'type'`` column.
    """
    frame = pd.DataFrame(data=json_data)
    target = frame['type']
    features = frame.drop('type', axis=1)

    split = train_test_split(features, target, test_size=0.05, random_state=0)
    train_features, test_features, train_target, test_target = split

    # Full hyper-parameter set, spelled out explicitly.
    svc_settings = dict(
        C=1.0,
        class_weight=None,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        loss='squared_hinge',
        max_iter=2500,
        multi_class='ovr',
        penalty='l2',
        random_state=0,
        tol=1e-05,
        verbose=0,
    )
    model = LinearSVC(**svc_settings)
    model.fit(train_features, train_target)
    predicted = model.predict(test_features)

    print('Linear SVC classifier:')
    print(classification_report(test_target, predicted))
    plot_confusion_matrix(
        y_true=test_target,
        y_pred=predicted,
        classes=target.unique(),
        title='Linear SVM classifier',
    )
def plot_conf_matrix(cnf_matrix):
    """Show ``cnf_matrix`` in two figures: raw counts, then normalized.

    Sets NumPy's print precision to 2, draws each variant in its own figure
    through ``h.plot_confusion_matrix`` (class names come from the
    module-level ``output_class``), and finally calls ``plt.show()``.

    Args:
        cnf_matrix: Confusion matrix forwarded to ``h.plot_confusion_matrix``.
    """
    np.set_printoptions(precision=2)

    # First figure uses the helper's default normalisation, second forces it on.
    variants = (
        {'title': 'Confusion matrix, without normalization'},
        {'normalize': True, 'title': 'Normalized Confusion matrix'},
    )
    for extra_kwargs in variants:
        plt.figure()
        h.plot_confusion_matrix(cnf_matrix, classes=output_class, **extra_kwargs)

    plt.show()
def create_model(X, y, test_size=0.25, classifier_obj=GaussianNB, show_plot=False, save_model=False, labels=None):
    """Train a classifier on a train/test split and report its performance.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Target values passed to ``train_test_split``.
        test_size: Fraction (or count) of samples held out for testing.
        classifier_obj: Zero-argument callable returning an estimator with
            ``fit`` and ``predict``.
        show_plot: When true, plot the confusion matrix through
            ``helpers.plot_confusion_matrix``.
        save_model: When true, pickle the fitted model to
            ``model-<UTC timestamp>.sav`` in the current directory.
        labels: Class labels used to index the confusion matrix and label
            the plot. Defaults to ``[1, 2, 3, 4]``.

    Returns:
        The list ``[model, y_pred, cm, accuracy]``.
    """
    import pickle
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, accuracy_score

    class_labels = [1, 2, 3, 4] if labels is None else list(labels)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0)

    # Train the requested classifier on the training split.
    model = classifier_obj()
    model.fit(X_train, y_train)

    # Predict the held-out split and score it.
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    accuracy = accuracy_score(y_test, y_pred)

    if show_plot:
        # Imported lazily so the plotting helper is only needed when plotting.
        from helpers import plot_confusion_matrix
        plot_confusion_matrix(cm, classes=class_labels)

    if save_model:
        from datetime import datetime
        dt = datetime.utcnow().strftime('%Y-%m-%d_%H-%M-%S')
        filename = 'model-{}.sav'.format(dt)
        # Context manager guarantees the file is flushed and closed.
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

    print('====== RESULTS FOR MODEL ======')
    print('y_test:\t{}'.format(y_test))
    print('y_pred:\t{}'.format(y_pred))
    print('accuracy\t{}'.format(accuracy))
    print('confusion matrix:\n{}'.format(cm))
    print('====== END OF RESULTS =========')

    return [model, y_pred, cm, accuracy]
def sgd_classifier(json_data):
    """Train and report an ``SGDClassifier`` on ``json_data``.

    The ``'type'`` column is the target, all remaining columns are features,
    and 5% of the rows are held out for testing (``random_state=0``).

    Prints a classification report and calls ``plot_confusion_matrix`` for
    the held-out rows. Returns nothing.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts that yields a
            ``'type'`` column.
    """
    frame = pd.DataFrame(data=json_data)
    target = frame['type']
    features = frame.drop('type', axis=1)

    split = train_test_split(features, target, test_size=0.05, random_state=0)
    train_features, test_features, train_target, test_target = split

    # NOTE(review): newer scikit-learn releases renamed loss='log' to
    # 'log_loss' -- confirm against the version this project pins.
    sgd_settings = dict(
        alpha=0.0001,
        average=False,
        class_weight=None,
        early_stopping=False,
        epsilon=0.1,
        eta0=0.0,
        fit_intercept=True,
        l1_ratio=0.5,
        learning_rate='optimal',
        loss='log',
        max_iter=2500,
        n_iter_no_change=5,
        penalty='l2',
        power_t=0.9,
        random_state=None,
        shuffle=False,
        tol=0.00001,
        validation_fraction=0.1,
        verbose=0,
        warm_start=False,
    )
    model = SGDClassifier(**sgd_settings)
    model.fit(train_features, train_target)
    predicted = model.predict(test_features)

    print('SGD Classifier:')
    print(classification_report(test_target, predicted))
    plot_confusion_matrix(
        y_true=test_target,
        y_pred=predicted,
        classes=target.unique(),
        title='SGD Classifier',
    )
def decision_tree_classifier(json_data):
    """Train and report a default ``DecisionTreeClassifier`` on ``json_data``.

    The ``'type'`` column is the target, all remaining columns are features,
    and 5% of the rows are held out for testing (``random_state=0``).

    Prints a classification report and calls ``plot_confusion_matrix`` for
    the held-out rows. Returns nothing.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts that yields a
            ``'type'`` column.
    """
    frame = pd.DataFrame(data=json_data)
    target = frame['type']
    features = frame.drop('type', axis=1)

    split = train_test_split(features, target, test_size=0.05, random_state=0)
    train_features, test_features, train_target, test_target = split

    model = DecisionTreeClassifier()
    model.fit(train_features, train_target)
    predicted = model.predict(test_features)

    print('Decision tree classifier:')
    print(classification_report(test_target, predicted))
    plot_confusion_matrix(
        y_true=test_target,
        y_pred=predicted,
        classes=target.unique(),
        title='Decision tree classifier',
    )
def svm_classifier(json_data):
    """Train and report an ``SVC`` (``gamma='scale'``, one-vs-one) on ``json_data``.

    The ``'type'`` column is the target, all remaining columns are features,
    and 5% of the rows are held out for testing (``random_state=0``).

    Prints a classification report and calls ``plot_confusion_matrix`` for
    the held-out rows. Returns nothing.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts that yields a
            ``'type'`` column.
    """
    model = SVC(gamma='scale', decision_function_shape='ovo')

    frame = pd.DataFrame(data=json_data)
    target = frame['type']
    features = frame.drop('type', axis=1)

    split = train_test_split(features, target, test_size=0.05, random_state=0)
    train_features, test_features, train_target, test_target = split

    model.fit(train_features, train_target)
    predicted = model.predict(test_features)

    print('SVM classifier:')
    print(classification_report(test_target, predicted))
    plot_confusion_matrix(
        y_true=test_target,
        y_pred=predicted,
        classes=target.unique(),
        title='SVM classifier',
    )
def gaussian_naive_bayes(json_data):
    """Train and report a default ``GaussianNB`` classifier on ``json_data``.

    The ``'type'`` column is the target, all remaining columns are features,
    and 5% of the rows are held out for testing (``random_state=0``).

    Prints a classification report and calls ``plot_confusion_matrix`` for
    the held-out rows. Returns nothing.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts that yields a
            ``'type'`` column.
    """
    df = pd.DataFrame(data=json_data)
    x = df.drop('type', axis=1)
    y = df['type']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.05, random_state=0)

    # A stray ``GaussianNB(priors=None, var_smoothing=1e-09)`` expression used
    # to follow ``fit``; it built an estimator and discarded it, so it is gone.
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    print('Gaussian Naive Bayes Classifier:')
    print(classification_report(y_test, y_pred))
    plot_confusion_matrix(
        y_true=y_test,
        y_pred=y_pred,
        classes=y.unique(),
        title='Gaussian Naive Bayes classifier',
    )
# Print the dummy-variable classification report, then the mapping metrics.
print("Classification Report With Dummy (n = {0}):\n{1}".format(
    dummy_n, dummy_metrics['classification_report']))

print("Train Accuracy With Mapping (n = {0}):{1}".format(
    mapping_n, mapping_metrics['accuracy']['train']))
print("Test Accuracy With Mapping (n = {0}):{1}".format(
    mapping_n, mapping_metrics['accuracy']['test']))
print("Confusion Matrix With Mapping (n = {0}):\n{1}".format(
    mapping_n, mapping_metrics['confusion_matrix']))
print("Classification Report With Mapping (n = {0}):\n{1}".format(
    mapping_n, mapping_metrics['classification_report']))

# Both cost curves share one figure.
plt.figure()
for curve, curve_label in ((dummy_costs, 'dummy'), (mapping_costs, 'mapping')):
    plt.plot(curve, label=curve_label)
plt.legend()

# One figure per confusion matrix.
for matrix, heading in (
        (dummy_metrics['confusion_matrix'], "Confusion Matrix With Dummy"),
        (mapping_metrics['confusion_matrix'], "Confusion Matrix With Mapping")):
    plt.figure()
    h.plot_confusion_matrix(matrix, heading)

# Predict with whichever encoding scored the higher test accuracy; a tie
# falls through to the mapping branch.
use_dummy = dummy_metrics['accuracy']['test'] > mapping_metrics['accuracy']['test']
print("Predicting with dummy variables" if use_dummy else "Predicting with mapping")
test_csv = args[1] if len(args) > 1 else 'test.csv'
if use_dummy:
    _, df = h.clean_with_dummy(test_csv, dummy_params)
    df = df.drop(['Survived'], axis=1)
    df = h.predict_knn(df, dummy_knn)
else:
    _, df = h.clean_with_mapping(test_csv, mapping_params)
    df = h.predict_knn(df, mapping_knn)
# Plot train/test log loss per x_axis step, save it under img/, then show it.
fig, ax = plt.subplots()
ax.plot(x_axis, train_metrics['mlogloss'], label='Train')
ax.plot(x_axis, test_metrics['mlogloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.title('{} - Log Loss'.format(args.model))
plt.savefig("img/logloss_{}.png".format(uid))
plt.show()

# Plot train/test classification error the same way.
fig, ax = plt.subplots()
ax.plot(x_axis, train_metrics['merror'], label='Train')
ax.plot(x_axis, test_metrics['merror'], label='Test')
ax.legend()
plt.ylabel('Error')
plt.title('{} - Error'.format(args.model))
plt.savefig("img/error_{}.png".format(uid))
plt.show()

# Confusion matrix
plot_confusion_matrix(eval_predicted, eval_labels, args.model, uid)

# AUC ROC scores
compute_AUC_scores(eval_predicted_proba, eval_labels)

# Save Kaggle submission files
construct_kaggle_submissions(uid, results, results_proba)

# Persist the trained model; the context manager closes the file, which the
# previous bare open() never did.
with open("models/{}.mdl".format(uid), "wb") as model_file:
    pickle.dump(model, model_file)
# plot_model(model, to_file=MODEL_SAVE_PATH, show_shapes=True, rankdir='LR')
# ===========================================================================
# Training the networks
# ===========================================================================
# Alternative losses: 'binary_crossentropy' for 2 classes, or mean squared error.
model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=NUM_EPOCH,
          validation_data=(X_valid, y_valid))
# ====== evaluation ====== #
# ``predict`` replaces ``predict_proba``, which was removed from Keras models;
# it returns the same per-class output array.
y_test_pred_proba = model.predict(X_test, batch_size=BATCH_SIZE * 2)
# argmax over the last axis turns both targets and outputs into class indices.
y_test = np.argmax(y_test, axis=-1)
y_test_pred = np.argmax(y_test_pred_proba, axis=-1)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_test_pred))
print("F1 score:", f1_score(y_true=y_test, y_pred=y_test_pred,
                            average='micro', labels=digit.numbers))
print(classification_report(y_true=y_test, y_pred=y_test_pred,
                            labels=digit.numbers))
# ====== plot confusion matrix ====== #
plt.figure()
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred, labels=digit.numbers)
plot_confusion_matrix(cm=cm, labels=digit.numbers, colorbar=True, fontsize=6)
# ===========================================================================
# Cleaning
# ===========================================================================
plot_save(path=FIGURE_SAVE_PATH)
from sklearn.model_selection import cross_val_score

import helpers
from MlcLinReg import MlcLinReg

# Load the dataset through the project helper (argument 2 is passed through as-is).
X_train, y_train, X_test, y_test = helpers.load_delicious(2)

# Both steps use a fresh estimator built from the same hyper-parameters.
mlc_params = dict(learning_rate=0.5, iterations=2000, batch_size=1, l_one=0.1)

helpers.plot_confusion_matrix(MlcLinReg(**mlc_params),
                              X=X_train.toarray(),
                              y=y_train.toarray())

# print() with a single argument is valid on both Python 2 and Python 3;
# the former bare print statement was a syntax error on Python 3.
print(cross_val_score(estimator=MlcLinReg(**mlc_params),
                      X=X_train.toarray(),
                      y=y_train.toarray(),
                      cv=20).mean())
# Overall accuracy: count the predictions that match the ground truth.
total = y_pred.shape[0]
okays = sum(1 for i in range(total) if y_pred[i] == Y_test[i])
print("total acc: ", 100 * okays / total)
print("correct classifications: ", okays)
print("errors: ", total - okays)
#-----------------------------------------------------------------------------
# Plot the confusion matrix: rows are true classes, columns are predictions.
confusionMatrix = np.zeros((num_classes, num_classes), dtype='int')
# Iterate over every test sample; the previous upper bound of
# ``Y_test.shape[0] - 1`` silently dropped the last one.
for i in range(Y_test.shape[0]):
    confusionMatrix[int(Y_test[i])][y_pred[i]] += 1

# Class index -> display name mapping stored next to the script.
with open('dictLabels.p', "rb") as labels_file:
    dict_labels = pickle.load(labels_file)
names = [dict_labels[i] for i in range(num_classes)]

plot_confusion_matrix(confusionMatrix, normalize=False, target_names=names)

# Close the output file explicitly so the pickle is fully written to disk.
with open(conf_mat_patches_saved, "wb") as matrix_file:
    pickle.dump(confusionMatrix, matrix_file)
accuracy_scores = cross_val_score(model, train_data, train_labels,
                                  cv=kfold, scoring="accuracy")
# Each line prints the mean followed by the standard deviation.
print("Cross validation logloss scores: {:.5f} {:.5f}".format(
    np.mean(scores), np.std(scores)))
print("Cross validation accuracy scores: {:.5f} {:.5f}".format(
    np.mean(accuracy_scores), np.std(accuracy_scores)))

# The same tag names both the submission files and the saved model.
run_tag = ensemble['uid'] + "_Final=" + str(args.final)

# Save Kaggle submission files
results = model.predict(test_data)  # Predicts from 1-10
results_proba = model.predict_proba(test_data)
construct_kaggle_submissions(run_tag, results, results_proba)

# Persist the model; the context manager closes the file, which the previous
# bare open() never did.
with open("models/{}.mdl".format(run_tag), "wb") as model_file:
    pickle.dump(model, model_file)

if not args.final:
    # Confusion matrix
    eval_predicted = model.predict(eval_data)
    plot_confusion_matrix(eval_predicted, eval_labels,
                          ensemble['title'], ensemble['uid'])

    # AUC ROC scores
    eval_predicted_proba = model.predict_proba(eval_data)
    compute_AUC_scores(eval_predicted_proba, eval_labels)
# TEST MODEL ===========================================================================================================
all_predictions = model.predict(all_gen)
test_predictions = model.predict(test_gen)

# Map the model outputs back to the original label values.
all_node_predictions = target_encoding.inverse_transform(all_predictions.squeeze())
test_node_predictions = target_encoding.inverse_transform(test_predictions.squeeze())
# Decoded once and reused for both the results table and the confusion matrix.
test_true_labels = target_encoding.inverse_transform(test_targets.squeeze())

# NOTE(review): the output paths below use a hard-coded backslash separator;
# presumably this only ever runs on Windows -- confirm.
all_predictions_df = pd.DataFrame({"Predicted": all_node_predictions,
                                   "True": graph_labels,
                                   "Fraud IDs": fraud_ids},
                                  index=graph_labels.index)
all_predictions_df['is_correct'] = all_predictions_df['Predicted'] == all_predictions_df['True']
all_predictions_df.to_csv(cfg.STORAGE_ROOT_PATH + r'\results_all_gcn_node.csv', sep=';')
# 'is_correct' is already a boolean mask, so it can index the frame directly.
all_identified_cases = all_predictions_df[all_predictions_df['is_correct']]['Fraud IDs']
print('IDENTIFIED IN ALL: ', functools.reduce(aggregate_sets, all_identified_cases.dropna(), set()))

test_predictions_df = pd.DataFrame({"Predicted": test_node_predictions,
                                    "True": test_true_labels,
                                    "Fraud IDs": test_fraud_ids},
                                   index=test_subjects.index)
test_predictions_df['is_correct'] = test_predictions_df['Predicted'] == test_predictions_df['True']
test_predictions_df.to_csv(cfg.STORAGE_ROOT_PATH + r'\results_test_gcn_node.csv', sep=';')
test_identified_cases = test_predictions_df[test_predictions_df['is_correct']]['Fraud IDs']
# This line reports the test split; it previously reused the "ALL" label.
print('IDENTIFIED IN TEST: ', functools.reduce(aggregate_sets, test_identified_cases.dropna(), set()))

plot_confusion_matrix('Confusion Matrix - All Data',
                      all_node_predictions,
                      graph_labels.tolist(),
                      cfg.STORAGE_BASE_THESIS_IMG + r'\conf_matrix_all_gcn_node.pdf')
plot_confusion_matrix('Confusion Matrix - Test Data',
                      test_node_predictions,
                      test_true_labels,
                      cfg.STORAGE_BASE_THESIS_IMG + r'\conf_matrix_test_gcn_node.pdf')