def print_test_accuracy(show_example_errors=False, show_confusion_matrix=False):
    """Print the test-set classification accuracy, optionally with plots.

    Args:
        show_example_errors: When truthy, also plot examples of
            mis-classified test images.
        show_confusion_matrix: When truthy, also plot the confusion matrix
            of the test-set predictions.
    """
    # Predicted class for every test-set image, and whether each one is right.
    is_correct, predicted_cls = predict_cls_test()

    # Overall accuracy, count of correct classifications, and total count.
    accuracy, n_correct = classification_accuracy(is_correct)
    n_images = len(is_correct)

    summary = "Accuracy on Test-Set: {0:.1%} ({1} / {2})"
    print(summary.format(accuracy, n_correct, n_images))

    if show_example_errors:
        print("Example errors:")
        plot_functions.plot_example_errors(
            cls_pred=predicted_cls,
            correct=is_correct,
            db=data["test"],
        )

    if show_confusion_matrix:
        print("Confusion Matrix:")
        plot_functions.plot_confusion_matrix(
            cls_pred=predicted_cls,
            cls_true=data["test"]["cls"],
            num_classes=num_classes,
            class_names=class_names,
        )
def classify(X_train, X_test, y_train, y_test, vectorizer):
    """Fit a logistic regression on the training split and report test results.

    The fitted model predicts ``X_test``; the metrics from ``get_metrics`` and
    the confusion matrix are printed, and the confusion matrix and the
    features returned by ``get_relevant_features`` are plotted.

    Args:
        X_train: Training features passed to ``LogisticRegression.fit``.
        X_test: Test features passed to ``predict``.
        y_train: Training labels.
        y_test: Test labels the predictions are scored against.
        vectorizer: Forwarded to ``get_relevant_features`` together with the
            fitted model.
    """
    model = LogisticRegression(
        C=30.0,
        class_weight='balanced',
        solver='newton-cg',
        multi_class='multinomial',
        n_jobs=-1,
        random_state=40,
    )
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Score the predictions before any plotting happens.
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    top_scores, top_words, bottom_scores, bottom_words = get_relevant_features(
        vectorizer, model, n=10)

    plot_confusion_matrix(
        matrix,
        classes=['Irrelevant', 'Disaster', 'Unsure'],
        normalize=False,
        title='Confusion matrix',
    )
    plot_important_words(
        top_scores,
        top_words,
        bottom_scores,
        bottom_words,
        "Most important words for relevance",
    )

    print(matrix)
    report = "accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
    print(report % (accuracy, precision, recall, f1))
def decision_tree(self, data, labels, num_features, num_folds=5):
    """Grid-search a decision tree with stratified k-fold cross-validation.

    Every combination of ``criteria``, ``max_depths`` and
    ``min_samples_leaf`` found in ``self.param_grid`` is cross-validated.
    Whenever a combination's mean fold accuracy beats
    ``self.best_solution['accuracy']``, ``self.best_solution`` is overwritten
    with that combination, its mean accuracy and sensitivity, and its
    confusion matrix. Accuracy and sensitivity line charts are drawn per
    criterion, followed by the confusion matrix of the best solution.

    Args:
        data: Samples, forwarded to ``prep.select_single_probes`` and
            ``prep.separate_and_prepare_data``.
        labels: Labels matching ``data``, forwarded to the same helpers.
        num_features: Forwarded to ``prep.separate_and_prepare_data``.
        num_folds: Number of cross-validation folds. A truthy value is stored
            in ``self.param_grid['num_folds']``; a falsy value keeps whatever
            is already stored under that key.

    Returns:
        ``self.best_solution`` after the search.
    """
    # The layout has two columns; axes are indexed by criterion position below.
    _, axs_acc = plt.subplots(1, 2, figsize=(13, 4), squeeze=False)
    _, axs_sens = plt.subplots(1, 2, figsize=(13, 4), squeeze=False)

    if num_folds:
        self.param_grid['num_folds'] = num_folds
    # Read back the same key that was just written. The original looked up
    # 'fold', which silently ignored the num_folds argument.
    skf = StratifiedKFold(n_splits=self.param_grid.get('num_folds'))

    leaf_sizes = self.param_grid.get('min_samples_leaf')

    for criteria_index, criteria in enumerate(self.param_grid.get('criteria')):
        values_acc = {}
        values_sens = {}

        for depth in self.param_grid.get('max_depths'):
            accuracies_values = []
            sensitivities_values = []

            for num_samples in leaf_sizes:
                tree = DecisionTreeClassifier(min_samples_leaf=num_samples,
                                              max_depth=depth,
                                              criterion=criteria,
                                              min_impurity_decrease=0.005)
                (fold_accuracies, fold_sensitivities,
                 fold_predictions, fold) = evaluation.initialize_metrics(self)

                # there are four probes for every patient, it's reasonable
                # to take only one
                single_data, single_labels = prep.select_single_probes(
                    data, labels)

                for train_index, test_index in skf.split(single_data,
                                                         single_labels):
                    # NOTE(review): sibling methods call
                    # self.separate_and_prepare_data here - confirm that the
                    # prep.* helper is the intended one.
                    trn_x, tst_x, trn_y, tst_y = prep.separate_and_prepare_data(
                        data, labels, train_index, test_index, num_features)

                    # CLASSIFICATION
                    tree.fit(trn_x, trn_y)
                    prd_y = tree.predict(tst_x)

                    # EVALUATION
                    evaluation.append_solution_for_fold(
                        fold_accuracies, fold_sensitivities,
                        fold_predictions, fold, tst_y, prd_y)
                    fold = fold + 1

                mean_accuracy = statistics.mean(fold_accuracies)
                mean_sensitivity = statistics.mean(fold_sensitivities)

                if mean_accuracy > self.best_solution.get('accuracy'):
                    self.best_solution['criteria'] = criteria
                    self.best_solution['min_samples_leaf'] = num_samples
                    self.best_solution['max_depths'] = depth
                    self.best_solution['accuracy'] = mean_accuracy
                    self.best_solution['sensitivity'] = mean_sensitivity
                    TN, FP, FN, TP = evaluation.compute_confm_values(
                        fold_predictions)
                    self.best_solution['confusion_matrix'] = np.array(
                        ([TN, FP], [FN, TP]))

                accuracies_values.append(mean_accuracy)
                sensitivities_values.append(mean_sensitivity)

            values_acc[depth] = accuracies_values
            values_sens[depth] = sensitivities_values

        chart_title = 'Decision Trees with %s criteria' % criteria
        plot.multiple_line_chart(axs_acc[0, criteria_index], leaf_sizes,
                                 values_acc, chart_title, 'max_depths',
                                 'min_samples_leaf', 'accuracy',
                                 percentage=True)
        plot.multiple_line_chart(axs_sens[0, criteria_index], leaf_sizes,
                                 values_sens, chart_title, 'max_depths',
                                 'min_samples_leaf', 'sensitivity',
                                 percentage=True)

    plt.show()

    _, axs = plt.subplots(1, 1, figsize=(4, 4), squeeze=False)
    plot.plot_confusion_matrix(axs[0, 0],
                               self.best_solution.get('confusion_matrix'),
                               'Confusion matrix', [0, 1], True)
    plt.show()

    return self.best_solution
# Raw data: build the train/test kernel matrices and time the computation.
start = time.time()
K_train_rw = gk.fit_transform(G_train_rw)
K_test_rw = gk.transform(G_test_rw)
end = time.time()
print("", end=".")

# Fit an SVM directly on the precomputed kernel matrix.
clf = svm.SVC(kernel='precomputed')
clf.fit(K_train_rw, y_train_rw)
print("", end=". ")

# Predict on the test kernel and show the confusion matrix.
y_pred_rw = clf.predict(K_test_rw)
print("Confusion Matrix: \n", confusion_matrix(y_test_rw, y_pred_rw))
plot_confusion_matrix(
    y_test_rw,
    y_pred_rw,
    labels,
    title="Confusion Matrix Before Smoothing",
)

# Record "<kernel time> ~ <accuracy>%" and the bare accuracy value.
elapsed_rw = sec_to_time(round(end - start, 2))
accuracy_rw = accuracy_score(y_test_rw, y_pred_rw)
data_kernel_rw.append(
    elapsed_rw + " ~ " + str(round(accuracy_rw * 100, 2)) + "%"
)
rw_ac.append(accuracy_rw)
print("Raw: ", data_kernel_rw[-1])

# Smooth data: same kernel computation, timed again.
start = time.time()
K_train_sm = gk.fit_transform(G_train_sm)
K_test_sm = gk.transform(G_test_sm)
end = time.time()
print("", end=".")
def random_forest(self, data, labels, num_features, num_folds=5):
    """Grid-search a random forest with stratified k-fold cross-validation.

    Every combination of ``max_features``, ``max_depths`` and
    ``num_estimators`` found in ``self.param_grid`` is cross-validated.
    Whenever a combination's mean fold accuracy beats
    ``self.best_solution['accuracy']``, ``self.best_solution`` is overwritten
    with that combination, its mean accuracy and sensitivity, and its
    confusion matrix. Accuracy and sensitivity line charts are drawn per
    ``max_features`` value, followed by the confusion matrix of the best
    solution.

    Args:
        data: Samples, forwarded to ``prep.select_single_probes`` and
            ``self.separate_and_prepare_data``.
        labels: Labels matching ``data``, forwarded to the same helpers.
        num_features: Forwarded to ``self.separate_and_prepare_data``.
        num_folds: Number of cross-validation folds. A truthy value is stored
            in ``self.param_grid['num_folds']``; a falsy value keeps whatever
            is already stored under that key.

    Returns:
        ``self.best_solution`` after the search.
    """
    # The layout has two columns; axes are indexed by the position of the
    # max_features value below.
    _, axs_acc = plt.subplots(1, 2, figsize=(10, 4), squeeze=False)
    _, axs_sens = plt.subplots(1, 2, figsize=(10, 4), squeeze=False)

    if num_folds:
        self.param_grid['num_folds'] = num_folds
    skf = StratifiedKFold(n_splits=self.param_grid.get('num_folds'))

    estimator_counts = self.param_grid.get('num_estimators')

    for max_features_index, max_features in enumerate(
            self.param_grid.get('max_features')):
        values_acc = {}
        values_sens = {}

        for depth in self.param_grid.get('max_depths'):
            accuracies_values = []
            sensitivities_values = []

            for num_estimators in estimator_counts:
                (fold_accuracies, fold_sensitivities,
                 fold_predictions, fold) = evaluation.initialize_metrics(self)

                # there are four probes for every patient, it's reasonable
                # to take only one
                single_data, single_labels = prep.select_single_probes(
                    data, labels)

                for train_index, test_index in skf.split(single_data,
                                                         single_labels):
                    trn_x, tst_x, trn_y, tst_y = self.separate_and_prepare_data(
                        data, labels, train_index, test_index, num_features)

                    # CLASSIFICATION
                    rf = RandomForestClassifier(n_estimators=num_estimators,
                                                max_depth=depth,
                                                max_features=max_features)
                    rf.fit(trn_x, trn_y)
                    prd_y = rf.predict(tst_x)

                    # EVALUATION
                    evaluation.append_solution_for_fold(
                        fold_accuracies, fold_sensitivities,
                        fold_predictions, fold, tst_y, prd_y)
                    fold = fold + 1

                mean_accuracy = statistics.mean(fold_accuracies)
                mean_sensitivity = statistics.mean(fold_sensitivities)

                if mean_accuracy > self.best_solution.get('accuracy'):
                    # Store each hyper-parameter's value under its own key.
                    # The original stored the max_features index under
                    # 'num_estimators' and num_estimators under 'max_features'.
                    self.best_solution['num_estimators'] = num_estimators
                    self.best_solution['max_depths'] = depth
                    self.best_solution['max_features'] = max_features
                    self.best_solution['accuracy'] = mean_accuracy
                    self.best_solution['sensitivity'] = mean_sensitivity
                    TN, FP, FN, TP = evaluation.compute_confm_values(
                        fold_predictions)
                    self.best_solution['confusion_matrix'] = np.array(
                        ([TN, FP], [FN, TP]))

                accuracies_values.append(mean_accuracy)
                sensitivities_values.append(mean_sensitivity)

            values_acc[depth] = accuracies_values
            values_sens[depth] = sensitivities_values

        # Both charts draw on axes created up front, so no extra figure is
        # opened here (the original's plt.figure() left an empty one behind
        # on every iteration).
        chart_title = 'Random Forests with %s features' % max_features
        plot.multiple_line_chart(axs_acc[0, max_features_index],
                                 estimator_counts, values_acc, chart_title,
                                 'max_depths', 'nr estimators', 'accuracy',
                                 percentage=True)
        plot.multiple_line_chart(axs_sens[0, max_features_index],
                                 estimator_counts, values_sens, chart_title,
                                 'max_depths', 'nr estimators', 'sensitivity',
                                 percentage=True)

    plt.show()

    _, axs = plt.subplots(1, 1, figsize=(4, 4), squeeze=False)
    plot.plot_confusion_matrix(axs[0, 0],
                               self.best_solution.get('confusion_matrix'),
                               'Confusion matrix', [0, 1], True)
    plt.show()

    return self.best_solution
# NOTE(review): the next four statements look like the tail of a definition
# whose header lies before this chunk (``sorted_contributions`` and
# ``top_words`` are not bound here) - confirm their enclosing scope and
# indentation against the full file.
# Take the first 10 'supporters' values and the first 10 'detractors'
# (index labels and values) for the 'Relevant' entry, then plot them.
top_scores = sorted_contributions['Relevant']['supporters'][:10].tolist()
bottom_words = sorted_contributions['Relevant']['detractors'][:10].index.tolist()
bottom_scores = sorted_contributions['Relevant']['detractors'][:10].tolist()
plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words for relevance")

# Script entry point: embed the corpus, train a logistic regression on the
# embeddings, and report/plot its performance on a held-out split.
if __name__ == '__main__':
    # NOTE(review): unpickling can execute arbitrary code; only load
    # 'ready_data.pkl' from a trusted source.
    questions = pd.read_pickle('ready_data.pkl')
    list_corpus = questions['text'].tolist()
    list_labels = questions['class_label'].tolist()
    # Tokenize every text, then turn each token list into one embedding via
    # get_average_word2vec.
    tokenized_corpus = [[tokens for tokens in gensim.utils.tokenize(text)] for text in list_corpus]
    embeddings = [get_average_word2vec(tokens) for tokens in tokenized_corpus]
    # Hold out 20% of the samples for testing; the seed is fixed.
    X_train, X_test, y_train, y_test = train_test_split(embeddings, list_labels, test_size=0.2, random_state=40)
    plot_LSA(X_train, y_train)
    clf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial', n_jobs=-1, random_state=40)
    clf.fit(X_train, y_train)
    y_predicted_counts = clf.predict(X_test)
    accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)
    cm = confusion_matrix(y_test, y_predicted_counts)
    plot_confusion_matrix(cm, classes=['Irrelevant', 'Disaster', 'Unsure'], normalize=False, title='Confusion matrix')
    # Called without arguments - presumably it reads the module-level names
    # bound above (e.g. the fitted model and the data splits); verify before
    # renaming any of them.
    plot_important_words_with_lime()
    print(cm)
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
def knn(self, data, labels, num_features, num_folds=5):
    """Grid-search a k-nearest-neighbours classifier with stratified k-fold CV.

    Every combination of ``dist`` (distance metric) and ``n_neighbors`` found
    in ``self.param_grid`` is cross-validated. Whenever a combination's mean
    fold accuracy beats ``self.best_solution['accuracy']``,
    ``self.best_solution`` is overwritten with that combination, its mean
    accuracy and sensitivity, and its confusion matrix. One accuracy chart
    and one sensitivity chart are drawn, followed by the confusion matrix of
    the best solution.

    Args:
        data: Samples, forwarded to ``prep.select_single_probes`` and
            ``self.separate_and_prepare_data``.
        labels: Labels matching ``data``, forwarded to the same helpers.
        num_features: Forwarded to ``self.separate_and_prepare_data``.
        num_folds: Number of cross-validation folds. A truthy value is stored
            in ``self.param_grid['num_folds']``; a falsy value keeps whatever
            is already stored under that key.

    Returns:
        ``self.best_solution`` after the search.
    """
    _, acc_axes = plt.subplots(1, 1, figsize=(6, 4), squeeze=False)
    _, sens_axes = plt.subplots(1, 1, figsize=(6, 4), squeeze=False)

    if num_folds:
        self.param_grid['num_folds'] = num_folds
    splitter = StratifiedKFold(n_splits=self.param_grid.get('num_folds'))

    acc_by_metric = {}
    sens_by_metric = {}

    for dist in self.param_grid.get('dist'):
        acc_curve = []
        sens_curve = []

        for n_neigh in self.param_grid.get('n_neighbors'):
            (fold_accuracies, fold_sensitivities,
             fold_predictions, fold) = evaluation.initialize_metrics(self)

            # there are four probes for every patient, it's reasonable to
            # take only one
            single_data, single_labels = prep.select_single_probes(data, labels)

            for train_index, test_index in splitter.split(single_data,
                                                          single_labels):
                trn_x, tst_x, trn_y, tst_y = self.separate_and_prepare_data(
                    data, labels, train_index, test_index, num_features)

                # CLASSIFICATION
                model = KNeighborsClassifier(n_neighbors=n_neigh, metric=dist)
                model.fit(trn_x, trn_y)
                prd_y = model.predict(tst_x)

                # EVALUATION
                evaluation.append_solution_for_fold(
                    fold_accuracies, fold_sensitivities, fold_predictions,
                    fold, tst_y, prd_y)
                fold += 1

            mean_acc = statistics.mean(fold_accuracies)
            mean_sens = statistics.mean(fold_sensitivities)

            if mean_acc > self.best_solution.get('accuracy'):
                self.best_solution['dist'] = dist
                self.best_solution['n_neighbors'] = n_neigh
                self.best_solution['accuracy'] = mean_acc
                self.best_solution['sensitivity'] = mean_sens
                TN, FP, FN, TP = evaluation.compute_confm_values(
                    fold_predictions)
                self.best_solution['confusion_matrix'] = np.array(
                    ([TN, FP], [FN, TP]))

            # result for different number of neighbours
            acc_curve.append(mean_acc)
            sens_curve.append(mean_sens)

        # results for every distance with different num of neighbours
        acc_by_metric[dist] = acc_curve
        sens_by_metric[dist] = sens_curve

    plot.multiple_line_chart(
        acc_axes[0, 0], self.param_grid.get('n_neighbors'), acc_by_metric,
        'KNN for different number of neighbours', 'Distance metrics',
        'nr neighbours', 'accuracy', percentage=False)
    plot.multiple_line_chart(
        sens_axes[0, 0], self.param_grid.get('n_neighbors'), sens_by_metric,
        'KNN for different number of neighbours', 'Distance metrics',
        'nr neighbours', 'sensitivity', percentage=False)
    plt.show()

    _, cm_axes = plt.subplots(1, 1, figsize=(4, 4), squeeze=False)
    plot.plot_confusion_matrix(
        cm_axes[0, 0], self.best_solution.get('confusion_matrix'),
        'Confusion matrix', [0, 1], True)
    plt.show()

    return self.best_solution