def predict(self):
    correct_labels = []
    predicted_labels = []
    # Perform MAP classification
    with open(DATA_DIR + self.test_file) as f:
        for line in f:
            doc = line.split()
            correct_labels.append(self.classes[doc[0]])
            map_classifier = np.zeros(self.num_classes)
            # Calculate the decision function on the test doc's features for each class
            for model_class in range(self.num_classes):
                # Start from the log prior P(class)
                map_classifier[model_class] = math.log(
                    self.doc_counts[model_class] / np.sum(self.doc_counts))
                if self.runmode == 'multinomial':
                    # Tokens are 'word:count'; add the log likelihood of each known word
                    for word in doc[1:]:
                        word = word.split(':')
                        if word[0] in self.model:
                            map_classifier[model_class] += math.log(self.model[word[0]][model_class])
                elif self.runmode == 'bernoulli':
                    # Every vocabulary word contributes: present -> log p, absent -> log (1 - p)
                    words = {word.split(':')[0] for word in doc[1:]}
                    for word in self.model:
                        if word in words:
                            map_classifier[model_class] += math.log(self.model[word][model_class])
                        else:
                            map_classifier[model_class] += math.log(1. - self.model[word][model_class])
            # Get best classified class
            predicted_labels.append(np.argmax(map_classifier))
    # Get total accuracy
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)
    accuracy = calc_accuracy(correct_labels, predicted_labels)
    logger.info('NB model is {0:.2f}% accurate on the {1} data with k = {2}.'
                .format(accuracy, self.runmode, self.k))
    # Get confusion matrix with class accuracies
    cm = confusion_matrix(correct_labels, predicted_labels, self.num_classes)
    class_accuracies = [cm[n][n] for n in range(self.num_classes)]
    for n, x in enumerate(class_accuracies):
        logger.info('Class {0} has an accuracy of {1:.2f}%'.format(self.class_names[n], 100 * x))
    # Plot confusion matrix
    plt.figure(figsize=(30, 30))
    plt.imshow(cm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
    plt.title('Confusion Matrix')
    plt.xticks(np.arange(self.num_classes), self.class_names, fontsize=8)
    plt.yticks(np.arange(self.num_classes), self.class_names, fontsize=10)
    plt.xlabel('Predictions')
    plt.ylabel('Truths')
    plt.colorbar()
    plt.show()
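# The method above relies on two helpers, calc_accuracy and confusion_matrix,
# whose definitions are not shown in this section. Below is a minimal sketch
# consistent with how they are called (names and signatures come from the call
# sites; the bodies are assumptions).

def calc_accuracy(truths, predictions):
    # Percentage of predictions that match the ground-truth labels
    return 100.0 * np.mean(truths == predictions)

def confusion_matrix(truths, predictions, num_classes):
    # cm[i][j] = fraction of class-i examples predicted as class j, so the
    # diagonal holds per-class accuracies, as the calling code expects
    cm = np.zeros((num_classes, num_classes))
    for t, p in zip(truths, predictions):
        cm[t][p] += 1
    row_sums = cm.sum(axis=1, keepdims=True)
    return cm / np.maximum(row_sums, 1)  # guard against empty rows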
def predict(self, info=True):
    test_label_path = DATA_DIR + '/testlabels'
    test_images_path = DATA_DIR + '/testimages'
    correct_labels = []
    with open(test_label_path) as f:
        for line in f:
            correct_labels.append(int(line))
    num_images = len(correct_labels)
    test_images = [[None for _ in range(self.row)] for _ in range(num_images)]
    with open(test_images_path) as f:
        for n in range(num_images):
            for y in range(self.row):
                test_images[n][y] = list(f.readline().rstrip('\n'))
    predicted_labels = []
    for n in range(num_images):
        # Binarize the ASCII image: '+' and '#' are foreground pixels
        model = np.zeros((self.row, self.col))
        for y in range(self.row):
            for x in range(self.col):
                if test_images[n][y][x] in ['+', '#']:
                    model[y][x] = 1
        decision = self.train_decision(model)
        predicted_labels.append(decision)
    truths = np.array(correct_labels)
    predictions = np.array(predicted_labels)
    accuracy = calc_accuracy(truths, predictions)
    logger.info('NB model is {0:.2f}% accurate on the digit data'.format(accuracy))
    if info:
        confm = confusion_matrix(truths, predictions, self.num_classes)
        class_accuracies = [confm[n][n] for n in range(self.num_classes)]
        # Class accuracies
        for n, x in enumerate(class_accuracies):
            logger.info('Class {0} has an accuracy of {1:.2f}%'.format(n, 100 * x))
        # Confusion matrix
        plt.figure()
        plt.imshow(confm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
        plt.title('Confusion Matrix')
        plt.xticks(np.arange(self.num_classes))
        plt.yticks(np.arange(self.num_classes))
        plt.xlabel('Predictions')
        plt.ylabel('Truths')
        # Plot each class's weight vector as a 3D surface
        X, Y = np.meshgrid(range(self.col), range(self.row))
        Y = Y[::-1]  # flip so row 0 is drawn at the top of the surface
        for i in range(self.num_classes):
            hf = plt.figure()
            ha = hf.add_subplot(projection='3d')
            ha.plot_surface(X, Y, self.feature_weight_vectors[i], rstride=1, cstride=1,
                            linewidth=0, cmap=cm.coolwarm, antialiased=False)
            ha.set_xlabel('X')
            ha.set_ylabel('Y')
            ha.set_zlabel('weight')
        plt.show()
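# Hypothetical sketch of the train_decision method called above. Its real body
# is defined elsewhere in the repo; this guesses a linear decision rule
# consistent with the per-class weight surfaces (self.feature_weight_vectors)
# that predict() plots. Treat it as an illustration, not the actual method.
def train_decision(self, features):
    # Score each class by the dot product of its weight map with the
    # binarized image, and return the highest-scoring class
    scores = [np.sum(self.feature_weight_vectors[c] * features)
              for c in range(self.num_classes)]
    return int(np.argmax(scores))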
def predict(self, info=True):
    if self.runmode == 'digits':
        test_label_path = DATA_DIR + '/testlabels'
        test_images_path = DATA_DIR + '/testimages'
    elif self.runmode == 'faces':
        test_label_path = DATA_DIR + '/facedatatestlabels'
        test_images_path = DATA_DIR + '/facedatatest'
    correct_labels = []
    with open(test_label_path) as f:
        for line in f:
            correct_labels.append(int(line))
    num_images = len(correct_labels)
    # Using a Python list instead of np since np chararrays replace spaces with the empty string
    test_images = [[None for _ in range(self.row)] for _ in range(num_images)]
    with open(test_images_path) as f:
        for n in range(num_images):
            for y in range(self.row):
                test_images[n][y] = list(f.readline().rstrip('\n'))
    predicted_labels = []
    for n in range(num_images):
        map_classifier = np.zeros(self.num_classes)
        for num in range(self.num_classes):
            # Start from the log prior, then add per-pixel log likelihoods
            map_classifier[num] = math.log(self.num_counts[num] / np.sum(self.num_counts))
            for y in range(self.row):
                for x in range(self.col):
                    if test_images[n][y][x] == ' ':
                        map_classifier[num] += math.log(self.model[num][y][x][0])
                    if self.num_features == 3:
                        # Ternary features: background, '+' (edge), '#' (fill)
                        if test_images[n][y][x] == '+':
                            map_classifier[num] += math.log(self.model[num][y][x][1])
                        elif test_images[n][y][x] == '#':
                            map_classifier[num] += math.log(self.model[num][y][x][2])
                    else:
                        # Binary features: '+' and '#' both count as foreground
                        if test_images[n][y][x] in ['+', '#']:
                            map_classifier[num] += math.log(self.model[num][y][x][1])
        predicted_label = np.argmax(map_classifier)
        # Keep the posterior and test index so the best/worst examples can be found later
        predicted_labels.append((predicted_label, map_classifier[predicted_label], n))
    truths = np.array(correct_labels)
    predictions = np.array([x[0] for x in predicted_labels])
    accuracy = calc_accuracy(truths, predictions)
    logger.info('NB model is {0:.2f}% accurate on the {1} data with k = {2}.'
                .format(accuracy, self.runmode, self.k))
    if info:
        cm = confusion_matrix(truths, predictions, self.num_classes)
        class_accuracies = [cm[n][n] for n in range(self.num_classes)]
        # Class accuracies
        for n, x in enumerate(class_accuracies):
            logger.info('Class {0} has an accuracy of {1:.2f}%'.format(n, 100 * x))
        # Confusion matrix
        plt.figure()
        plt.imshow(cm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
        plt.title('Confusion Matrix')
        plt.xticks(np.arange(self.num_classes))
        plt.yticks(np.arange(self.num_classes))
        plt.xlabel('Predictions')
        plt.ylabel('Truths')
        # Test images with the highest and lowest posterior probability.
        # Sorts from lowest to highest by class, then by posterior probability.
        sorted_predictions = sorted(predicted_labels)
        class_indices = []
        for x in range(len(sorted_predictions)):
            # Record where each class's run begins (x == 0 compares against the last element)
            if sorted_predictions[x][0] != sorted_predictions[x - 1][0]:
                class_indices.append(x)
        for x in range(len(class_indices)):
            curr_class = sorted_predictions[class_indices[x]][0]
            lowest_idx = sorted_predictions[class_indices[x]][2]
            try:
                highest_idx = sorted_predictions[class_indices[x + 1] - 1][2]
            except IndexError:
                highest_idx = sorted_predictions[len(sorted_predictions) - 1][2]
            best_test_image = [[0 if x in ['#', '+'] else 1 for x in y]
                               for y in test_images[highest_idx]]
            worst_test_image = [[0 if x in ['#', '+'] else 1 for x in y]
                                for y in test_images[lowest_idx]]
            plt.figure()
            plt.suptitle('Class {0}'.format(curr_class))
            plt.subplot(1, 2, 1)
            plt.imshow(best_test_image, cmap=plt.get_cmap('Greys_r'))
            plt.title('Highest')
            plt.xticks([])
            plt.yticks([])
            plt.subplot(1, 2, 2)
            plt.title('Lowest')
            plt.xticks([])
            plt.yticks([])
            plt.imshow(worst_test_image, cmap=plt.get_cmap('Greys_r'))
        # Odds ratio for the four worst class pairs
        cm_ravel = np.ravel(cm)
        least_accurate_pairs = cm_ravel.argsort()[:4]
        least_accurate_pairs = [(x % self.num_classes, math.floor(x / self.num_classes))
                                for x in least_accurate_pairs]
        if self.num_features == 2 and self.runmode == 'digits':
            for i, j in least_accurate_pairs:
                # Arrays are indexed [y][x], so the shape is (row, col)
                log_likelihood_one = np.zeros((self.row, self.col))
                log_likelihood_two = np.zeros((self.row, self.col))
                odds_ratio = np.zeros((self.row, self.col))
                for y in range(self.row):
                    for x in range(self.col):
                        log_likelihood_one[y][x] = math.log(self.model[i][y][x][1])
                        log_likelihood_two[y][x] = math.log(self.model[j][y][x][1])
                        odds_ratio[y][x] = math.log(self.model[i][y][x][1] / self.model[j][y][x][1])
                plt.figure()
                plt.subplot(1, 3, 1)
                plt.imshow(log_likelihood_one, interpolation='nearest')
                plt.title('Likelihood of {0}'.format(i))
                plt.xticks([])
                plt.yticks([])
                cbar = plt.colorbar(shrink=0.35)
                cbar.set_ticks(np.arange(np.amin(log_likelihood_one), np.amax(log_likelihood_one),
                                         step=2, dtype=np.int8))
                for t in cbar.ax.get_yticklabels():
                    t.set_horizontalalignment('right')
                    t.set_x(4)
                plt.subplot(1, 3, 2)
                plt.imshow(log_likelihood_two, interpolation='nearest')
                plt.title('Likelihood of {0}'.format(j))
                plt.xticks([])
                plt.yticks([])
                cbar = plt.colorbar(shrink=0.35)
                cbar.set_ticks(np.arange(np.amin(log_likelihood_two), np.amax(log_likelihood_two),
                                         step=2, dtype=np.int8))
                for t in cbar.ax.get_yticklabels():
                    t.set_horizontalalignment('right')
                    t.set_x(4)
                plt.subplot(1, 3, 3)
                plt.imshow(odds_ratio, interpolation='nearest')
                plt.title('Odds ratio')
                plt.xticks([])
                plt.yticks([])
                cbar = plt.colorbar(shrink=0.35)
                cbar.set_ticks(np.arange(np.amin(odds_ratio), np.amax(odds_ratio),
                                         step=2, dtype=np.int8))
                for t in cbar.ax.get_yticklabels():
                    t.set_horizontalalignment('right')
                    t.set_x(4)
        plt.show()
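# The methods in this section assume module-level setup along these lines
# (a sketch: the actual DATA_DIR value and logger configuration live elsewhere
# in the repo, and 'data' below is a placeholder, not the real path).
import logging
import math

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm  # colormaps for the 3D weight surfaces
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection on old Matplotlib)

DATA_DIR = 'data'  # placeholder; point this at the dataset directory
logger = logging.getLogger(__name__)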