    def predict(self):
        correct_labels = []
        predicted_labels = []

        # Perform MAP classification
        with open(DATA_DIR + self.test_file) as f:
            for line in f:
                doc = line.split()
                correct_labels.append(self.classes[doc[0]])
                map_classifier = np.zeros(self.num_classes)
                # Calculate decision function on test doc features for each class
                for model_class in range(self.num_classes):
                    # Start with the log prior: log P(class)
                    map_classifier[model_class] = math.log(
                        self.doc_counts[model_class] / np.sum(self.doc_counts))
                    if self.runmode == 'multinomial':
                        # Multinomial NB: each word contributes log P(word | class),
                        # weighted by its count in the document
                        for token in doc[1:]:
                            word, count = token.split(':')
                            if word in self.model:
                                map_classifier[model_class] += int(count) * math.log(self.model[word][model_class])
                    elif self.runmode == 'bernoulli':
                        # Bernoulli NB: every vocabulary word contributes, whether
                        # present (P(word | class)) or absent (1 - P(word | class))
                        words = {token.split(':')[0] for token in doc[1:]}
                        for word in self.model:
                            if word in words:
                                map_classifier[model_class] += math.log(self.model[word][model_class])
                            else:
                                map_classifier[model_class] += math.log(1. - self.model[word][model_class])
                # Pick the class with the highest posterior score
                predicted_labels.append(np.argmax(map_classifier))

        # Get total accuracy
        correct_labels = np.array(correct_labels)
        predicted_labels = np.array(predicted_labels)
        accuracy = calc_accuracy(correct_labels, predicted_labels)
        logger.info('NB model is {0:.2f}% accurate on the {1} data with k = {2}.'
                    .format(accuracy, self.runmode, self.k))

        # Get confusion matrix with class accuracies
        cm = confusion_matrix(correct_labels, predicted_labels, self.num_classes)
        class_accuracies = [cm[n][n] for n in range(self.num_classes)]
        for n, x in enumerate(class_accuracies):
            logger.info('Class {0} has an accuracy of {1:.2f}%'.format(self.class_names[n], 100 * x))

        # Plot confusion matrix
        plt.figure(figsize=(30, 30))
        plt.imshow(cm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
        plt.title('Confusion Matrix')
        plt.xticks(np.arange(self.num_classes), self.class_names, fontsize=8)
        plt.yticks(np.arange(self.num_classes), self.class_names, fontsize=10)
        plt.xlabel('Predictions')
        plt.ylabel('Truths')
        plt.colorbar()
        plt.show()
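
Both `calc_accuracy` and `confusion_matrix` are project helpers whose definitions are not shown in these snippets. A minimal sketch consistent with how they are called (accuracy logged on a 0-100 scale; `cm[n][n]` read directly as class n's accuracy) might look like the following; the originals may differ in detail.

import numpy as np

def calc_accuracy(truths, predictions):
    # Percentage of labels predicted correctly; the call sites above log the
    # value with a '%' format, so it is returned on a 0-100 scale
    return 100.0 * np.mean(truths == predictions)

def confusion_matrix(truths, predictions, num_classes):
    # cm[i][j] = fraction of class-i examples predicted as class j; rows are
    # normalized so the diagonal holds per-class accuracies, matching the
    # cm[n][n] reads above
    cm = np.zeros((num_classes, num_classes))
    for t, p in zip(truths, predictions):
        cm[t][p] += 1
    row_sums = cm.sum(axis=1, keepdims=True)
    return cm / np.maximum(row_sums, 1)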
Example 2
    def predict(self, info=True):
        test_label_path = DATA_DIR + '/testlabels'
        test_images_path = DATA_DIR + '/testimages'

        correct_labels = []

        with open(test_label_path) as f:
            for line in f:
                correct_labels.append(int(line))

        num_images = len(correct_labels)

        test_images = [[None for _ in range(self.row)] for _ in range(num_images)]
        with open(test_images_path) as f:
            for n in range(num_images):
                for y in range(self.row):
                    test_images[n][y] = list(f.readline().rstrip('\n'))

        predicted_labels = []

        for n in range(num_images):
            # Binarize the image: non-blank pixels ('+' or '#') become 1
            features = np.zeros((self.row, self.col))
            for y in range(self.row):
                for x in range(self.col):
                    if test_images[n][y][x] in ['+', '#']:
                        features[y][x] = 1
            decision = self.train_decision(features)
            predicted_labels.append(decision)

        truths = np.array(correct_labels)
        predictions = np.array(predicted_labels)
        accuracy = calc_accuracy(truths, predictions)
        logger.info('Perceptron model is {0:.2f}% accurate on the digit data'.format(accuracy))

        if info:
            confm = confusion_matrix(truths, predictions, self.num_classes)
            class_accuracies = [confm[n][n] for n in range(self.num_classes)]
            # Class accuracies
            for n, x in enumerate(class_accuracies):
                logger.info('Class {0} has an accuracy of {1:.2f}%'.format(n, 100 * x))

            # Confusion matrix
            plt.figure()
            plt.imshow(confm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
            plt.title('Confusion Matrix')
            plt.xticks(np.arange(self.num_classes))
            plt.yticks(np.arange(self.num_classes))
            plt.xlabel('Predictions')
            plt.ylabel('Truths')

            X, Y = np.meshgrid(range(self.col), range(self.row))
            Y = Y[::-1]
            for i in range(self.num_classes):
                hf = plt.figure()
                ha = hf.add_subplot(projection='3d')

                ha.plot_surface(X, Y, self.feature_weight_vectors[i], rstride=1, cstride=1,
                                linewidth=0, cmap=cm.coolwarm, antialiased=False)
                ha.set_xlabel('X')
                ha.set_ylabel('Y')
                ha.set_zlabel('weight')
            plt.show()
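
`train_decision` and `feature_weight_vectors` are defined elsewhere in the class. Judging by those names, the decision rule is presumably the standard multiclass perceptron argmax over per-class scores; the sketch below is an assumption based on the call site, not the project's actual implementation.

import numpy as np

def train_decision(self, features):
    # Hypothetical multiclass perceptron decision rule: score each class as
    # the elementwise product of its weight matrix with the binarized image,
    # summed, then pick the highest-scoring class
    scores = [np.sum(w * features) for w in self.feature_weight_vectors]
    return int(np.argmax(scores))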
Example 3

    def predict(self, info=True):
        if self.runmode == 'digits':
            test_label_path = DATA_DIR + '/testlabels'
            test_images_path = DATA_DIR + '/testimages'
        elif self.runmode == 'faces':
            test_label_path = DATA_DIR + '/facedatatestlabels'
            test_images_path = DATA_DIR + '/facedatatest'

        correct_labels = []

        with open(test_label_path) as f:
            for line in f:
                correct_labels.append(int(line))

        num_images = len(correct_labels)
        # Using python list instead of np since np chararrays replace spaces with empty string
        test_images = [[None for _ in range(self.row)] for _ in range(num_images)]

        with open(test_images_path) as f:
            for n in range(num_images):
                for y in range(self.row):
                    test_images[n][y] = list(f.readline().rstrip('\n'))

        predicted_labels = []
        for n in range(num_images):
            map_classifier = np.zeros(self.num_classes)
            for num in range(self.num_classes):
                # Start with the log prior: log P(class)
                map_classifier[num] = math.log(self.num_counts[num] / np.sum(self.num_counts))
                for y in range(self.row):
                    for x in range(self.col):
                        pixel = test_images[n][y][x]
                        # Feature 0 is a blank pixel; '+' and '#' are either
                        # separate features (ternary) or merged into one (binary)
                        if pixel == ' ':
                            map_classifier[num] += math.log(self.model[num][y][x][0])
                        elif self.num_features == 3:
                            if pixel == '+':
                                map_classifier[num] += math.log(self.model[num][y][x][1])
                            elif pixel == '#':
                                map_classifier[num] += math.log(self.model[num][y][x][2])
                        elif pixel in ['+', '#']:
                            map_classifier[num] += math.log(self.model[num][y][x][1])
            # Keep (label, log posterior, image index) so the most and least
            # confident test image per class can be recovered below
            predicted_label = np.argmax(map_classifier)
            predicted_labels.append((predicted_label, map_classifier[predicted_label], n))

        truths = np.array(correct_labels)
        predictions = np.array([x[0] for x in predicted_labels])
        accuracy = calc_accuracy(truths, predictions)
        logger.info('NB model is {0:.2f}% accurate on the {1} data with k = {2}.'.format(accuracy, self.runmode, self.k))

        if info:
            cm = confusion_matrix(truths, predictions, self.num_classes)
            class_accuracies = [cm[n][n] for n in range(self.num_classes)]
            # Class accuracies
            for n, x in enumerate(class_accuracies):
                logger.info('Class {0} has an accuracy of {1:.2f}%'.format(n, 100 * x))

            # Confusion matrix
            plt.figure()
            plt.imshow(cm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
            plt.title('Confusion Matrix')
            plt.xticks(np.arange(self.num_classes))
            plt.yticks(np.arange(self.num_classes))
            plt.xlabel('Predictions')
            plt.ylabel('Truths')

            # Test images with the highest and lowest posterior probability.
            # Sorting the (label, posterior, index) tuples orders them by class,
            # then by posterior within each class
            sorted_predictions = sorted(predicted_labels)
            class_indices = []
            for x in range(len(sorted_predictions)):
                # At x = 0 this compares against the last element, which belongs
                # to a different class, so the first class boundary is recorded too
                if sorted_predictions[x][0] != sorted_predictions[x - 1][0]:
                    class_indices.append(x)

            for x in range(len(class_indices)):
                curr_class = sorted_predictions[class_indices[x]][0]
                lowest_idx = sorted_predictions[class_indices[x]][2]
                try:
                    highest_idx = sorted_predictions[class_indices[x + 1] - 1][2]
                except IndexError:
                    highest_idx = sorted_predictions[-1][2]
                best_test_image = [[0 if px in ['#', '+'] else 1 for px in row] for row in test_images[highest_idx]]
                worst_test_image = [[0 if px in ['#', '+'] else 1 for px in row] for row in test_images[lowest_idx]]
                plt.figure()
                plt.suptitle('Class {0}'.format(curr_class))
                plt.subplot(1, 2, 1)
                plt.imshow(best_test_image, cmap=plt.get_cmap('Greys_r'))
                plt.title('Highest')
                plt.xticks([])
                plt.yticks([])
                plt.subplot(1, 2, 2)
                plt.title('Lowest')
                plt.xticks([])
                plt.yticks([])
                plt.imshow(worst_test_image, cmap=plt.get_cmap('Greys_r'))

            # Odds ratios for the four most confused class pairs. Mask the
            # diagonal so only misclassification rates are ranked, then take
            # the four largest off-diagonal entries
            cm_offdiag = cm.copy()
            np.fill_diagonal(cm_offdiag, 0)
            most_confused = np.ravel(cm_offdiag).argsort()[-4:][::-1]
            most_confused_pairs = [(x % self.num_classes, x // self.num_classes) for x in most_confused]

            if self.num_features == 2 and self.runmode == 'digits':
                for i, j in most_confused_pairs:
                    log_likelihood_one = np.zeros((self.row, self.col))
                    log_likelihood_two = np.zeros((self.row, self.col))
                    odds_ratio = np.zeros((self.row, self.col))
                    for y in range(self.row):
                        for x in range(self.col):
                            log_likelihood_one[y][x] = math.log(self.model[i][y][x][1])
                            log_likelihood_two[y][x] = math.log(self.model[j][y][x][1])
                            odds_ratio[y][x] = math.log(self.model[i][y][x][1] / self.model[j][y][x][1])

                    plt.figure()
                    panels = [(log_likelihood_one, 'Likelihood of {0}'.format(i)),
                              (log_likelihood_two, 'Likelihood of {0}'.format(j)),
                              (odds_ratio, 'Odds ratio')]
                    # The three panels share an identical layout, so draw them in a loop
                    for pos, (data, title) in enumerate(panels, start=1):
                        plt.subplot(1, 3, pos)
                        plt.imshow(data, interpolation='nearest')
                        plt.title(title)
                        plt.xticks([])
                        plt.yticks([])
                        cbar = plt.colorbar(shrink=0.35)
                        cbar.set_ticks(np.arange(np.amin(data), np.amax(data), step=2, dtype=np.int8))
                        for t in cbar.ax.get_yticklabels():
                            t.set_horizontalalignment('right')
                            t.set_x(4)

            plt.show()
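
The `self.model[num][y][x][f]` lookups and the smoothing constant `self.k` in the log message imply Laplace-smoothed per-pixel likelihoods estimated at training time. A minimal sketch of that estimate, assuming hypothetical count arrays `pixel_counts` (per class, pixel, and feature value) and `num_counts` (training images per class):

import numpy as np

def estimate_likelihoods(pixel_counts, num_counts, k, num_features):
    # pixel_counts: shape (num_classes, rows, cols, num_features), raw counts
    # num_counts:   shape (num_classes,), training images per class
    # Laplace (add-k) smoothing keeps every probability strictly positive,
    # which is what lets predict() take math.log of each model entry:
    #   P(F_xy = f | class) = (count + k) / (class_count + k * num_features)
    denom = num_counts[:, None, None, None] + k * num_features
    return (pixel_counts + k) / denom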