def get_learning_curve(self, learner):
        """Plot and save a learning curve (error vs. # training examples).

        For ~25 increasing training-set sizes, fits `learner` via 5-fold
        cross validation and records train/validation error (1 - accuracy).
        Saves the figure to `<image_dir>/boosted_dt_learning_curve.png` and
        shows it interactively.

        :param learner: estimator accepted by sm.kfold_validation
        """
        # Integer floor-division: plain `/` yields a float under Python 3,
        # which range() rejects. Guard against a zero step for tiny sets.
        step_size = max(1, self.data.x_train.shape[0] // 25)
        steps = list(range(10, self.data.x_train.shape[0], step_size))
        train_errors = []
        validation_errors = []

        for i in steps:
            print(i)  # progress indicator; k-fold runs can be slow
            # Only the first split (size i) is used; the remainder is discarded.
            x_temp, x_remainder, y_temp, y_remainder = ms.train_test_split(
                self.data.x_train,
                self.data.y_train,
                train_size=i,
                random_state=11)
            train_accuracy, validation_accuracy = sm.kfold_validation(learner,
                                                                      x_temp,
                                                                      y_temp,
                                                                      k=5)
            train_errors.append(1 - train_accuracy)
            validation_errors.append(1 - validation_accuracy)

        img_path = self.data.image_dir + 'boosted_dt_learning_curve.png'
        plt.plot(steps, train_errors, label='Train Error')
        plt.plot(steps, validation_errors, label='Validation Error')
        plt.xlabel('Training Examples')
        plt.ylabel('Error')
        plt.title(self.data.name + ' Boosted Decision Tree Learning Curve')
        plt.legend()
        # Save before show(): show() may clear the figure on some backends.
        plt.savefig(img_path)
        plt.show()
        plt.close()
    def find_optimal_num_estimators(self, base_learner, learning_rate):
        """Plot AdaBoost accuracy as a function of the ensemble size.

        Runs 5-fold cross validation for a fixed grid of n_estimators values
        and saves the accuracy curves to
        `<image_dir>/boosted_dt_num_estimators.png`.

        :param base_learner: base estimator for AdaBoostClassifier
        :param learning_rate: AdaBoost learning rate (fixed across the sweep)
        """
        num_estimators = [1, 25, 50, 75, 100, 150, 200]
        print(num_estimators)
        train_accuracies = []
        validation_accuracies = []
        for i in num_estimators:
            print(i)  # progress indicator; each fit is a full k-fold run
            boost_learner = ensemble.AdaBoostClassifier(
                base_learner, n_estimators=i, learning_rate=learning_rate)
            train_acc, validation_acc = sm.kfold_validation(boost_learner,
                                                            self.data.x_train,
                                                            self.data.y_train,
                                                            k=5)

            train_accuracies.append(train_acc)
            validation_accuracies.append(validation_acc)

        img_path = self.data.image_dir + 'boosted_dt_num_estimators.png'
        plt.plot(num_estimators, train_accuracies, label='Train Accuracy')
        plt.plot(num_estimators,
                 validation_accuracies,
                 label='Validation Accuracy')
        plt.xlabel('# Estimators')
        plt.ylabel('Accuracy')
        plt.title(self.data.name + ' Boosted DT Accuracy')
        plt.legend()
        plt.savefig(img_path)
        plt.close()
# (removed scraper page artifact: "Ejemplo n.º 3" separator — not part of the code)
    def get_optimal_leaf_size(self, leaf_sizes):
        """Sweep min_samples_leaf for a gini decision tree via 5-fold CV.

        Plots train/validation accuracy vs. leaf size, saves the figure to
        `<image_dir>/dt_leaf_size_accuracy.png`, and returns the best result.

        :param leaf_sizes: sequence of min_samples_leaf values to try
        :returns: (best_validation_accuracy, optimal_leaf_size).
            NOTE: despite the historical name `min_error`, the first value
            is the *maximum* validation accuracy, not an error.
        """
        sizes = list(leaf_sizes)  # may be iterated twice (loop + plot/index)
        train_accuracys = []
        validation_accuracys = []
        for i in sizes:
            dtl = tree.DecisionTreeClassifier(criterion='gini',
                                              min_samples_leaf=i)
            train_accuracy, validation_accuracy = sm.kfold_validation(
                dtl, self.data.x_train, self.data.y_train, k=5)
            train_accuracys.append(train_accuracy)
            validation_accuracys.append(validation_accuracy)

        img_path = self.data.image_dir + 'dt_leaf_size_accuracy.png'
        plt.plot(sizes, train_accuracys, label='Train Accuracy')
        plt.plot(sizes, validation_accuracys, label='Validation Accuracy')
        plt.xlabel('Leaf Size')
        plt.ylabel('Accuracy')
        plt.title(self.data.name + ' Decision Tree Accuracy by Leaf Size')
        plt.legend()
        # Save BEFORE show(): on interactive backends show() can leave the
        # figure empty, so the previous savefig-after-show wrote a blank PNG.
        plt.savefig(img_path)
        plt.show()
        plt.close()

        temp_validation_acc = np.array(validation_accuracys)
        # Skip index 0 — presumably to exclude the smallest (overfitting)
        # leaf size from consideration; TODO confirm with the author.
        min_error = temp_validation_acc[1:].max()
        best_idx = int(temp_validation_acc[1:].argmax()) + 1
        # Index into the actual sweep values instead of assuming
        # leaf_sizes == range(1, N) (the old `argmax() + 2` hard-coded that).
        optimal_leaf_size = sizes[best_idx]
        return min_error, optimal_leaf_size
# (removed scraper page artifact: "Ejemplo n.º 4" separator — not part of the code)
    def show_learning_curve(self, learner):
        """Plot and save a decision-tree learning curve.

        For training-set sizes 10, 35, 60, ... runs 5-fold cross validation
        and records train/validation error (1 - accuracy). Saves the figure
        to `<image_dir>/dt_learning_curve.png` and shows it interactively.

        :param learner: estimator accepted by sm.kfold_validation
        """
        train_errors = []
        validation_errors = []
        # range(), not Py2-only xrange(); materialized since it is reused
        # below as the plot's x axis.
        train_examples = list(range(10, self.data.x_train.shape[0], 25))
        for i in train_examples:
            # Only the first split (size i) is used; the remainder is discarded.
            x_temp, x_remainder, y_temp, y_remainder = ms.train_test_split(
                self.data.x_train,
                self.data.y_train,
                train_size=i,
                random_state=11)
            train_accuracy, validation_accuracy = sm.kfold_validation(learner,
                                                                      x_temp,
                                                                      y_temp,
                                                                      k=5)
            train_errors.append(1 - train_accuracy)
            validation_errors.append(1 - validation_accuracy)

        img_path = self.data.image_dir + 'dt_learning_curve.png'
        plt.plot(train_examples, train_errors, label='Train Error')
        plt.plot(train_examples, validation_errors, label='Validation Error')
        plt.xlabel('Training Examples')
        plt.ylabel('Error')
        plt.title(self.data.name + ' Decision Tree Learning Curve')
        plt.legend()
        # Save before show(): show() may clear the figure on some backends.
        plt.savefig(img_path)
        plt.show()
        plt.close()
# (removed scraper page artifact: "Ejemplo n.º 5" separator — not part of the code)
    def find_optimal_k(self):
        """Sweep k for a Manhattan-distance (p=1) uniform-weight KNN.

        Evaluates k = 1..149 with 5-fold cross validation, plots the
        accuracy curves to `<image_dir>/knn_num_neighbors.png`, and returns
        the best result.

        :returns: (best_validation_accuracy, optimal_num_neighbors).
            NOTE: despite the historical name `min_error`, the first value
            is the *maximum* validation accuracy, not an error.
        """
        # range(), not Py2-only xrange(); materialized since it is reused
        # below as the plot's x axis.
        n_neighbors = list(range(1, 150))
        train_accuracies = []
        validation_accuracies = []
        for i in n_neighbors:
            knnl = skl.neighbors.KNeighborsClassifier(n_neighbors=i,
                                                      weights='uniform',
                                                      p=1)
            train_acc, validation_acc = sm.kfold_validation(knnl,
                                                            self.data.x_train,
                                                            self.data.y_train,
                                                            k=5)
            train_accuracies.append(train_acc)
            validation_accuracies.append(validation_acc)

        img_path = self.data.image_dir + 'knn_num_neighbors.png'
        plt.plot(n_neighbors, train_accuracies, label='Train Accuracy')
        plt.plot(n_neighbors,
                 validation_accuracies,
                 label='Validation Accuracy')
        plt.xlabel('# Neighbors')
        plt.ylabel('Accuracy')
        plt.title(self.data.name + ' KNN Accuracy by # Neighbors')
        plt.legend()
        plt.savefig(img_path)
        plt.show()
        plt.close()

        temp_validation_acc = np.array(validation_accuracies)
        min_error = temp_validation_acc.max()
        # +1 maps the 0-based argmax index back to k, since k starts at 1.
        optimal_num_neighbors = int(temp_validation_acc.argmax()) + 1

        return min_error, optimal_num_neighbors