コード例 #1
0
ファイル: dtree_learner.py プロジェクト: kjohnsen/cs478
    def train(self, features, labels):
        """Build a decision tree and prune it with a validation set.

        :type features: Matrix
        :type labels: Matrix
        """
        # The FIRST 80% of the rows becomes the pruning validation set;
        # the remaining 20% is sliced into train_features/train_labels.
        vs_size = int(0.8 * features.rows)
        vs_features = Matrix(features, 0, 0, vs_size, features.cols)
        vs_labels = Matrix(labels, 0, 0, vs_size, labels.cols)
        print(f'Holding out {vs_size} instances for validation set')
        train_features = Matrix(features, vs_size, 0, features.rows - vs_size,
                                features.cols)
        train_labels = Matrix(labels, vs_size, 0, labels.rows - vs_size,
                              labels.cols)

        # NOTE(review): the tree is built on the FULL dataset, so the rows
        # used for pruning were also seen during training, and the
        # train_features/train_labels slices above are currently unused —
        # confirm whether the commented-out call was the intended behavior.
        # self.build_tree_for_instances(train_features, train_labels)
        self.build_tree_for_instances(features, labels)
        # print(self.num_nodes)
        self.prune_tree_using_instances(vs_features, vs_labels)
        print(self.num_nodes)
        print(self.root.traverse())

        print(self.root)
コード例 #2
0
 def get_label_list(self, labels):
     """Return one Matrix per output class, one-hot encoding *labels*.

     Each returned matrix starts as a copy of *labels*; column 0 of the
     matrix for class *c* is then overwritten with 1 where the label row's
     first value equals *c* and 0 otherwise.

     :type labels: Matrix
     :rtype: list[Matrix]
     """
     one_hot = [Matrix(labels, 0, 0, labels.rows, labels.cols)
                for _ in range(self.output_classes)]
     for row_idx in range(labels.rows):
         target = int(labels.row(row_idx)[0])
         for cls_idx, matrix in enumerate(one_hot):
             matrix.set(row_idx, 0, 1 if target == cls_idx else 0)
     return one_hot
コード例 #3
0
ファイル: backprop_learner.py プロジェクト: kjohnsen/cs478
    def train(self, features, labels):
        """Train the network online with early stopping on a hold-out set.

        :type features: Matrix
        :type labels: Matrix
        """
        self.setup_network(features.cols, labels.value_count(0))
        # The first chunk of rows is held out for validation; the rest trains.
        vs_size = int(self.validation_set_proportion * features.rows)
        vs_features = Matrix(features, 0, 0, vs_size, features.cols)
        vs_labels = Matrix(labels, 0, 0, vs_size, labels.cols)
        print(f'Holding out {vs_size} instances for validation set')
        train_features = Matrix(features, vs_size, 0, features.rows - vs_size,
                                features.cols)
        train_labels = Matrix(labels, vs_size, 0, labels.rows - vs_size,
                              labels.cols)

        stagnant_epochs = 0
        epochs = 0
        best_vs_mse = float('inf')
        # Early stopping: quit once validation MSE has not improved for
        # 40 consecutive epochs.
        while stagnant_epochs < 40:
            this_train_mse = 0
            for i in range(train_features.rows):
                inputs = train_features.row(i)
                target = train_labels.row(i)[0]
                # Forward pass then backprop for this single instance
                # (online / stochastic weight updates).
                self.calc_set_out(inputs)
                self.output_layer.set_target(target)
                self.calc_deltas()
                self.update_weights()
                this_train_mse += self.se_for_instance()
            this_train_mse = this_train_mse / train_features.rows

            epochs += 1
            this_vs_mse, this_vs_accy = self.calc_mse_and_accy(
                vs_features, vs_labels)
            # print(f'{epochs},{this_train_mse},{this_vs_mse},{this_vs_accy}')

            if this_vs_mse < best_vs_mse:
                stagnant_epochs = 0
                best_vs_mse = this_vs_mse
            else:
                stagnant_epochs += 1

            # Reshuffle training rows each epoch; labels move in lockstep.
            train_features.shuffle(train_labels)

        self.write_out(
            f'{epochs},{self.hidden_layers[-1].num_nb_nodes},{this_train_mse},{this_vs_mse},'
        )
        print(f'{epochs} epochs elapsed in training')
コード例 #4
0
ファイル: dtree_learner.py プロジェクト: kjohnsen/cs478
    def split(self, features, labels):
        """Recursively grow the tree by splitting on the best attribute.

        :type features: Matrix
        :type labels: Matrix
        """
        # Pure node: every label is identical, so this becomes a leaf.
        if labels.count_values_present(0) == 1:
            self.output = labels.get(0, 0)
            return
        # No attributes left to split on: predict the majority class.
        if features.cols == 0:
            self.output = labels.most_common_value(0)
            return
        # Find lowest-info attribute (highest information gain).
        self.att_idx = self.highest_info_gain_att(features, labels)
        self.att_name = features.attr_name(self.att_idx)
        # print(f'Splitting on {self.att_name}')
        self.children_names = features.enum_to_str[self.att_idx]
        # Partition rows by the chosen attribute's value, dropping that
        # attribute's column from each child's feature matrix.
        feats_by_value = {}
        labels_by_value = {}
        for row_idx in range(features.rows):
            row = features.row(row_idx)
            label = labels.row(row_idx)
            value = row[self.att_idx]
            reduced_row = copy.deepcopy(row)
            del reduced_row[self.att_idx]
            if value not in feats_by_value:
                # First row with this value: build child matrices whose
                # metadata excludes the split attribute.
                child_attr_names = copy.deepcopy(features.attr_names)
                del child_attr_names[self.att_idx]
                child_str_to_enum = copy.deepcopy(features.str_to_enum)
                del child_str_to_enum[self.att_idx]
                child_enum_to_str = copy.deepcopy(features.enum_to_str)
                del child_enum_to_str[self.att_idx]
                child_feats = Matrix()
                child_feats.data = []
                child_feats.str_to_enum = child_str_to_enum
                child_feats.enum_to_str = child_enum_to_str
                child_feats.attr_names = child_attr_names
                feats_by_value[value] = child_feats

                child_labels = Matrix()
                child_labels.data = []
                labels_by_value[value] = child_labels
            feats_by_value[value].data.append(reduced_row)
            labels_by_value[value].data.append(label)
        for value in sorted(feats_by_value):
            # Weight each child by its share of this node's rows.
            weight = feats_by_value[value].rows / features.rows
            child = Node(weight, labels.most_common_value(0))
            # print(f'Creating child for {self.att_name} = {features.enum_to_str[self.att_idx][value]}')
            self.children[value] = child
            child.split(feats_by_value[value], labels_by_value[value])
コード例 #5
0
    def train(self, features, labels):
        """
        :type features: Matrix
        :type labels: Matrix
        """
        # LAZY LEARNER: remember the data; real work happens at prediction.
        # Regression mode when the label column is continuous (no enum values).
        self.regression = labels.value_count(0) == 0

        if self.use_weighted_columns:
            # Fit a perceptron on a copy of the data and reuse its weights
            # as per-column importance; NaN weights fall back to 1.
            feature_copy = Matrix(features, 0, 0, features.rows, features.cols)
            self.perceptron = Perceptron()
            self.perceptron.train(feature_copy, labels)
            self.weights = [1 if np.isnan(w) else w
                            for w in self.perceptron.weights]
        self.feature_types = self.get_semantic_types(features)
        self.training_features = features
        self.training_labels = labels
コード例 #6
0
    def get_bootstrap_datasets(self, features, labels):
        """Shuffle a copy of the data and return a sample for one tree.

        :type features: Matrix
        :type labels: Matrix
        :return: (sample_features, sample_labels) tuple of Matrix slices
        """
        # Copy first so the caller's matrices are left untouched.
        shuffled_features = Matrix(features, 0, 0, features.rows,
                                   features.cols)
        shuffled_labels = Matrix(labels, 0, 0, labels.rows, labels.cols)
        shuffled_features.shuffle(buddy=shuffled_labels)

        # Take the first percent_strap share of the shuffled rows.
        # NOTE(review): the sample keeps all feature columns but the last,
        # and only the last label column — presumably the copies still carry
        # the target column; confirm against the Matrix layout used upstream.
        sample_size = int(self.percent_strap * shuffled_features.rows)
        sample_features = Matrix(shuffled_features, 0, 0, sample_size,
                                 shuffled_features.cols - 1)
        sample_labels = Matrix(shuffled_labels, 0,
                               shuffled_labels.cols - 1, sample_size, 1)
        return sample_features, sample_labels
コード例 #7
0
def make_plot():
    """Load the linear dataset and split it into features and labels.

    NOTE(review): as shown here the function builds *features*/*labels*
    but never plots or returns them — the snippet appears truncated.
    """
    data = Matrix()
    data.load_arff("../datasets/linear.arff")
    # All columns but the last are inputs; the last column is the target.
    features = Matrix(data, 0, 0, data.rows, data.cols - 1)
    labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
コード例 #8
0
    def main(self, learner, learner_name, file_name, seed, train=True):
        """Load an ARFF dataset and evaluate *learner* on it.

        The evaluation method is hard-coded to "random" with a 70/30
        train/test split; the other branches (training / static / cross)
        are retained but unreachable without editing eval_method below.

        :param learner: model exposing train() and measure_accuracy()
        :param learner_name: display name for the report header
        :param file_name: path to the ARFF dataset
        :param seed: value forwarded to random.seed for reproducibility
        :param train: when False, skip training (training-set branch only)
        :return: learner.w when *train* is True
        """
        # parse the command-line arguments
        # Evaluation method (training | static <test_ARFF_file> | random <%%_for_training> | cross <num_folds>)
        eval_method = "random"
        eval_parameter = .7
        # boolean: Print the confusion matrix and learner accuracy on individual class values
        print_confusion_matrix = False
        # boolean: Use normalized data
        normalize = False
        # string: Random seed
        random.seed(seed)

        # load the ARFF file
        data = Matrix()
        data.load_arff(file_name)
        if normalize:
            print("Using normalized data")
            data.normalize()

        # print some stats
        print("\nDataset name: {}\n"
              "Number of instances: {}\n"
              "Number of attributes: {}\n"
              "Learning algorithm: {}\n"
              "Evaluation method: {}\n".format(file_name, data.rows, data.cols,
                                               learner_name, eval_method))

        if eval_method == "training":

            print("Calculating accuracy on training set...")

            # Last column is the label; everything before it is a feature.
            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
            confusion = Matrix()
            start_time = time.time()
            if train:
                learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))
            accuracy = learner.measure_accuracy(features, labels, confusion)
            print("Training set accuracy: " + str(accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "static":

            print("Calculating accuracy on separate test set...")

            # NOTE(review): unlike the initial load, this constructs the test
            # matrix via Matrix(arff=...) — confirm the constructor accepts
            # an `arff` keyword.
            test_data = Matrix(arff=eval_parameter)
            if normalize:
                test_data.normalize()

            print("Test set name: {}".format(eval_parameter))
            print("Number of test instances: {}".format(test_data.rows))
            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)

            start_time = time.time()
            learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(features, labels)
            print("Training set accuracy: {}".format(train_accuracy))

            test_features = Matrix(test_data, 0, 0, test_data.rows,
                                   test_data.cols - 1)
            test_labels = Matrix(test_data, 0, test_data.cols - 1,
                                 test_data.rows, 1)
            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "random":

            print("Calculating accuracy on a random hold-out set...")
            train_percent = float(eval_parameter)
            if train_percent < 0 or train_percent > 1:
                raise Exception(
                    "Percentage for random evaluation must be between 0 and 1")
            print("Percentage used for training: {}".format(train_percent))
            print("Percentage used for testing: {}".format(1 - train_percent))

            data.shuffle()

            # Disjoint split: first train_size rows train, the rest test.
            train_size = int(train_percent * data.rows)
            train_features = Matrix(data, 0, 0, train_size, data.cols - 1)
            train_labels = Matrix(data, 0, data.cols - 1, train_size, 1)

            test_features = Matrix(data, train_size, 0, data.rows - train_size,
                                   data.cols - 1)
            test_labels = Matrix(data, train_size, data.cols - 1,
                                 data.rows - train_size, 1)

            start_time = time.time()
            learner.train(train_features, train_labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(train_features,
                                                      train_labels)
            print("Training set accuracy: {}".format(train_accuracy))

            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "cross":

            print("Calculating accuracy using cross-validation...")

            folds = int(eval_parameter)
            if folds <= 0:
                raise Exception("Number of folds must be greater than 0")
            print("Number of folds: {}".format(folds))
            reps = 1
            sum_accuracy = 0.0
            elapsed_time = 0.0
            for j in range(reps):
                data.shuffle()
                for i in range(folds):
                    # Fold i occupies rows [begin, end); the training set is
                    # the rows before it plus (via .add below) the rows after.
                    begin = int(i * data.rows / folds)
                    end = int((i + 1) * data.rows / folds)

                    train_features = Matrix(data, 0, 0, begin, data.cols - 1)
                    train_labels = Matrix(data, 0, data.cols - 1, begin, 1)

                    test_features = Matrix(data, begin, 0, end - begin,
                                           data.cols - 1)
                    test_labels = Matrix(data, begin, data.cols - 1,
                                         end - begin, 1)

                    train_features.add(data, end, 0, data.cols - 1)
                    train_labels.add(data, end, data.cols - 1, 1)

                    start_time = time.time()
                    learner.train(train_features, train_labels)
                    elapsed_time += time.time() - start_time

                    accuracy = learner.measure_accuracy(
                        test_features, test_labels)
                    sum_accuracy += accuracy
                    print("Rep={}, Fold={}, Accuracy={}".format(
                        j, i, accuracy))

            elapsed_time /= (reps * folds)
            print(
                "Average time to train (in seconds): {}".format(elapsed_time))
            print("Mean accuracy={}".format(sum_accuracy / (reps * folds)))

        else:
            raise Exception(
                "Unrecognized evaluation method '{}'".format(eval_method))

        # NOTE(review): assumes the learner exposes a `w` attribute
        # (perceptron-style weights) — confirm for every learner passed in.
        if train:
            return learner.w
コード例 #9
0
    def train(self, features, labels):
        """Train the MLP, tracking the best weights seen across epochs.

        :type features: Matrix
        :type labels: Matrix
        """
        print("The learning rate for this model is {} \n with momentum {}".
              format(self.learning_rate, self.momentum))
        features_bias = Matrix(features, 0, 0, features.rows, features.cols)
        features_bias = self.add_bias_to_features(features_bias)

        #### Prepare Validation Set ####
        if self.validation_set:
            features_bias.shuffle(buddy=labels)

            test_size = int(.1 * features_bias.rows)
            # NOTE(review): bias was already added to features_bias above and
            # add_bias_to_features is applied again to the slices below —
            # confirm the helper is idempotent.
            train_features = Matrix(features_bias, 0, 0,
                                    features_bias.rows - test_size,
                                    features_bias.cols)
            train_features = self.add_bias_to_features(train_features)
            train_labels = Matrix(labels, 0, 0, features_bias.rows - test_size,
                                  labels.cols)

            # NOTE(review): the validation slice starts at row `test_size`,
            # which overlaps the training rows [0, rows - test_size); a
            # disjoint hold-out would start at rows - test_size — verify.
            test_features = Matrix(features_bias, test_size, 0, test_size,
                                   features_bias.cols)
            test_features = self.add_bias_to_features(test_features)
            test_labels = Matrix(labels, test_size, 0, test_size, labels.cols)

        ##### Setup Weights #####
        # Output layer width = number of distinct values in the label column.
        self.output_classes = len(set(labels.col(labels.cols - 1)))
        # set up output layer => the number of classes by the size of the previous hidden layer
        self.output_layer = np.random.uniform(low=self.min,
                                              high=self.max,
                                              size=(self.output_classes,
                                                    self.nodes_per_layer + 1))
        # setup input layer to match specs - number of nodes to connect to, number of inputs plus bias
        self.input_layer = np.random.uniform(low=self.min,
                                             high=self.max,
                                             size=(self.nodes_per_layer,
                                                   features_bias.cols + 1))
        # setup output layer to match specs
        self.num_output_layers = labels.cols
        # create a new features set with a bias for training
        last_row_num = features_bias.rows - 1
        # Nominal (classification) output iff the label column is enumerated.
        self._is_nominal_output = labels.value_count(0) != 0

        # Best-so-far weights; restored after the training loop ends.
        self.best_inputs = self.input_layer
        self.best_hidden = self.hidden_layers
        self.best_output = self.output_layer

        if not self.validation_set:
            # No hold-out requested: train and "validate" on the full data.
            train_features = features_bias
            test_features = features_bias
            train_labels = labels
            test_labels = labels

        # start learning
        while self._is_still_learning(self):
            print(" #### On Epoch Number {}".format(self.epoch_count))
            train_features.shuffle(buddy=train_labels)
            for row_num in range(train_features.rows):
                #print("On input number {} #############".format(row_num + 1))
                row = train_features.row(row_num)
                #print("Feed Forward with row: {}".format(row))
                output = self._feed_forward(row)
                #print("backpropogating errors with output: {}".format(output))
                self._back_propagate(
                    output, train_labels.row(row_num), row,
                    self.batch_norm_enabled,
                    (self.batch_norm_enabled and row_num == last_row_num))
            # test on validation set
            accuracy_for_epoch: float = self.measure_accuracy(test_features,
                                                              test_labels,
                                                              MSE=self.MSE)
            accuracy_for_epoch_train: float = self.measure_accuracy(
                train_features, train_labels, MSE=self.MSE)
            if self.epoch_count > 1:
                # NOTE(review): in the MSE branch the weights are saved when
                # the best (minimum) recorded MSE is LESS than this epoch's —
                # i.e. when this epoch is worse; verify the intended
                # comparison direction.
                if (self.MSE
                        and min(self.accuracy_hash.items(),
                                key=lambda x: x[1])[1] < accuracy_for_epoch
                    ) or (not self.MSE
                          and max(self.accuracy_hash.items(),
                                  key=lambda x: x[1])[1] < accuracy_for_epoch):
                    self.best_inputs = self.input_layer
                    self.best_hidden = self.hidden_layers
                    self.best_output = self.output_layer
            self.accuracy_hash[self.epoch_count] = accuracy_for_epoch
            self.accuracy_hash_train[
                self.epoch_count] = accuracy_for_epoch_train

            self.epoch_count += 1

        print("The best accuracy on the validation set was {}".format(
            max(self.accuracy_hash.items(), key=lambda x: x[1])))

        # Restore the best weights before the final evaluation.
        self.input_layer = self.best_inputs
        self.hidden_layers = self.best_hidden
        self.output_layer = self.best_output

        final_vs: float = self.measure_accuracy(test_features,
                                                test_labels,
                                                MSE=self.MSE)
        final_ts: float = self.measure_accuracy(train_features,
                                                train_labels,
                                                MSE=self.MSE)
        print("The final best for VS: {} and for TS: {}".format(
            final_vs, final_ts))

        return
コード例 #10
0
    def main(self):
        """Command-line entry point: parse args, load an ARFF dataset, and
        evaluate the selected learner with the requested evaluation method
        (training | static | random | cross).
        """
        # parse the command-line arguments
        args = self.parser().parse_args()
        file_name = args.arff
        learner_name = args.L
        eval_method = args.E[0]
        # Optional second -E value: test file, train fraction, or fold count
        # depending on eval_method.
        eval_parameter = args.E[1] if len(args.E) > 1 else None
        print_confusion_matrix = args.verbose
        normalize = args.normalize
        random.seed(
            args.seed
        )  # Use a seed for deterministic results, if provided (makes debugging easier)

        # load the model
        learner = self.get_learner(learner_name)

        # load the ARFF file
        data = Matrix()
        data.load_arff(file_name)
        if normalize:
            print("Using normalized data")
            data.normalize()

        # print some stats
        print("\nDataset name: {}\n"
              "Number of instances: {}\n"
              "Number of attributes: {}\n"
              "Learning algorithm: {}\n"
              "Evaluation method: {}\n".format(file_name, data.rows, data.cols,
                                               learner_name, eval_method))

        if eval_method == "training":

            print("Calculating accuracy on training set...")

            # Last column is the label; everything before it is a feature.
            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
            confusion = Matrix()
            start_time = time.time()
            learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))
            accuracy = learner.measure_accuracy(features, labels, confusion)
            print("Training set accuracy: " + str(accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "static":

            print("Calculating accuracy on separate test set...")

            # NOTE(review): unlike the initial load, this constructs the test
            # matrix via Matrix(arff=...) — confirm the constructor accepts
            # an `arff` keyword.
            test_data = Matrix(arff=eval_parameter)
            if normalize:
                test_data.normalize()

            print("Test set name: {}".format(eval_parameter))
            print("Number of test instances: {}".format(test_data.rows))
            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)

            start_time = time.time()
            learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(features, labels)
            print("Training set accuracy: {}".format(train_accuracy))

            test_features = Matrix(test_data, 0, 0, test_data.rows,
                                   test_data.cols - 1)
            test_labels = Matrix(test_data, 0, test_data.cols - 1,
                                 test_data.rows, 1)
            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "random":

            print("Calculating accuracy on a random hold-out set...")
            train_percent = float(eval_parameter)
            if train_percent < 0 or train_percent > 1:
                raise Exception(
                    "Percentage for random evaluation must be between 0 and 1")
            print("Percentage used for training: {}".format(train_percent))
            print("Percentage used for testing: {}".format(1 - train_percent))

            data.shuffle()

            # Disjoint split: first train_size rows train, the rest test.
            train_size = int(train_percent * data.rows)
            train_features = Matrix(data, 0, 0, train_size, data.cols - 1)
            train_labels = Matrix(data, 0, data.cols - 1, train_size, 1)

            test_features = Matrix(data, train_size, 0, data.rows - train_size,
                                   data.cols - 1)
            test_labels = Matrix(data, train_size, data.cols - 1,
                                 data.rows - train_size, 1)

            start_time = time.time()
            learner.train(train_features, train_labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(train_features,
                                                      train_labels)
            print("Training set accuracy: {}".format(train_accuracy))

            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "cross":

            print("Calculating accuracy using cross-validation...")

            folds = int(eval_parameter)
            if folds <= 0:
                raise Exception("Number of folds must be greater than 0")
            print("Number of folds: {}".format(folds))
            reps = 1
            sum_accuracy = 0.0
            elapsed_time = 0.0
            for j in range(reps):
                data.shuffle()
                for i in range(folds):
                    # Fold i occupies rows [begin, end); the training set is
                    # the rows before it plus (via .add below) the rows after.
                    begin = int(i * data.rows / folds)
                    end = int((i + 1) * data.rows / folds)

                    train_features = Matrix(data, 0, 0, begin, data.cols - 1)
                    train_labels = Matrix(data, 0, data.cols - 1, begin, 1)

                    test_features = Matrix(data, begin, 0, end - begin,
                                           data.cols - 1)
                    test_labels = Matrix(data, begin, data.cols - 1,
                                         end - begin, 1)

                    train_features.add(data, end, 0, data.cols - 1)
                    train_labels.add(data, end, data.cols - 1, 1)

                    start_time = time.time()
                    learner.train(train_features, train_labels)
                    elapsed_time += time.time() - start_time

                    accuracy = learner.measure_accuracy(
                        test_features, test_labels)
                    sum_accuracy += accuracy
                    print("Rep={}, Fold={}, Accuracy={}".format(
                        j, i, accuracy))

            elapsed_time /= (reps * folds)
            print(
                "Average time to train (in seconds): {}".format(elapsed_time))
            print("Mean accuracy={}".format(sum_accuracy / (reps * folds)))

        else:
            raise Exception(
                "Unrecognized evaluation method '{}'".format(eval_method))
コード例 #11
0
    def get_training_sets(self, features, labels):
        """Split the data into train/validation sets as pandas DataFrames.

        When ``self.validation_set`` is set, the rows are shuffled and 40%
        are held out for validation; otherwise the full dataset serves as
        both the training and the "validation" set.

        :type features: Matrix
        :type labels: Matrix
        :return: (test_features, test_labels, train_features, train_labels)
        """
        features_bias = Matrix(features, 0, 0, features.rows, features.cols)
        # features_bias = self.add_bias_to_features(features_bias)

        #### Prepare Validation Set ####
        if self.validation_set:
            features_bias.shuffle(buddy=labels)

            test_size = int(.4 * features_bias.rows)
            train_size = features_bias.rows - test_size
            train_features = Matrix(features_bias, 0, 0, train_size,
                                    features_bias.cols)
            train_labels = Matrix(labels, 0, 0, train_size, labels.cols)

            # BUG FIX: the hold-out slice previously started at row
            # `test_size`, overlapping the training rows [0, train_size) and
            # never using the tail of the data; it must start where the
            # training slice ends so the two sets are disjoint.
            test_features = Matrix(features_bias, train_size, 0, test_size,
                                   features_bias.cols)
            test_labels = Matrix(labels, train_size, 0, test_size,
                                 labels.cols)

            train_features = train_features.return_pandas_df()
            train_labels = train_labels.return_pandas_df()
            test_features = test_features.return_pandas_df()
            test_labels = test_labels.return_pandas_df()
        else:
            # No hold-out: train and validate on the full dataset.
            features_bias = features_bias.return_pandas_df()
            labels = labels.return_pandas_df()

            train_features = features_bias
            test_features = features_bias
            train_labels = labels
            test_labels = labels

        return test_features, test_labels, train_features, train_labels