def main(self, learner, learner_name, file_name, seed, train=True, *,
         eval_method="random", eval_parameter=.7,
         print_confusion_matrix=False, normalize=False):
        """Load an ARFF dataset, evaluate ``learner`` on it and print the results.

        The last column of the loaded data is used as the label column and
        every preceding column as a feature.

        Args:
            learner: Object providing ``train(features, labels)`` and
                ``measure_accuracy(features, labels[, confusion])``. Its ``w``
                attribute is returned when ``train`` is true.
            learner_name: Name shown in the printed summary.
            file_name: Path handed to ``Matrix.load_arff``.
            seed: Seed for the global ``random`` generator.
            train: Only consulted by the ``"training"`` evaluation method,
                where a false value skips ``learner.train``; the other
                methods always train. It also decides the return value.
                NOTE(review): confirm that ignoring it outside "training"
                is intended.
            eval_method: ``"training"``, ``"static"``, ``"random"`` or
                ``"cross"``.
            eval_parameter: Argument of the evaluation method: the test ARFF
                file for ``"static"``, the training fraction in [0, 1] for
                ``"random"``, the number of folds for ``"cross"``. Not read
                by ``"training"``.
            print_confusion_matrix: Print the confusion matrix after the
                accuracy (not available for ``"cross"``).
            normalize: Call ``normalize()`` on the data (and on the static
                test set) before use.

        Returns:
            ``learner.w`` when ``train`` is true, otherwise ``None``.

        Raises:
            Exception: If ``eval_method`` is unknown, the training fraction
                is outside [0, 1], or the number of folds is not positive.
        """
        # Reject a bad method before paying for file loading.
        if eval_method not in ("training", "static", "random", "cross"):
            raise Exception(
                "Unrecognized evaluation method '{}'".format(eval_method))

        random.seed(seed)

        def report_confusion(confusion):
            # Shared by every branch that fills a confusion matrix.
            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        # load the ARFF file
        data = Matrix()
        data.load_arff(file_name)
        if normalize:
            print("Using normalized data")
            data.normalize()

        # print some stats
        print("\nDataset name: {}\n"
              "Number of instances: {}\n"
              "Number of attributes: {}\n"
              "Learning algorithm: {}\n"
              "Evaluation method: {}\n".format(file_name, data.rows, data.cols,
                                               learner_name, eval_method))

        if eval_method == "training":

            print("Calculating accuracy on training set...")

            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
            confusion = Matrix()
            start_time = time.time()
            if train:
                learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))
            accuracy = learner.measure_accuracy(features, labels, confusion)
            print("Training set accuracy: " + str(accuracy))

            report_confusion(confusion)

        elif eval_method == "static":

            print("Calculating accuracy on separate test set...")

            test_data = Matrix(arff=eval_parameter)
            if normalize:
                test_data.normalize()

            print("Test set name: {}".format(eval_parameter))
            print("Number of test instances: {}".format(test_data.rows))
            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)

            start_time = time.time()
            learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(features, labels)
            print("Training set accuracy: {}".format(train_accuracy))

            test_features = Matrix(test_data, 0, 0, test_data.rows,
                                   test_data.cols - 1)
            test_labels = Matrix(test_data, 0, test_data.cols - 1,
                                 test_data.rows, 1)
            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            report_confusion(confusion)

        elif eval_method == "random":

            print("Calculating accuracy on a random hold-out set...")
            train_percent = float(eval_parameter)
            if train_percent < 0 or train_percent > 1:
                raise Exception(
                    "Percentage for random evaluation must be between 0 and 1")
            print("Percentage used for training: {}".format(train_percent))
            print("Percentage used for testing: {}".format(1 - train_percent))

            data.shuffle()

            # The first train_size shuffled rows train; the remainder test.
            train_size = int(train_percent * data.rows)
            train_features = Matrix(data, 0, 0, train_size, data.cols - 1)
            train_labels = Matrix(data, 0, data.cols - 1, train_size, 1)

            test_features = Matrix(data, train_size, 0, data.rows - train_size,
                                   data.cols - 1)
            test_labels = Matrix(data, train_size, data.cols - 1,
                                 data.rows - train_size, 1)

            start_time = time.time()
            learner.train(train_features, train_labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(train_features,
                                                      train_labels)
            print("Training set accuracy: {}".format(train_accuracy))

            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            report_confusion(confusion)

        else:  # eval_method == "cross" (validated above)

            print("Calculating accuracy using cross-validation...")

            folds = int(eval_parameter)
            if folds <= 0:
                raise Exception("Number of folds must be greater than 0")
            print("Number of folds: {}".format(folds))
            reps = 1
            sum_accuracy = 0.0
            elapsed_time = 0.0
            for j in range(reps):
                data.shuffle()
                for i in range(folds):
                    # Rows [begin, end) form the held-out fold.
                    begin = int(i * data.rows / folds)
                    end = int((i + 1) * data.rows / folds)

                    train_features = Matrix(data, 0, 0, begin, data.cols - 1)
                    train_labels = Matrix(data, 0, data.cols - 1, begin, 1)

                    test_features = Matrix(data, begin, 0, end - begin,
                                           data.cols - 1)
                    test_labels = Matrix(data, begin, data.cols - 1,
                                         end - begin, 1)

                    # Presumably appends the rows after the fold to the
                    # training matrices -- verify against Matrix.add.
                    train_features.add(data, end, 0, data.cols - 1)
                    train_labels.add(data, end, data.cols - 1, 1)

                    start_time = time.time()
                    learner.train(train_features, train_labels)
                    elapsed_time += time.time() - start_time

                    accuracy = learner.measure_accuracy(
                        test_features, test_labels)
                    sum_accuracy += accuracy
                    print("Rep={}, Fold={}, Accuracy={}".format(
                        j, i, accuracy))

            elapsed_time /= (reps * folds)
            print(
                "Average time to train (in seconds): {}".format(elapsed_time))
            print("Mean accuracy={}".format(sum_accuracy / (reps * folds)))

        if train:
            return learner.w
Exemple #2
0
    def main(self):
        """Run one evaluation of a learner as described by the command line.

        Reads the parsed arguments from ``self.parser().parse_args()``
        (attributes ``arff``, ``L``, ``E``, ``verbose``, ``normalize`` and
        ``seed``), obtains the learner from ``self.get_learner``, loads the
        ARFF dataset and prints timing and accuracy for the chosen
        evaluation method. The last data column is used as the label column
        and every preceding column as a feature.

        ``E[0]`` selects the method (``"training"``, ``"static"``,
        ``"random"`` or ``"cross"``); ``E[1]``, when present, is its
        parameter: test ARFF file, training fraction in [0, 1], or number of
        folds respectively.

        Raises:
            Exception: If the method needs a parameter and none was given,
                the method is unknown, the training fraction is outside
                [0, 1], or the number of folds is not positive.
        """
        # parse the command-line arguments
        args = self.parser().parse_args()
        file_name = args.arff
        learner_name = args.L
        eval_method = args.E[0]
        eval_parameter = args.E[1] if len(args.E) > 1 else None
        print_confusion_matrix = args.verbose
        normalize = args.normalize

        # Fail fast with a readable message instead of letting float(None) /
        # int(None) raise an opaque TypeError after the data has been loaded.
        if eval_parameter is None and eval_method in ("static", "random",
                                                      "cross"):
            raise Exception(
                "Evaluation method '{}' requires a parameter".format(
                    eval_method))

        random.seed(
            args.seed
        )  # Use a seed for deterministic results, if provided (makes debugging easier)

        # load the model
        learner = self.get_learner(learner_name)

        # load the ARFF file
        data = Matrix()
        data.load_arff(file_name)
        if normalize:
            print("Using normalized data")
            data.normalize()

        # print some stats
        print("\nDataset name: {}\n"
              "Number of instances: {}\n"
              "Number of attributes: {}\n"
              "Learning algorithm: {}\n"
              "Evaluation method: {}\n".format(file_name, data.rows, data.cols,
                                               learner_name, eval_method))

        if eval_method == "training":

            print("Calculating accuracy on training set...")

            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
            confusion = Matrix()
            start_time = time.time()
            learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))
            accuracy = learner.measure_accuracy(features, labels, confusion)
            print("Training set accuracy: " + str(accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "static":

            print("Calculating accuracy on separate test set...")

            test_data = Matrix(arff=eval_parameter)
            if normalize:
                test_data.normalize()

            print("Test set name: {}".format(eval_parameter))
            print("Number of test instances: {}".format(test_data.rows))
            features = Matrix(data, 0, 0, data.rows, data.cols - 1)
            labels = Matrix(data, 0, data.cols - 1, data.rows, 1)

            start_time = time.time()
            learner.train(features, labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(features, labels)
            print("Training set accuracy: {}".format(train_accuracy))

            test_features = Matrix(test_data, 0, 0, test_data.rows,
                                   test_data.cols - 1)
            test_labels = Matrix(test_data, 0, test_data.cols - 1,
                                 test_data.rows, 1)
            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "random":

            print("Calculating accuracy on a random hold-out set...")
            train_percent = float(eval_parameter)
            if train_percent < 0 or train_percent > 1:
                raise Exception(
                    "Percentage for random evaluation must be between 0 and 1")
            print("Percentage used for training: {}".format(train_percent))
            print("Percentage used for testing: {}".format(1 - train_percent))

            data.shuffle()

            # The first train_size shuffled rows train; the remainder test.
            train_size = int(train_percent * data.rows)
            train_features = Matrix(data, 0, 0, train_size, data.cols - 1)
            train_labels = Matrix(data, 0, data.cols - 1, train_size, 1)

            test_features = Matrix(data, train_size, 0, data.rows - train_size,
                                   data.cols - 1)
            test_labels = Matrix(data, train_size, data.cols - 1,
                                 data.rows - train_size, 1)

            start_time = time.time()
            learner.train(train_features, train_labels)
            elapsed_time = time.time() - start_time
            print("Time to train (in seconds): {}".format(elapsed_time))

            train_accuracy = learner.measure_accuracy(train_features,
                                                      train_labels)
            print("Training set accuracy: {}".format(train_accuracy))

            confusion = Matrix()
            test_accuracy = learner.measure_accuracy(test_features,
                                                     test_labels, confusion)
            print("Test set accuracy: {}".format(test_accuracy))

            if print_confusion_matrix:
                print(
                    "\nConfusion matrix: (Row=target value, Col=predicted value)"
                )
                confusion.print()
                print("")

        elif eval_method == "cross":

            print("Calculating accuracy using cross-validation...")

            folds = int(eval_parameter)
            if folds <= 0:
                raise Exception("Number of folds must be greater than 0")
            print("Number of folds: {}".format(folds))
            reps = 1
            sum_accuracy = 0.0
            elapsed_time = 0.0
            for j in range(reps):
                data.shuffle()
                for i in range(folds):
                    # Rows [begin, end) form the held-out fold.
                    begin = int(i * data.rows / folds)
                    end = int((i + 1) * data.rows / folds)

                    train_features = Matrix(data, 0, 0, begin, data.cols - 1)
                    train_labels = Matrix(data, 0, data.cols - 1, begin, 1)

                    test_features = Matrix(data, begin, 0, end - begin,
                                           data.cols - 1)
                    test_labels = Matrix(data, begin, data.cols - 1,
                                         end - begin, 1)

                    # Presumably appends the rows after the fold to the
                    # training matrices -- verify against Matrix.add.
                    train_features.add(data, end, 0, data.cols - 1)
                    train_labels.add(data, end, data.cols - 1, 1)

                    start_time = time.time()
                    learner.train(train_features, train_labels)
                    elapsed_time += time.time() - start_time

                    accuracy = learner.measure_accuracy(
                        test_features, test_labels)
                    sum_accuracy += accuracy
                    print("Rep={}, Fold={}, Accuracy={}".format(
                        j, i, accuracy))

            elapsed_time /= (reps * folds)
            print(
                "Average time to train (in seconds): {}".format(elapsed_time))
            print("Mean accuracy={}".format(sum_accuracy / (reps * folds)))

        else:
            raise Exception(
                "Unrecognized evaluation method '{}'".format(eval_method))