Example #1
0
def get_test_set_accuracy_with_leaf_thresholds():
    """Plot test-set accuracy against the leaf (stopping) threshold.

    Loads the datasets from TRAIN_DATASET_FILE_PATH and
    TEST_DATASET_FILE_PATH, then for each threshold in [2, 5, 10, 20]
    builds a tree with ``dtree.learn_dtree`` on the training data and
    scores it with ``dtree.test_dtree`` on the test data.  Progress is
    printed to stdout and the resulting curve is saved to
    "graphs/accuracy_vs_m.png".  Nothing is returned.
    """
    import os

    output_path = "graphs/accuracy_vs_m.png"
    # savefig does not create missing directories; make sure the target
    # exists up front so the training work below is not wasted.
    output_dir = os.path.dirname(output_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    train_dataset = dtlearn.get_dataset_from_file(TRAIN_DATASET_FILE_PATH)
    test_dataset = dtlearn.get_dataset_from_file(TEST_DATASET_FILE_PATH)

    leaf_thresholds = [2, 5, 10, 20]
    accuracy_with_leaf_thresholds = []
    for leaf_threshold in leaf_thresholds:
        # print(<single expression>) emits the same text on Python 2 and 3.
        print("Running test for leaf threshold : " + str(leaf_threshold))
        training_dataset_dtree = dtree.learn_dtree(train_dataset, leaf_threshold)
        test_set_accuracy = dtree.test_dtree(training_dataset_dtree, test_dataset)
        # Scaled by 100 for the "(%)" axis; presumably test_dtree returns a
        # fraction in [0, 1] -- confirm against dtree.
        accuracy_with_leaf_thresholds.append(test_set_accuracy * 100)
        # NOTE: the console message reports the unscaled value, not the
        # percentage that is plotted.
        print("Test set accuracy for leaf threshold " + str(leaf_threshold)
              + " is " + "{0:.2f}".format(test_set_accuracy))

    # Plot the graph
    plt.figure()

    plt.plot(leaf_thresholds, accuracy_with_leaf_thresholds,
             label="accuracy vs stopping threshold", marker='D', color='r')

    plt.xlabel("Stopping threshold")
    plt.ylabel("Test set accuracy (%)")
    plt.title("Test set accuracy vs Stopping threshold")

    # Axis limits leave a small margin past the largest threshold (20).
    plt.xlim(0, 22)
    plt.ylim(0, 100)

    plt.grid(True)
    plt.legend(loc="upper left")

    plt.savefig(output_path)
Example #2
0
def get_learning_curve_with_number_of_instances():
    """Plot test-set accuracy against training-set size (learning curve).

    Loads the datasets from TRAIN_DATASET_FILE_PATH and
    TEST_DATASET_FILE_PATH.  For each training-set size in
    {25, 50, 100, 200} it draws the configured number of samples with
    ``get_stratified_random_data_set``, builds a tree on each sample with
    ``dtree.learn_dtree`` (leaf threshold 4) and scores it on the test set
    with ``dtree.test_dtree``.  The average, minimum and maximum accuracy
    per size are printed and plotted, and the figure is saved to
    "graphs/accuracy_vs_size.png".  Nothing is returned.
    """
    import os

    output_path = "graphs/accuracy_vs_size.png"
    # savefig does not create missing directories; make sure the target
    # exists up front so the training work below is not wasted.
    output_dir = os.path.dirname(output_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    train_dataset = dtlearn.get_dataset_from_file(TRAIN_DATASET_FILE_PATH)
    test_dataset = dtlearn.get_dataset_from_file(TEST_DATASET_FILE_PATH)

    leaf_threshold = 4
    # Training-set size -> number of samples drawn at that size.
    # Size 200 is run once; presumably it is the whole training set, so
    # resampling would not add information -- confirm against the data.
    training_set_sizes_repetitions = {25: 10, 50: 10, 100: 10, 200: 1}
    training_data_set_sizes = sorted(training_set_sizes_repetitions)

    test_set_accuracies_with_size = {}
    avg_test_data_accuracy, min_test_data_accuracy, max_test_data_accuracy = [], [], []
    for size in training_data_set_sizes:
        num_repetitions = training_set_sizes_repetitions[size]
        test_set_accuracies = []
        for _ in range(num_repetitions):
            stratified_training_data_set = get_stratified_random_data_set(train_dataset, size)
            training_dataset_dtree = dtree.learn_dtree(stratified_training_data_set, leaf_threshold)
            test_set_accuracy = dtree.test_dtree(training_dataset_dtree, test_dataset)
            # Scaled by 100 to match the "(%)" axis label.
            test_set_accuracies.append(test_set_accuracy * 100)

        test_set_accuracies_with_size[size] = test_set_accuracies
        # float() keeps the mean exact under Python 2 integer division.
        avg_test_data_accuracy.append(sum(test_set_accuracies) / float(len(test_set_accuracies)))
        min_test_data_accuracy.append(min(test_set_accuracies))
        max_test_data_accuracy.append(max(test_set_accuracies))

    # print(<single expression>) emits the same text on Python 2 and 3.
    print("\n\nTraining data set sizes : " + str(training_data_set_sizes))
    print("Average test set accuracy : " + str(avg_test_data_accuracy))
    print("Minimum test set accuracy : " + str(min_test_data_accuracy))
    print("Maximum test set accuracy : " + str(max_test_data_accuracy))
    print("Test Set accuracies vs Training dataset size : " + str(test_set_accuracies_with_size))

    # Plot the graph
    plt.figure()

    plt.plot(training_data_set_sizes, avg_test_data_accuracy, label="avg accuracy vs size", marker='H')
    plt.plot(training_data_set_sizes, min_test_data_accuracy, label="min accuracy vs size", marker='H')
    plt.plot(training_data_set_sizes, max_test_data_accuracy, label="max accuracy vs size", marker='H')

    plt.xlabel("Training set size")
    plt.ylabel("Test set accuracy (%)")
    plt.title("Test accuracy vs Training set size")

    # Axis limits sit just past the largest training-set size (200).
    plt.xlim(0, 201)
    plt.ylim(0, 100)

    plt.grid(True)
    plt.legend(loc="lower right")

    plt.savefig(output_path)