def problem_3b_sizes(training_set_sizes, testing_set_size=None):
    results = []
    for training_set_size in training_set_sizes:
        print "Starting", training_set_size

        print 'loading data'
        training_set, training_labels, testing_set, testing_labels = c1.select_data(training_set_size,
                                                                                    testing_set_size)
        testing_set = pickle.load(open('testing_set.p'))
        testing_labels = pickle.load(open('testing_labels.p'))

        print 'preprocessing data'
        training_set = c1.preprocess(training_set)
        testing_set = c1.preprocess(testing_set)

        print 'building classifier'
        classifier = c1.build_classifier(training_set, training_labels)

        print 'predicting'
        predicted = c1.classify(testing_set, classifier)
        error = c1.error_measure(predicted, testing_labels)

        print 'error:', error
        results.append(error)
    print zip(training_set_sizes, results)

    plt.plot(training_set_sizes, results, 'o')
    plt.xscale('log')
    plt.yscale('log')
    plt.title('Naive Bayes Error vs Training set size')
    plt.xlabel('Training set size')
    plt.ylabel('Error')
    plt.show()
def problem_3b_parameters():
    results = []
    alphas = xrange(0, 100, 4)
    binarizes = np.array(xrange(0, 10))/10.0
    for binarize in binarizes:
        print "\nStarting Binarize:", binarize

        print 'loading data'
        training_set = pickle.load(open('training_set.p'))
        training_labels = pickle.load(open('training_labels.p'))
        testing_set = pickle.load(open('testing_set.p'))
        testing_labels = pickle.load(open('testing_labels.p'))

        print 'preprocessing data'
        training_set = c1.preprocess(training_set)
        testing_set = c1.preprocess(testing_set)

        print 'building classifier'
        classifier = c1.build_classifier(training_set, training_labels, binarize=binarize)

        print 'predicting'
        predicted = c1.classify(testing_set, classifier)
        error = c1.error_measure(predicted, testing_labels)

        print 'error:', error
        results.append(error)

    print zip(binarizes, results)
    plt.plot(binarizes, results, 'o')
    plt.title('Naive Bayes Error vs Threshold parameter')
    plt.xlabel('Threshold parameter')
    plt.ylabel('Error')
    plt.show()

    results = []

    for alpha in alphas:
        print "\nStarting Alpha:", alpha

        print 'loading data'
        training_set = pickle.load(open('training_set.p'))
        training_labels = pickle.load(open('training_labels.p'))
        testing_set = pickle.load(open('testing_set.p'))
        testing_labels = pickle.load(open('testing_labels.p'))

        print 'preprocessing data'
        training_set = c1.preprocess(training_set)
        testing_set = c1.preprocess(testing_set)

        print 'building classifier'
        classifier = c1.build_classifier(training_set, training_labels, alpha=alpha)

        print 'predicting'
        predicted = c1.classify(testing_set, classifier)
        error = c1.error_measure(predicted, testing_labels)

        print 'error:', error
        results.append(error)

    print zip(alphas, results)
    plt.plot(alphas, results, 'o')
    plt.title('Naive Bayes Error vs Alpha parameter')
    plt.xlabel('Alpha Value')
    plt.ylabel('Error')
    plt.show()