Example #1
def simulations():
    que = {}
    progress_iterator = ProgressIterator(4)
    progress_iterator.next()
    avg_v1, avg_vrand, avg_vmin = test_one()
    que[1] = ("v min :", avg_vmin)

    progress_iterator.next()
    in_error, out_error = experiment(test_two, [100, 1000], 1000)
    que[5] = ("in sample error :", in_error)
    que[6] = ("out sample error :", out_error)

    progress_iterator.next()
    iterations = experiment(test_three, [10], 1000)
    que[7] = ("iterations :", iterations)

    progress_iterator.next()
    results = np.array([test_four(100, 1000) for _ in range(1000)],
                       dtype=object)
    in_error_no_transform = np.mean(results[:, 0])
    weight = np.mean(results[:, 1], axis=0)
    out_error_transform = np.mean(results[:, 2])
    que[8] = ("in sample error -- without higher dimension transformation :",
              in_error_no_transform)
    que[9] = ("higher dimensional weights :", weight)
    que[10] = ("out of sample error -- with higher dimension transformation :",
               out_error_transform)
    return que
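
All of these examples lean on a pair of shared helpers defined elsewhere in the repository. Their behaviour can only be inferred from the call sites here, so the sketch below is an assumption rather than the actual implementation: ProgressIterator appears to be a simple step counter used for console progress, and experiment appears to run a trial function repeatedly and average whatever it returns.

# Minimal sketch, inferred from the call sites only (not the repository's code).
import numpy as np

class ProgressIterator:
    """Assumed behaviour: report progress over a fixed number of steps."""
    def __init__(self, total):
        self.total = total
        self.current = 0

    def next(self):
        self.current += 1
        print("step {}/{}".format(self.current, self.total))

def experiment(trial_fn, args, repeats):
    """Assumed behaviour: run trial_fn(*args) `repeats` times and average
    each returned quantity, so callers can unpack the means directly."""
    results = np.array([trial_fn(*args) for _ in range(repeats)])
    return np.mean(results, axis=0)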
Example #2
def simulations():
    que = {}
    progress_iterator = ProgressIterator(4)
    progress_iterator.next()
    que[1] = ("sample points needed :", datapoints_needed(0.008, 0.1, 8))

    progress_iterator.next()
    gradient = [in_error_derivative_u, in_error_derivative_v]
    value, point, iterations = find_threshold(
            in_error, in_error_gradient, [1, 1], 0.1, mpf(10)**mpf(-14), 0)
    que[5] = ("gradient descent results",
            "\n\tvalue : " + str(value)
            + "\n\tpoint : " + str(point) + " # ans to question 6"
            + "\n\titerations : " + str(iterations) + " # ans to question 5")

    progress_iterator.next()
    gradient = [in_error_derivative_u, in_error_derivative_v]
    value, point, iterations = coordinate_descent_max_iterations(
            in_error, gradient, [1,1], mpf('0.1'), 30)
    que[7] = ( "coordinate gradient descent results", "\n\tvalue : " + str(value) \
                                           + "\n\tpoint : " + str(point) \
                                           + "\n\titerations : " + str(iterations)
                                           )
    def trial_no_weights(*args):
        weight, iterations, out_sample_error = trial(*args)
        return iterations, out_sample_error

    progress_iterator.next()
    iterations, out_sample_error = experiment(trial_no_weights, [100, 1000], 100)
    que[8] = ("out of sample cross entrophy error :", out_sample_error)
    que[9] = ("iterations :", iterations)
    return que
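
find_threshold and coordinate_descent_max_iterations are also defined outside this snippet. Judging only from the call above, find_threshold looks like plain gradient descent on the error surface that stops once the error falls below the given threshold. A hypothetical reading, treating in_error and its gradient as ordinary callables:

# Hypothetical sketch of find_threshold, matching how Example #2 calls it.
def find_threshold(f, gradient, point, learning_rate, threshold, iterations=0):
    value = f(*point)
    while value > threshold:
        grad = gradient(*point)  # full gradient at the current point
        point = [p - learning_rate * g for p, g in zip(point, grad)]
        value = f(*point)
        iterations += 1
    return value, point, iterations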
Example #3
def simulations():
    que = {}
    progress_iterator = ProgressIterator(4)
    progress_iterator.next()
    avg_v1, avg_vrand, avg_vmin = test_one()
    que[1] = ("v min :", avg_vmin)
    
    progress_iterator.next()
    in_error, out_error = experiment(test_two, [100, 1000], 1000)
    que[5] = ("in sample error :", in_error)
    que[6] = ("out sample error :", out_error)

    progress_iterator.next()
    iterations = experiment(test_three, [10], 1000)
    que[7] = ("iterations :", iterations)

    progress_iterator.next()
    results = np.array([ test_four(100, 1000) for _ in range(1000) ], dtype=object)
    in_error_no_transform = np.mean(results[:,0])
    weight = np.mean(results[:,1], axis=0)
    out_error_transform = np.mean(results[:,2])
    que[8] = ("in sample error -- without higher dimension transformation :",
            in_error_no_transform)
    que[9] = ("higher dimensional weights :", weight)
    que[10] = ("out of sample error -- with higher dimension transformation :",
            out_error_transform)
    return que
Example #4
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(file_dir, "in.dta"))
    testing_data = np.genfromtxt(os.path.join(file_dir, "out.dta"))
    progress_iterator = ProgressIterator(4)

    progress_iterator.next()
    in_sample_error, out_of_sample_error = test1(training_data, testing_data)
    que[2] = ("linear regression",
            "\n\tin sample error : " + str(in_sample_error) + \
            "\n\tout of sample error : " + str(out_of_sample_error))

    progress_iterator.next()
    in_sample_error, out_of_sample_error = trial(training_data, testing_data, pow_10(-3))
    que[3] = ("linear regression with weight decay, k=-3",
            "\n\tin sample error : " + str(in_sample_error) + \
            "\n\tout of sample error : " + str(out_of_sample_error))

    progress_iterator.next()
    in_sample_error, out_of_sample_error = trial(training_data, testing_data, pow_10(3))
    que[4] = ("linear regression with weight decay, k=3",
            "\n\tin sample error : " + str(in_sample_error) + \
            "\n\tout of sample error : " + str(out_of_sample_error))

    progress_iterator.next()
    out_of_sample_errors = [ str(trial(training_data, testing_data, pow_10(k))[1])
            for k in range(-2,3) ]
    pretty_table = tabulate( [ [k, out_of_sample_errors[k+2]] for k in range(-2,3) ], 
        headers=['k', "EOUT"])
    que[5] = ("Also includes answer to question 6\n\nlinear regression with weight decay, k=-2..2",
            "\nout of sample errors\n" + pretty_table)
    return que
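
pow_10(k) is presumably just 10**k, and trial(training_data, testing_data, lam) presumably fits linear regression with weight decay on the transformed data and reports the in-sample and out-of-sample classification errors. A sketch of those pieces under that assumption, using the standard regularized least-squares solution w_reg = (Z^T Z + lambda I)^-1 Z^T y:

# Sketch only: the real trial() works on DataML objects and the course transform.
import numpy as np

def pow_10(k):
    return 10.0 ** k

def weight_decay_fit(Z, y, lam):
    # w_reg = (Z^T Z + lambda * I)^-1 Z^T y
    d = Z.shape[1]
    return np.linalg.solve(Z.T @ Z + lam * np.eye(d), Z.T @ y)

def classification_error(Z, y, w):
    return np.mean(np.sign(Z @ w) != y)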
Example #5
def simulations():
    que = {}
    progress_iterator = ProgressIterator(2)

    progress_iterator.next()
    out_error, iterations = experiment(trial, [10, 100], 1000)
    que[7] = ("iterations :", iterations)
    que[8] = ("out of sample error :", out_error)

    progress_iterator.next()
    out_error, iterations = experiment(trial, [100, 100], 1000)
    que[9] = ("iterations :", iterations)
    que[10] = ("out of sample error :", out_error)
    return que
Example #6
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(file_dir, "in.dta"))
    testing_data = np.genfromtxt(os.path.join(file_dir, "out.dta"))
    progress_iterator = ProgressIterator(4)

    progress_iterator.next()
    in_sample_error, out_of_sample_error = test1(training_data, testing_data)
    que[2] = ("linear regression",
            "\n\tin sample error : " + str(in_sample_error) + \
            "\n\tout of sample error : " + str(out_of_sample_error))

    progress_iterator.next()
    in_sample_error, out_of_sample_error = trial(training_data, testing_data,
                                                 pow_10(-3))
    que[3] = ("linear regression with weight decay, k=-3",
            "\n\tin sample error : " + str(in_sample_error) + \
            "\n\tout of sample error : " + str(out_of_sample_error))

    progress_iterator.next()
    in_sample_error, out_of_sample_error = trial(training_data, testing_data,
                                                 pow_10(3))
    que[4] = ("linear regression with weight decay, k=3",
            "\n\tin sample error : " + str(in_sample_error) + \
            "\n\tout of sample error : " + str(out_of_sample_error))

    progress_iterator.next()
    out_of_sample_errors = [
        str(trial(training_data, testing_data, pow_10(k))[1])
        for k in range(-2, 3)
    ]
    pretty_table = tabulate([[k, out_of_sample_errors[k + 2]]
                             for k in range(-2, 3)],
                            headers=['k', "EOUT"])
    que[5] = (
        "Also includes answer to question 6\n\nlinear regression with weight decay, k=-2..2",
        "\nout of sample errors\n" + pretty_table)
    return que
Example #7
def simulations():
    que = {}
    progress_iterator = ProgressIterator(2)

    progress_iterator.next()
    out_error, iterations = experiment(myTrial, [10, 100], 1000)
    que[7] = ("iterations :", iterations)
    que[8] = ("out of sample error :", out_error)

    progress_iterator.next()
    out_error, iterations = experiment(myTrial, [100, 100], 1000)
    que[9] = ("iterations :", iterations)
    que[10] = ("out of sample error :", out_error)
    return que
Example #8
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(file_dir, "features.train"))
    testing_data = np.genfromtxt(os.path.join(file_dir, "features.test"))

    def convert_raw(t_data):
        return DataML((t_data[:, 1:], np.array(t_data[:, 0], dtype="int")))

    initial_training_set = convert_raw(training_data)
    initial_testing_set = convert_raw(testing_data)

    def transform_help(transform, *col_data_sets):
        return [
            DataML((transform(data_set.z), data_set.y))
            for data_set in col_data_sets
        ]

    progress_iterator = ProgressIterator(4)

    progress_iterator.next()
    constant_training_set, constant_testing_set = transform_help(
        add_constant, initial_training_set, initial_testing_set)
    allexcept_constant_train_test_li = [
        allexcept(digit, constant_training_set, constant_testing_set)
        for digit in range(10)
    ]
    no_transform_errors = [
        train_test(*train_test_sets, minimize_error_aug, [1])
        for train_test_sets in allexcept_constant_train_test_li
    ]
    in_sample_error_5_9 = [
        error_list[0] for error_list in no_transform_errors[5:10]
    ]
    min_arg = np.argmin(in_sample_error_5_9) + 5
    min_error = min(in_sample_error_5_9)
    que[7] = ("digit with lowest in sample error : ",
              str(min_arg) + ", " + str(min_error))

    progress_iterator.next()
    second_order_training_set, second_order_testing_set = transform_help(
        second_order_nic, initial_training_set, initial_testing_set)
    allexcept_second_order_train_test_li = [
        allexcept(digit, second_order_training_set, second_order_testing_set)
        for digit in range(10)
    ]
    transform_errors = [
        train_test(*train_test_sets, minimize_error_aug, [1])
        for train_test_sets in allexcept_second_order_train_test_li
    ]
    out_of_sample_error_0_4 = [
        error_list[1] for error_list in transform_errors[:5]
    ]
    min_arg = np.argmin(out_of_sample_error_0_4)
    min_error = min(out_of_sample_error_0_4)
    que[8] = ("digit with lowest out of sample error : ",
              str(min_arg) + ", " + str(min_error))

    tables = [[["no transform"] + no_transform_errors[i],
               ["transform"] + transform_errors[i]] for i in range(10)]

    pretty_tables = [
        tabulate(table, headers=["", "in sample", "out of sample"])
        for table in tables
    ]

    tables_string = "\n".join([
        "\ndigit {}\n".format(i) + str(pretty_tables[i])
        for i in range(len(pretty_tables))
    ])

    que[9] = ("effectiveness of feature transform on 0 and 9", tables_string)

    progress_iterator.next()
    one_v_five_second_order_sets = a_vs_b(1, 5, second_order_training_set,
                                          second_order_testing_set)
    errors_lambda = [
        train_test(*one_v_five_second_order_sets, minimize_error_aug, [alpha])
        for alpha in [0.01, 1]
    ]
    pretty_table = tabulate(
        [["lambda 0.01"] + errors_lambda[0], ["lambda 1"] + errors_lambda[1]],
        headers=["", "in sample", "out of sample"])
    que[10] = (
        "errors from changing lambda for 1 vs 5\n", "\n" + str(pretty_table) +
        "\n\nevidence of overfitting as increased constraint improves performance"
    )
    total_support_vectors, in_sample_error = train_svc(
        transform_help(add_constant, svm_que_helper())[0],
        SVC(kernel="poly", degree=2, C=float("infinity")))
    que[12] = ("total support vectors :", total_support_vectors)
    total_trials = 30

    class SVC_REGULAR:
        def __init__(self, total_trials, k, gammas):
            self.total_hard_margin_svc_failure, \
            self.svc_eout_li, \
            self.reg_ein_li, \
            self.reg_eout_li = trial(total_trials, k, gammas)

    progress_iterator.next()
    k9_g1x5 = SVC_REGULAR(total_trials, 9, 1.5 * np.ones(9))
    que[13] = ("total hard margin svc failure percentage :",
               k9_g1x5.total_hard_margin_svc_failure / total_trials)
    que[14] = ("svc rbf better than regular rbf percentage (k=9):",
               sum(k9_g1x5.svc_eout_li < k9_g1x5.reg_eout_li) /
               len(k9_g1x5.svc_eout_li))
    k12_g1x5 = SVC_REGULAR(total_trials, 12, 1.5 * np.ones(12))
    que[15] = ("svc rbf better than regular rbf percentage (k=12):",
               sum(k12_g1x5.svc_eout_li < k12_g1x5.reg_eout_li) /
               len(k12_g1x5.svc_eout_li))
    k9_better_k12_ein_percentage = sum(
        k9_g1x5.reg_ein_li < k12_g1x5.reg_ein_li) / len(k9_g1x5.reg_ein_li)
    k9_better_k12_eout_percentage = sum(
        k9_g1x5.reg_eout_li < k12_g1x5.reg_eout_li) / len(k9_g1x5.reg_eout_li)
    pretty_table = tabulate(
        [[k9_better_k12_ein_percentage, k9_better_k12_eout_percentage]],
        headers=[
            "k=9 ein < k=12 ein percentage", "k=9 eout < k=12 eout percentage"
        ])
    table = [[
        np.mean(error_li)
        for error_li in [svc_regular.reg_ein_li, svc_regular.reg_eout_li]
    ] for svc_regular in [k9_g1x5, k12_g1x5]]
    pretty_table2 = tabulate(
        [["k=9"] + table[0], ["k=12"] + table[1]],
        headers=["", "in sample error", "out of sampler error"])
    que[16] = ("regular rbf changing k",
            "\n" + str(pretty_table) \
            + "\n" + str(pretty_table2))
    k9_g2 = SVC_REGULAR(total_trials, 9, 2 * np.ones(9))
    g1x5_better_g2_ein_percentage = sum(
        k9_g1x5.reg_ein_li < k9_g2.reg_ein_li) / len(k9_g1x5.reg_ein_li)
    g1x5_better_g2_eout_percentage = sum(
        k9_g1x5.reg_eout_li < k9_g2.reg_eout_li) / len(k9_g1x5.reg_eout_li)
    pretty_table = tabulate(
        [[g1x5_better_g2_ein_percentage, g1x5_better_g2_eout_percentage]],
        headers=[
            "g=1.5 ein < g=2 ein percentage",
            "g=1.5 eout < g=2 eout percentage"
        ])
    table = [[
        np.mean(error_li)
        for error_li in [svc_regular.reg_ein_li, svc_regular.reg_eout_li]
    ] for svc_regular in [k9_g1x5, k9_g2]]
    pretty_table2 = tabulate(
        [["g=1.5"] + table[0], ["g=2"] + table[1]],
        headers=["", "in sample error", "out of sampler error"])
    que[17] = ("regular rbf changing gammas",
            "\n" + str(pretty_table) \
            + "\n" + str(pretty_table2))
    zero_ein = k9_g1x5.reg_ein_li < 1 / (10 * total_trials)
    que[18] = (
        "regular rbf (k=9, gamma=1.5) zero in sample error percentage : ",
        sum(zero_ein) / len(zero_ein))
    return que
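
allexcept and a_vs_b are the data-splitting helpers behind the one-vs-all and one-vs-one digit classifiers above; they operate on DataML objects, so the numpy version below only illustrates the assumed labelling scheme.

# Assumed labelling scheme for the digit classifiers (illustration only).
import numpy as np

def one_vs_all_labels(y, digit):
    # the chosen digit becomes +1, every other digit becomes -1
    return np.where(y == digit, 1, -1)

def one_vs_one_subset(z, y, a, b):
    # keep only digits a and b; label a as +1 and b as -1
    mask = (y == a) | (y == b)
    return z[mask], np.where(y[mask] == a, 1, -1)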
Example #9
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(hw6_dir_path, "in.dta"))
    testing_data = np.genfromtxt(os.path.join(hw6_dir_path, "out.dta"))
    progress_iterator = ProgressIterator(6)

    progress_iterator.next()
    initial_total = 25  # initial points used for training
    initial_model_weights = restricted_training(training_data, initial_total)
    validation_set = DataML(training_data[initial_total:], transform)
    best_k, out_of_sample_errors = best_model(initial_model_weights,
                                              validation_set)
    pretty_table = tabulate([[k, out_of_sample_errors[k - 3]]
                             for k in range(3, 8)],
                            headers=["k", "EOUT"])
    que[1] = ("validation set out of sample errors, last 10 points",
            "\n" \
            + str(pretty_table)
            )

    progress_iterator.next()
    testing_set = DataML(testing_data, transform)
    best_k, out_of_sample_errors = best_model(initial_model_weights,
                                              testing_set)
    pretty_table = tabulate([[k, out_of_sample_errors[k - 3]]
                             for k in range(3, 8)],
                            headers=["k", "EOUT"])
    que[2] = ("test set out of sample errors",
            "\n" \
            + str(pretty_table)
            )

    progress_iterator.next()
    first_error = min(out_of_sample_errors)
    reverse_total = 10
    training_set = DataML(training_data[-reverse_total:], transform)
    reverse_model_weights = gen_models(training_set)
    best_k, out_of_sample_errors = best_model(
        reverse_model_weights, DataML(training_data[:-reverse_total],
                                      transform))
    pretty_table = tabulate([[k, out_of_sample_errors[k - 3]]
                             for k in range(3, 8)],
                            headers=["k", "EOUT"])
    que[3] = ("validation set out of sample errors, first 25 points",
            "\n" \
            + str(pretty_table)
            )

    progress_iterator.next()
    testing_set = DataML(testing_data, transform)
    best_k, out_of_sample_errors = best_model(reverse_model_weights,
                                              testing_set)
    pretty_table = tabulate([[k, out_of_sample_errors[k - 3]]
                             for k in range(3, 8)],
                            headers=["k", "EOUT"])
    que[4] = ("test set out of sample errors",
            "\n" \
            + str(pretty_table)
            )
    second_error = min(out_of_sample_errors)
    que[5] = ("smallest out of sample errors :",
              str(first_error) + ", " + str(second_error))

    progress_iterator.next()
    svm_better, total_support_vectors = experiment(trial, [10, 100], 1000)
    que[8] = ("svm better than pla : ", svm_better)

    progress_iterator.next()
    svm_better, total_support_vectors = experiment(trial, [100, 100], 1000)
    que[9] = ("svm better than pla : ", svm_better)
    que[10] = ("total support vectors : ", total_support_vectors)
    return que
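
best_model above takes one candidate weight vector per model order k = 3..7 and scores each on the supplied set. Which columns of the transformed data each candidate uses is not visible from this snippet, so the sketch below assumes the model of order k uses the first k + 1 features.

# Sketch under the stated assumption; the repository's best_model may differ.
import numpy as np

def best_model(model_weights, data_set, k_range=range(3, 8)):
    errors = [np.mean(np.sign(data_set.z[:, :len(w)] @ w) != data_set.y)
              for w in model_weights]
    best_k = list(k_range)[int(np.argmin(errors))]
    return best_k, errors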
Example #10
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(file_dir, "features.train"))
    testing_data = np.genfromtxt(os.path.join(file_dir, "features.test"))
    def convert_raw(t_data):
        return DataML((t_data[:,1:], np.array(t_data[:,0], dtype="int")))
    initial_training_set = convert_raw(training_data)
    initial_testing_set = convert_raw(testing_data)
    def transform_help(transform, *col_data_sets):
        return [ DataML((transform(data_set.z), data_set.y)) 
                for data_set in col_data_sets ]
    progress_iterator = ProgressIterator(4)

    progress_iterator.next()
    constant_training_set, constant_testing_set = transform_help(
            add_constant, initial_training_set, initial_testing_set)
    allexcept_constant_train_test_li = [
            allexcept(digit, constant_training_set, constant_testing_set)
            for digit in range(10) ]
    no_transform_errors = [ train_test(
        *train_test_sets, 
        minimize_error_aug,
        [1])
        for train_test_sets in allexcept_constant_train_test_li ]
    in_sample_error_5_9 = [ 
            error_list[0] for error_list in no_transform_errors[5:10] ]
    min_arg = np.argmin(in_sample_error_5_9) + 5
    min_error = min(in_sample_error_5_9)
    que[7] = ("digit with lowest in sample error : ", 
            str(min_arg) + ", " + str(min_error))

    progress_iterator.next()
    second_order_training_set, second_order_testing_set = transform_help(
            second_order_nic, initial_training_set, initial_testing_set)
    allexcept_second_order_train_test_li = [
            allexcept(
                digit, 
                second_order_training_set, 
                second_order_testing_set)
            for digit in range(10) ]
    transform_errors = [ train_test(
        *train_test_sets, 
        minimize_error_aug,
        [1])
        for train_test_sets in allexcept_second_order_train_test_li ]
    out_of_sample_error_0_4 = [ 
            error_list[1] for error_list in transform_errors[:5] ]
    min_arg = np.argmin(out_of_sample_error_0_4)
    min_error = min(out_of_sample_error_0_4)
    que[8] = ("digit with lowest out of sample error : ", 
            str(min_arg) + ", " + str(min_error))

    tables = [ [
        ["no transform"] + no_transform_errors[i], 
        ["transform"] + transform_errors[i] ]
        for i in range(10) ]

    pretty_tables = [ tabulate(
        table, 
        headers=["","in sample", "out of sample"])
        for table in tables ] 

    tables_string = "\n".join(
            ["\ndigit {}\n".format(i) + str(pretty_tables[i])
            for i in range(len(pretty_tables)) ]
            )

    que[9] = ("effectiveness of feature transform on 0 and 9",
            tables_string
            )

    progress_iterator.next()
    one_v_five_second_order_sets = a_vs_b(
            1, 5,
           second_order_training_set, 
           second_order_testing_set)
    errors_lambda = [ train_test(
        *one_v_five_second_order_sets,
        minimize_error_aug,
        [alpha]) 
        for alpha in [0.01, 1] ]
    pretty_table = tabulate( 
            [ ["lambda 0.01"] + errors_lambda[0],
              ["lambda 1"] + errors_lambda[1] ],
            headers=["", "in sample", "out of sample"]
            )
    que[10] = ("errors from changing lambda for 1 vs 5\n", "\n" + str(pretty_table) + "\n\nevidence of overfitting as increased constraint improves performance")
    total_support_vectors, in_sample_error = train_svc(
            transform_help(add_constant, svm_que_helper())[0],
            SVC(kernel="poly", degree=2, C=float("infinity")))
    que[12] = ("total support vectors :", total_support_vectors)
    total_trials = 30
    class SVC_REGULAR:
        def __init__(self, total_trials, k, gammas):
            self.total_hard_margin_svc_failure, \
            self.svc_eout_li, \
            self.reg_ein_li, \
            self.reg_eout_li = trial(total_trials, k, gammas)

    progress_iterator.next()
    k9_g1x5 = SVC_REGULAR(total_trials, 9, 1.5 * np.ones(9))
    que[13] = ("total hard margin svc failure percentage :", k9_g1x5.total_hard_margin_svc_failure / total_trials)
    que[14] = ("svc rbf better than regular rbf percentage (k=9):", 
            sum(k9_g1x5.svc_eout_li < k9_g1x5.reg_eout_li) / len(k9_g1x5.svc_eout_li) )
    k12_g1x5 = SVC_REGULAR(total_trials, 12, 1.5 * np.ones(12))
    que[15] = ("svc rbf better than regular rbf percentage (k=12):", 
            sum(k12_g1x5.svc_eout_li < k12_g1x5.reg_eout_li) / len(k12_g1x5.svc_eout_li) )
    k9_better_k12_ein_percentage = sum(k9_g1x5.reg_ein_li < k12_g1x5.reg_ein_li) / len(k9_g1x5.reg_ein_li)
    k9_better_k12_eout_percentage = sum(k9_g1x5.reg_eout_li < k12_g1x5.reg_eout_li) / len(k9_g1x5.reg_eout_li)
    pretty_table = tabulate(
            [[k9_better_k12_ein_percentage, k9_better_k12_eout_percentage]],
            headers=["k=9 ein < k=12 ein percentage", "k=9 eout < k=12 eout percentage"])
    table = [ [ np.mean(error_li) 
        for error_li in [svc_regular.reg_ein_li, svc_regular.reg_eout_li] ]
        for svc_regular in [k9_g1x5, k12_g1x5] ]
    pretty_table2 = tabulate([["k=9"] + table[0], ["k=12"] + table[1]],
            headers=["", "in sample error", "out of sampler error"])
    que[16] = ("regular rbf changing k",
            "\n" + str(pretty_table) \
            + "\n" + str(pretty_table2))
    k9_g2 = SVC_REGULAR(total_trials, 9, 2 * np.ones(9))
    g1x5_better_g2_ein_percentage = sum(k9_g1x5.reg_ein_li < k9_g2.reg_ein_li) / len(k9_g1x5.reg_ein_li)
    g1x5_better_g2_eout_percentage = sum(k9_g1x5.reg_eout_li < k9_g2.reg_eout_li) / len(k9_g1x5.reg_eout_li)
    pretty_table = tabulate(
            [[g1x5_better_g2_ein_percentage, g1x5_better_g2_eout_percentage]], headers=["g=1.5 ein < g=2 ein percentage", "g=1.5 eout < g=2 eout percentage"])
    table = [ [ np.mean(error_li) 
        for error_li in [svc_regular.reg_ein_li, svc_regular.reg_eout_li] ]
        for svc_regular in [k9_g1x5, k9_g2] ]
    pretty_table2 = tabulate([["g=1.5"] + table[0], ["g=2"] + table[1]],
            headers=["", "in sample error", "out of sampler error"])
    que[17] = ("regular rbf changing gammas", 
            "\n" + str(pretty_table) \
            + "\n" + str(pretty_table2))
    zero_ein = k9_g1x5.reg_ein_li < 1 / (10 * total_trials )
    que[18] = ("regular rbf (k=9, gamma=1.5) zero in sample error percentage : ", 
            sum(zero_ein) / len(zero_ein))
    return que
Example #11
def simulations():
    que = {}
    progress_iterator = ProgressIterator(5)
    progress_iterator.next()
    sample_size = ceil(solved_vc_inequality(1 - 0.95, 0.05, 400000))
    que[1] = ("sample size needed :", sample_size)

    def error_bound_format(n):
        original_vc_bound, rademacher_penalty_bound, parrondo_van_den_broek_bound, devroye_bound = error_bound(
            n)
        output = ("Bounds for N=" + str(n),
                  "\noriginal vc : " + str(original_vc_bound) + "\n" +
                  "rademacher penalty : " + str(rademacher_penalty_bound) +
                  "\n" + "parrondo and van den broek : " +
                  str(parrondo_van_den_broek_bound) + "\n" + "devroye : " +
                  str(devroye_bound) + "\n")
        return output

    progress_iterator.next()
    que[2] = error_bound_format(10000)

    progress_iterator.next()
    que[3] = error_bound_format(5)

    progress_iterator.next()
    analysis = bias_variance_out_sample_error(1000)

    def bias_variance_format(analysis):
        names = [
            "constant : a", "\n\nline through origin : ax",
            "\n\nline : ax + b", "\n\nquadratic through origin : ax**2",
            "\n\nquadratic : ax**2 + b"
        ]
        output = ""
        for i in range(len(analysis)):
            if i == 1:
                output += names[i] \
                        + "\nmean parameters : " + str(analysis[i]["mean parameters"]) + " # ans to question 4 this differs from solution given" \
                        + "\nbias : " + str(analysis[i]["bias"]) + " # ans to question 5" \
                        + "\nvariance : " + str(analysis[i]["variance"]) + " # ans to question 6" \
                        + "\nexpected out of sample error : " + str(analysis[i]["expected out of sample error"])
            else:
                output += names[i] \
                        + "\nmean parameters : " + str(analysis[i]["mean parameters"]) \
                        + "\nbias : " + str(analysis[i]["bias"]) \
                        + "\nvariance : " + str(analysis[i]["variance"]) \
                        + "\nexpected out of sample error : " + str(analysis[i]["expected out of sample error"])
        output += "\n\nbest hypothesis is 'line throgh origin' with an expected out of sample error of " + str(
            round(analysis[1]["expected out of sample error"], 3))
        return output

    progress_iterator.next()
    que[4] = (
        "Also includes answers to question 5,6,7\n\nAnalysis of various hypotheses",
        "\n" + str(bias_variance_format(analysis)))
    return que
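
error_bound(n) evaluates four generalization bounds at sample size n. Only the original VC bound is explicit in epsilon; the other three are implicit, which is why the helper returns all four at once. A sketch of the explicit one, assuming the usual course setup of d_vc = 50, delta = 0.05 and the growth-function approximation m_H(N) ~ N**d_vc:

# Sketch of the original VC bound only; d_vc and delta are assumed values.
import numpy as np

def original_vc_bound(n, delta=0.05, d_vc=50):
    log_growth = d_vc * np.log(2 * n)  # log m_H(2N) with m_H(N) ~ N**d_vc
    return np.sqrt(8.0 / n * (np.log(4.0 / delta) + log_growth))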
Example #12
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(hw6_dir_path, "in.dta"))
    testing_data = np.genfromtxt(os.path.join(hw6_dir_path, "out.dta"))
    progress_iterator = ProgressIterator(6)

    progress_iterator.next()
    initial_total = 25 # initial points used for training
    initial_model_weights = restricted_training(training_data, initial_total)
    validation_set = DataML(training_data[initial_total:], transform)
    best_k, out_of_sample_errors = best_model(initial_model_weights, validation_set)
    pretty_table = tabulate( [ [ k, out_of_sample_errors[k-3]] 
        for k in range(3, 8) ],
        headers=["k", "EOUT"])
    que[1] = ("validation set out of sample errors, last 10 points",
            "\n" \
            + str(pretty_table)
            )

    progress_iterator.next()
    testing_set = DataML(testing_data, transform)
    best_k, out_of_sample_errors = best_model(initial_model_weights, testing_set)
    pretty_table = tabulate( [ [ k, out_of_sample_errors[k-3]] 
        for k in range(3, 8) ],
        headers=["k", "EOUT"])
    que[2] = ("test set out of sample errors",
            "\n" \
            + str(pretty_table)
            )

    progress_iterator.next()
    first_error = min(out_of_sample_errors)
    reverse_total = 10 
    training_set = DataML(training_data[-reverse_total:], transform)
    reverse_model_weights = gen_models(training_set)
    best_k, out_of_sample_errors = best_model(reverse_model_weights, DataML(training_data[:-reverse_total], transform))
    pretty_table = tabulate( [ [ k, out_of_sample_errors[k-3]] 
        for k in range(3, 8) ],
        headers=["k", "EOUT"])
    que[3] = ("validation set out of sample errors, first 25 points",
            "\n" \
            + str(pretty_table)
            )

    progress_iterator.next()
    testing_set = DataML(testing_data, transform)
    best_k, out_of_sample_errors = best_model(reverse_model_weights, testing_set)
    pretty_table = tabulate( [ [ k, out_of_sample_errors[k-3]] 
        for k in range(3, 8) ],
        headers=["k", "EOUT"])
    que[4] = ("test set out of sample errors",
            "\n" \
            + str(pretty_table)
            )
    second_error = min(out_of_sample_errors)
    que[5] = ("smallest out of sample errors :", str(first_error) + ", " + str(second_error))

    progress_iterator.next()
    svm_better, total_support_vectors  = experiment(trial, [10, 100], 1000)
    que[8] = ("svm better than pla : ", svm_better)

    progress_iterator.next()
    svm_better, total_support_vectors  = experiment(trial, [100, 100], 1000)
    que[9] = ("svm better than pla : ", svm_better)
    que[10] = ("total support vectors : ", total_support_vectors)
    return que
Example #13
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(file_dir, "features.train"))
    testing_data = np.genfromtxt(os.path.join(file_dir, "features.test"))
    def convert_raw(t_data):
        return DataML((t_data[:,1:], np.array(t_data[:,0], dtype="int")))
    training_set = convert_raw(training_data)
    testing_set = convert_raw(testing_data)
    progress_iterator = ProgressIterator(5)

    progress_iterator.next()
    results_even = [ trial_all_except(training_set, testing_set, digit, 'poly', 0.1, 2)
            for digit in range(0,9,2) ]
    in_sample_error_list_even = [ result[1][0] for result in results_even ]
    que[2] = ("digit with highest in sample error :", (np.argmax(in_sample_error_list_even) * 2 , np.max(in_sample_error_list_even)) )
    results_odd = [ trial_all_except(training_set, testing_set, digit, 'poly', 0.1, 2)
            for digit in range(1,10,2) ]
    in_sample_error_list_odd = [ result[1][0] for result in results_odd ]
    que[3] = ("digit with lowest in sample error :", (np.argmin(in_sample_error_list_odd) * 2 + 1 , np.min(in_sample_error_list_odd)) )
    support_vector_difference = abs(
            sum(results_even[np.argmax(in_sample_error_list_even)][0])
            - sum(results_odd[np.argmin(in_sample_error_list_odd)][0]))
    que[4] = ("support vector difference :", support_vector_difference)

    progress_iterator.next()
    results = [ trial_a_vs_b(training_set, testing_set, 1, 5, 'poly', c, 2) 
            for c in [0.001, 0.01, 0.1, 1] ]
    support_vectors = [ sum(result[0]) for result in results ]
    out_of_sample_errors = [ result[1][1] for result in results ]
    in_sample_errors = [ result[1][0] for result in results ]
    que[5] = ("various stats", 
    "\n\tsupport vectors\n\t" + str(support_vectors)
    + "\n\tout of sample errors\n\t" + str(out_of_sample_errors)
    + "\n\tin sample errors\n\t" + str(in_sample_errors)
    )

    progress_iterator.next()
    results = [ [trial_a_vs_b(training_set, testing_set, 1, 5, 'poly', c, degree) 
            for c in [0.0001, 0.001 ,0.01, 0.1, 1]] for degree in range(2,6) ]
    results_transpose = [ [results[i][j] for i in range(len(results)) ] 
            for j in range(len(results[0])) ]
    c_lowest_ein =  [ result[1][0] for result in results_transpose[0] ]
    support_vectors = [ sum(result[0]) for result in results_transpose[1] ]
    c_third_lowest_ein =  [ result[1][0] for result in results_transpose[0] ]
    c_highest_eou = [ result[1][1] for result in results_transpose[-1] ]
    que[6] = ("various stats", 
    "\n\tin sample errors when c = 0.0001\n\t" + str(c_lowest_ein)
    + "\n\tsupport vectors when c = 0.001\n\t" + str(support_vectors)
    + "\n\tin sample errors when c = 0.01\n\t" + str(c_third_lowest_ein)
    + "\n\tout of sample errors when c = 1\n\t" + str(c_highest_eou)
    )

    progress_iterator.next()
    results = [ best_c(training_set) for _ in range(50) ]
    frequency = np.bincount([ result[0] for result in results ])
    que[7] = ("frequency :", frequency)
    best = np.argmax(frequency)
    average_score_of_best = np.mean([ result[1][best] for result in results ])
    que[8] = ("average_score_of_best :", average_score_of_best)

    progress_iterator.next()
    results = [ trial_a_vs_b(training_set, testing_set, 1, 5, 'rbf', c)
            for c in [0.01, 1, 100, 10**4, 10**6 ] ]
    in_sample_errors = [ result[1][0] for result in results ] 
    que[9] = ("in sample errors :", in_sample_errors)
    out_of_sample_errors = [ result[1][1] for result in results ] 
    que[10] = ("out of sample errors :", out_of_sample_errors)
    return que
Example #14
def simulations():
    que = {}
    training_data = np.genfromtxt(os.path.join(file_dir, "features.train"))
    testing_data = np.genfromtxt(os.path.join(file_dir, "features.test"))

    def convert_raw(t_data):
        return DataML((t_data[:, 1:], np.array(t_data[:, 0], dtype="int")))

    training_set = convert_raw(training_data)
    testing_set = convert_raw(testing_data)
    progress_iterator = ProgressIterator(5)

    progress_iterator.next()
    results_even = [
        trial_all_except(training_set, testing_set, digit, 'poly', 0.1, 2)
        for digit in range(0, 9, 2)
    ]
    in_sample_error_list_even = [result[1][0] for result in results_even]
    que[2] = ("digit with highest in sample error :",
              (np.argmax(in_sample_error_list_even) * 2,
               np.max(in_sample_error_list_even)))
    results_odd = [
        trial_all_except(training_set, testing_set, digit, 'poly', 0.1, 2)
        for digit in range(1, 10, 2)
    ]
    in_sample_error_list_odd = [result[1][0] for result in results_odd]
    que[3] = ("digit with lowest in sample error :",
              (np.argmin(in_sample_error_list_odd) * 2 + 1,
               np.min(in_sample_error_list_odd)))
    support_vector_difference = abs(
        sum(results_even[np.argmax(in_sample_error_list_even)][0]) -
        sum(results_odd[np.argmin(in_sample_error_list_odd)][0]))
    que[4] = ("support vector difference :", support_vector_difference)

    progress_iterator.next()
    results = [
        trial_a_vs_b(training_set, testing_set, 1, 5, 'poly', c, 2)
        for c in [0.001, 0.01, 0.1, 1]
    ]
    support_vectors = [sum(result[0]) for result in results]
    out_of_sample_errors = [result[1][1] for result in results]
    in_sample_errors = [result[1][0] for result in results]
    que[5] = ("various stats",
              "\n\tsupport vectors\n\t" + str(support_vectors) +
              "\n\tout of sample errors\n\t" + str(out_of_sample_errors) +
              "\n\tin sample errors\n\t" + str(in_sample_errors))

    progress_iterator.next()
    results = [[
        trial_a_vs_b(training_set, testing_set, 1, 5, 'poly', c, degree)
        for c in [0.0001, 0.001, 0.01, 0.1, 1]
    ] for degree in range(2, 6)]
    results_transpose = [[results[i][j] for i in range(len(results))]
                         for j in range(len(results[0]))]
    c_lowest_ein = [result[1][0] for result in results_transpose[0]]
    support_vectors = [sum(result[0]) for result in results_transpose[1]]
    c_third_lowest_ein = [result[1][0] for result in results_transpose[0]]
    c_highest_eou = [result[1][1] for result in results_transpose[-1]]
    que[6] = ("various stats", "\n\tin sample errors when c = 0.0001\n\t" +
              str(c_lowest_ein) + "\n\tsupport vectors when c = 0.001\n\t" +
              str(support_vectors) + "\n\tin sample errors when c = 0.01\n\t" +
              str(c_third_lowest_ein) +
              "\n\tout of sample errors when c = 1\n\t" + str(c_highest_eou))

    progress_iterator.next()
    results = [best_c(training_set) for _ in range(50)]
    frequency = np.bincount([result[0] for result in results])
    que[7] = ("frequency :", frequency)
    best = np.argmax(frequency)
    average_score_of_best = np.mean([result[1][best] for result in results])
    que[8] = ("average_score_of_best :", average_score_of_best)

    progress_iterator.next()
    results = [
        trial_a_vs_b(training_set, testing_set, 1, 5, 'rbf', c)
        for c in [0.01, 1, 100, 10**4, 10**6]
    ]
    in_sample_errors = [result[1][0] for result in results]
    que[9] = ("in sample errors :", in_sample_errors)
    out_of_sample_errors = [result[1][1] for result in results]
    que[10] = ("out of sample errors :", out_of_sample_errors)
    return que
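
trial_a_vs_b is unpacked above as result[0] (per-class support-vector counts) and result[1] (the (E_in, E_out) pair). A plausible shape for it, assuming DataML objects expose .z and .y and that the polynomial kernel uses the course's (1 + x.x')**Q form via gamma = 1 and coef0 = 1; the real helper may differ:

# Illustrative sketch only; the kernel parameters are assumptions.
import numpy as np
from sklearn.svm import SVC

def trial_a_vs_b(training_set, testing_set, a, b, kernel, c, degree=3):
    def subset(data_set):
        # keep digits a and b only, labelling a as +1 and b as -1
        mask = (data_set.y == a) | (data_set.y == b)
        return data_set.z[mask], np.where(data_set.y[mask] == a, 1, -1)

    z_train, y_train = subset(training_set)
    z_test, y_test = subset(testing_set)
    clf = SVC(kernel=kernel, C=c, degree=degree, gamma=1.0, coef0=1.0)
    clf.fit(z_train, y_train)
    in_sample_error = np.mean(clf.predict(z_train) != y_train)
    out_of_sample_error = np.mean(clf.predict(z_test) != y_test)
    return clf.n_support_, (in_sample_error, out_of_sample_error)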
Example #15
def simulations():
    que = {}
    progress_iterator = ProgressIterator(5)
    progress_iterator.next()
    sample_size = ceil(solved_vc_inequality(1 - 0.95, 0.05, 400000))
    que[1] = ("sample size needed :", sample_size)
    def error_bound_format(n):
        original_vc_bound, rademacher_penalty_bound, parrondo_van_den_broek_bound, devroye_bound = error_bound(n)
        output = ("Bounds for N=" + str(n),
            "\noriginal vc : " + str(original_vc_bound)
            + "\n" + "rademacher penalty : " + str(rademacher_penalty_bound)
            + "\n" + "parrondo and van den broek : " + str(parrondo_van_den_broek_bound)
            + "\n" + "devroye : "  + str(devroye_bound)
            + "\n"
            )
        return output

    progress_iterator.next()
    que[2] = error_bound_format(10000)

    progress_iterator.next()
    que[3] = error_bound_format(5) 

    progress_iterator.next()
    analysis = bias_variance_out_sample_error(1000)
    def bias_variance_format(analysis):
        names = [ "constant : a",
                "\n\nline through origin : ax",
                "\n\nline : ax + b",
                "\n\nquadratic through origin : ax**2",
                "\n\nquadratic : ax**2 + b"]
        output = ""
        for i in range(len(analysis)):
            if i == 1:
                output += names[i] \
                        + "\nmean parameters : " + str(analysis[i]["mean parameters"]) + " # ans to question 4 this differs from solution given" \
                        + "\nbias : " + str(analysis[i]["bias"]) + " # ans to question 5" \
                        + "\nvariance : " + str(analysis[i]["variance"]) + " # ans to question 6" \
                        + "\nexpected out of sample error : " + str(analysis[i]["expected out of sample error"])
            else:
                output += names[i] \
                        + "\nmean parameters : " + str(analysis[i]["mean parameters"]) \
                        + "\nbias : " + str(analysis[i]["bias"]) \
                        + "\nvariance : " + str(analysis[i]["variance"]) \
                        + "\nexpected out of sample error : " + str(analysis[i]["expected out of sample error"])
        output += "\n\nbest hypothesis is 'line throgh origin' with an expected out of sample error of " + str(round(analysis[1]["expected out of sample error"], 3))
        return output

    progress_iterator.next()
    que[4] = ("Also includes answers to question 5,6,7\n\nAnalysis of various hypotheses", 
            "\n" + str(bias_variance_format(analysis)))
    return que