Exemple #1
0
def run_test(data, n, size, samples):
    """Compute and plot a lower bound per window of ``data``.

    A ``regModel(n, size, samples)`` is fitted once. ``data`` is then cut
    into consecutive windows of ``size`` values; for every window the result
    of ``lzw_compression_ratio`` is passed to the model's ``get_entropy`` and
    the entropy to ``h_inverse`` to obtain a lower bound. The bounds are
    plotted with their mean, saved to ``result/bitcoin_<size>.pdf`` and shown.

    Args:
        data: pandas Series (sliced positionally, read through ``.values``).
        n: number of categories.
        size: window length; also handed to ``regModel``.
        samples: forwarded unchanged to ``regModel``.

    Returns:
        list: one lower bound per window, in window order.
    """
    model = regModel(n, size, samples)
    model.fit(plot=False)

    lbs = []
    # Windows start at index 1 (index 0 is never used) and a window is only
    # taken when it lies entirely inside ``data``.
    for start in np.arange(1, len(data) - size, size):
        window = data[start:start + size].values
        ratio = lzw_compression_ratio(window, n)
        ent = model.get_entropy(ratio, "a multinomial sequence", False)
        lbs.append(h_inverse(ent, n, a=0.001))

    # Computed once and reused for both the horizontal line and its label.
    mean_lb = np.mean(lbs)

    plt.title("P(e) changing over time, len={}".format(size))
    plt.plot(lbs)
    plt.axhline(mean_lb,
                color="red",
                label="mean={}".format(mean_lb.round(3)))
    plt.xlabel("time")
    plt.ylim(0, 0.666)
    plt.ylabel("probability of error")

    plt.legend()
    # NOTE: the "result" directory must already exist; savefig does not
    # create it.
    plt.savefig("result/bitcoin_{}.pdf".format(size), format='pdf')
    plt.show()
    return lbs
Exemple #2
0
def test1():
    """Fit a 5-category ``regModel`` for sequence sizes 2**7 through 2**10.

    Each model is fitted with a size-specific ``filename`` argument
    (``ex2_model_fitting_<size>``) passed to ``fit``.
    """
    n, samples = 5, 100

    for exponent in range(7, 11):
        size = 2**exponent
        model = regModel(n, size, samples)
        model.fit(filename="ex2_model_fitting_{}".format(size))
Exemple #3
0
def lower_bounds(test, k_s=(1, 2, 3, 4, 5), plot=False):
    """Return one lower bound per discretization level in ``k_s``.

    For every ``k`` the sequence ``test.y`` is discretized into ``n = 2**k``
    categories, a ``regModel(n, len(test.y), 100)`` is fitted, and the
    result of ``lzw_compression_ratio`` on the discretized sequence is turned
    into an entropy (``get_entropy``) and then a lower bound (``h_inverse``).

    Args:
        test: object exposing the sequence as attribute ``y``.
        k_s: iterable of exponents; ``2**k`` categories are used for each.
            The default is a tuple so no mutable object is shared between
            calls.
        plot: forwarded to ``get_entropy`` as its third argument.

    Returns:
        list: lower bounds, in the order of ``k_s``.
    """
    samples = 100

    # The sequence and its length do not depend on k.
    seq = test.y
    size = len(seq)

    lbs = []
    for k in k_s:
        n = 2**k

        model = regModel(n, size, samples)
        model.fit(plot=False)

        # discretize returns a pair; only the discretized sequence is needed.
        discretized_seq, _ = discretize(seq, n)
        uncomp_numbers = list(discretized_seq)
        ratio = lzw_compression_ratio(uncomp_numbers, n)
        ent = model.get_entropy(ratio, "a multinomial sequence", plot)
        lbs.append(h_inverse(ent, n, a=0.001))
    return lbs
Exemple #4
0
def test():
    """Plot a histogram of entropy estimation errors for 100 random trials.

    A ``regModel`` for 2 categories and sequences of length 1024 is fitted
    once. Each trial draws ``p`` with ``random_p``, samples a sequence with
    ``multinomial``, and records ``entropy(p)`` minus the model's
    ``get_entropy`` estimate for that sequence's compression ratio.
    """
    n, size, samples = 2, 1024, 100
    trials = 100

    model = regModel(n, size, samples)
    model.fit(plot=False)

    def estimation_error():
        # One trial: true entropy minus the model's estimate.
        probs = random_p(n)
        sequence = multinomial(size, probs)
        ratio = lzw_compression_ratio(sequence, n)
        estimated = model.get_entropy(ratio, "a multinomial sequence", False)
        return entropy(probs) - estimated

    errors = [estimation_error() for _ in range(trials)]

    plt.hist(errors)
    plt.show()
Exemple #5
0
def run_test(test, samples):
    """Print and return lower bounds for 2, 4, 8, 16 and 32 categories.

    For each category count ``n`` the sequence ``get_diff(test.y)`` is
    discretized into ``n`` categories; a ``regModel`` fitted for that ``n``
    and sequence length converts the ``lzw_compression_ratio`` result into an
    entropy, which ``h_inverse`` turns into a lower bound.

    Args:
        test: object exposing the raw sequence as attribute ``y``.
        samples: forwarded unchanged to ``regModel``.

    Returns:
        list: the five lower bounds, from fewest to most categories.
    """
    bounds = []
    for n in (2**k for k in range(1, 6)):
        diffs = get_diff(test.y)
        length = len(diffs)

        model = regModel(n, length, samples)
        model.fit(plot=False)

        # Second element of the pair returned by discretize is not used.
        discretized, _ = discretize(diffs, n)
        symbols = list(discretized)

        compression = lzw_compression_ratio(symbols, n)
        estimate = model.get_entropy(compression, "a multinomial sequence",
                                     False)
        bound = h_inverse(estimate, n, a=0.001)

        bounds.append(bound)
        print("Lower Bound: ", bound)
    return bounds
Exemple #6
0
def _row_lower_bounds(frame, models, size, samples):
    """Return a lower bound for every usable row of ``frame``.

    Args:
        frame: DataFrame whose rows are cast to ``int`` and passed to
            ``get_first_diff``.
        models: dict mapping category count ``n`` to a fitted ``regModel``.
            Models for category counts not yet present are fitted on demand
            and stored here, so the dict may grow.
        size: sequence length used when a new model has to be fitted.
        samples: sample count used when a new model has to be fitted.

    Returns:
        list: lower bounds for the rows that were not skipped, in row order.
    """
    # Cast the whole frame once instead of once per row.
    int_rows = frame.astype(int)

    lbs = []
    for i in range(int_rows.shape[0]):
        uncomp_numbers = get_first_diff(int_rows.iloc[i])
        n = max(uncomp_numbers) + 1
        print(n)

        # Rows containing any negative value are skipped.
        if np.any(np.asarray(uncomp_numbers) < 0):
            continue

        # Always use a model fitted for exactly this n. Previously an n
        # outside 3..7 silently reused the model chosen for the preceding
        # row (or failed with an unbound name on the first row).
        model = models.get(n)
        if model is None:
            model = regModel(n, size, samples)
            model.fit(plot=False)
            models[n] = model

        ratio = lzw_compression_ratio(uncomp_numbers, n)
        ent = model.get_entropy(ratio, "a multinomial sequence", False)
        lbs.append(h_inverse(ent, n, a=0.001))
    return lbs


def year_test(df, fr='15S', yr=213):
    """Compute lower bounds for all home and away rows of one year.

    ``get_matrix(get_year(df, yr=yr), fr=fr)`` yields a home and an away
    frame. For each row, ``get_first_diff`` produces the sequence; its
    category count is ``max + 1``; a ``regModel`` fitted for that count turns
    the ``lzw_compression_ratio`` result into an entropy and ``h_inverse``
    into a lower bound. Rows with negative values are skipped.

    Args:
        df: input passed to ``get_year``.
        fr: forwarded to ``get_matrix`` as ``fr``.
        yr: forwarded to ``get_year`` as ``yr``; also the name of the result
            column. NOTE(review): the default 213 may be a typo for 2013 -
            confirm against ``get_year``.

    Returns:
        pandas.DataFrame: single column named ``yr`` holding the home lower
        bounds followed by the away lower bounds.
    """
    df_q4_home, df_q4_away = get_matrix(get_year(df, yr=yr), fr=fr)

    samples = 100
    # Model sequence length is one less than the number of columns
    # (presumably the length after first differencing - TODO confirm).
    size = df_q4_home.shape[1] - 1

    # Models for 3..7 categories are fitted up front; any other category
    # count met in the data is fitted lazily by the helper.
    models = {n: regModel(n, size, samples) for n in range(3, 8)}
    for model in models.values():
        model.fit(plot=False)

    lbs_home = _row_lower_bounds(df_q4_home, models, size, samples)
    lbs_away = _row_lower_bounds(df_q4_away, models, size, samples)

    lbs = np.append(np.array(lbs_home), np.array(lbs_away))
    return pd.DataFrame(lbs, columns=[yr])
Exemple #7
0
def test2(size):
    """Plot a fitted 2-category model with two example sequences overlaid.

    The figure shows the model's (entropy, ratio) points, the curve obtained
    from ``reg_inv.predict`` over the ratios, and two highlighted test
    points: a multinomial sequence with p = [0.8, 0.2] and a Markov sequence
    with a fixed 2x2 matrix. Each legend entry reports the estimated value
    next to the value from ``entropy`` / ``entropy_rate``. The figure is
    saved to ``result/example_2_<size>.pdf`` and shown.

    Args:
        size: sequence length for the model and both test sequences.
    """
    n = 2
    samples = 100

    model = regModel(n, size, samples)
    model.fit()

    # Model points (default colour) and the regression curve (orange).
    plt.scatter(model.entropy, model.ratio, marker='.')
    ratio_column = np.array(model.ratio).reshape(-1, 1)
    plt.plot(model.reg_inv.predict(ratio_column),
             model.ratio,
             label="regression",
             color="orange")

    # Test sequence 1: multinomial with fixed probabilities.
    probs = [0.8, 0.2]
    multi_seq = multinomial(size, probs)
    multi_ratio = lzw_compression_ratio(multi_seq, n)
    multi_ent = model.get_entropy(multi_ratio, "a multinomial sequence",
                                  False)
    multi_ent_true = entropy(probs)

    # Test sequence 2: Markov with a fixed matrix.
    transition = np.array([[0.7, 0.3], [0.6, 0.4]])
    markov_seq = markov(size, transition)
    markov_ratio = lzw_compression_ratio(markov_seq, n)
    markov_ent = model.get_entropy(markov_ratio, "a Markov process", False)
    markov_ent_true = entropy_rate(transition)

    multi_label = "(multinomial entropy) est={}, true={}".format(
        multi_ent.round(3), np.round(multi_ent_true, 3))
    markov_label = "(Markov entropy) est={}, true={}".format(
        markov_ent.round(3), np.round(markov_ent_true, 3))

    # Each overlay: grey cross-hair plus a red point on top (zorder=10).
    # The Markov point additionally uses the "X" marker.
    overlays = (
        (multi_ent, multi_ratio, multi_label, {}),
        (markov_ent, markov_ratio, markov_label, {"marker": "X"}),
    )
    for est, ratio, label, extra in overlays:
        plt.axvline(est, color="grey", alpha=0.5)
        plt.axhline(ratio, color="grey", alpha=0.5)
        plt.scatter(est, ratio, zorder=10, color="red", label=label, **extra)

    plt.title("Estimated entropy of {} of size {}".format(
        "a sequence", model.size))
    plt.xlabel("entropy")
    plt.ylabel("compression ratio")
    plt.legend(loc="lower right")
    plt.savefig("result/example_2_{}.pdf".format(size), format='pdf')
    plt.show()