def run_test(data, n, size, samples):
    """Estimate and plot one ``h_inverse`` value per window of ``data``.

    A ``regModel(n, size, samples)`` is fitted once.  ``data`` is then cut
    into consecutive windows of ``size`` values; for each window the LZW
    compression ratio is converted to an entropy estimate by the model and
    that estimate is passed through ``h_inverse``.  The resulting values are
    plotted (y-axis labelled "probability of error") together with their
    mean, written to ``result/bitcoin_<size>.pdf`` and shown.

    NOTE(review): this source also contains ``run_test(test, samples)``; if
    both definitions live in the same module the later one replaces this
    one -- confirm which is intended.

    Args:
        data: pandas Series (per the original comment); it is sliced
            positionally and ``.values`` is read from each slice.
        n: number of categories, forwarded to ``regModel``,
            ``lzw_compression_ratio`` and ``h_inverse``.
        size: window length, also used as the model's sequence size.
        samples: forwarded to ``regModel``.

    Returns:
        list: one ``h_inverse`` result per window, in window order.
    """
    import os

    model = regModel(n, size, samples)
    model.fit(plot=False)

    lbs = []
    # Windows start at position 1 (not 0); stopping before
    # ``len(data) - size`` guarantees that every window is full.
    for start in np.arange(1, len(data) - size, size):
        window = data[start:start + size].values
        ratio = lzw_compression_ratio(window, n)
        ent = model.get_entropy(ratio, "a multinomial sequence", False)
        lbs.append(h_inverse(ent, n, a=0.001))

    # Computed once and reused for both the line and its legend label.
    mean_lb = np.mean(lbs)

    plt.title("P(e) changing over time, len={}".format(size))
    plt.plot(lbs)
    plt.axhline(mean_lb, color="red",
                label="mean={}".format(mean_lb.round(3)))
    plt.xlabel("time")
    plt.ylim(0, 0.666)
    plt.ylabel("probability of error")
    plt.legend()

    # savefig does not create missing directories, so make sure the output
    # folder exists before writing the figure.
    os.makedirs("result", exist_ok=True)
    plt.savefig("result/bitcoin_{}.pdf".format(size), format='pdf')
    plt.show()
    return lbs
def test1():
    """Fit a 5-category ``regModel`` for sequence sizes 2**7 through 2**10.

    Each model is built with 100 samples and fitted with
    ``filename="ex2_model_fitting_<size>"``.
    """
    n_categories = 5
    n_samples = 100

    for exponent in range(7, 11):
        size = 2 ** exponent
        model = regModel(n_categories, size, n_samples)
        model.fit(filename="ex2_model_fitting_{}".format(size))
def lower_bounds(test, k_s=(1, 2, 3, 4, 5), plot=False):
    """Compute one ``h_inverse`` value of ``test.y`` per alphabet size ``2**k``.

    For every ``k`` in ``k_s`` the sequence ``test.y`` is discretized into
    ``n = 2**k`` categories, a ``regModel(n, len(test.y), 100)`` is fitted,
    the LZW compression ratio of the discretized sequence is mapped to an
    entropy estimate, and that estimate is passed to ``h_inverse``.

    Args:
        test: object exposing the sequence to analyse as ``test.y``.
        k_s: iterable of exponents; each ``k`` gives ``n = 2**k`` categories.
            The default is an immutable tuple so it cannot be shared and
            mutated across calls.
        plot: forwarded as the third argument of ``get_entropy``.

    Returns:
        list: the ``h_inverse`` results, in the order of ``k_s``.
    """
    samples = 100

    # The sequence and its length do not depend on k, so read them once.
    seq = test.y
    size = len(seq)

    lbs = []
    for k in k_s:
        n = 2 ** k

        model = regModel(n, size, samples)
        model.fit(plot=False)

        # discretize() returns (discretized sequence, categories); only the
        # discretized sequence is needed here.
        discretized_seq, _ = discretize(seq, n)
        uncomp_numbers = list(discretized_seq)

        ratio = lzw_compression_ratio(uncomp_numbers, n)
        ent = model.get_entropy(ratio, "a multinomial sequence", plot)
        lbs.append(h_inverse(ent, n, a=0.001))
    return lbs
def test():
    """Histogram the entropy-estimation error on random multinomial sequences.

    A ``regModel(2, 1024, 100)`` is fitted once.  Then, 100 times, a
    probability vector is drawn with ``random_p``, a sequence of length 1024
    is sampled from it, and the difference between ``entropy(p)`` and the
    model's estimate is recorded.  The differences are shown as a histogram.
    """
    n_categories = 2
    n_samples = 100
    seq_len = 1024
    n_trials = 100

    model = regModel(n_categories, seq_len, n_samples)
    model.fit(plot=False)

    def estimation_error():
        # One trial: entropy(p) minus the model's estimate for a fresh sample.
        p = random_p(n_categories)
        sequence = multinomial(seq_len, p)
        ratio = lzw_compression_ratio(sequence, n_categories)
        estimated = model.get_entropy(ratio, "a multinomial sequence", False)
        actual = entropy(p)
        return actual - estimated

    errors = [estimation_error() for _ in range(n_trials)]

    plt.hist(errors)
    plt.show()
def run_test(test, samples):
    """Print and return one ``h_inverse`` value per alphabet size 2, 4, ..., 32.

    For each ``n = 2**k`` (``k`` = 1..5) the sequence ``get_diff(test.y)`` is
    discretized into ``n`` categories, a ``regModel`` of matching size is
    fitted, and the entropy estimated from the LZW compression ratio is
    passed to ``h_inverse``.

    Args:
        test: object exposing the raw sequence as ``test.y``.
        samples: forwarded to ``regModel``.

    Returns:
        list: the five ``h_inverse`` results, smallest alphabet first.
    """
    bounds = []
    for exponent in range(1, 6):
        n_categories = 2 ** exponent

        series = get_diff(test.y)
        model = regModel(n_categories, len(series), samples)
        model.fit(plot=False)

        # Discretize, then hand the symbols to the compressor as a plain list.
        symbols, _categories = discretize(series, n_categories)
        ratio = lzw_compression_ratio(list(symbols), n_categories)

        estimate = model.get_entropy(ratio, "a multinomial sequence", False)
        bound = h_inverse(estimate, n_categories, a=0.001)
        bounds.append(bound)
        print("Lower Bound: ", bound)
    return bounds
def year_test(df, fr='15S', yr=213):
    """Compute ``h_inverse`` values for every home and away row of one year.

    ``get_matrix(get_year(df, yr=yr), fr=fr)`` yields a home and an away
    frame.  Each row is cast to int, turned into a sequence with
    ``get_first_diff`` and scored by the ``regModel`` whose category count
    equals ``max(sequence) + 1``.  Rows whose sequence contains a negative
    value are skipped.

    Models for 3 to 7 categories are fitted up front.  Any other category
    count is fitted on demand, so a row never falls back to a model built
    for a different alphabet size and never hits an unbound model name.

    Args:
        df: input frame, forwarded to ``get_year``.
        fr: forwarded to ``get_matrix`` as ``fr``.
        yr: forwarded to ``get_year`` as ``yr`` and used as the name of the
            single result column.
            NOTE(review): the default 213 may be a typo -- confirm.

    Returns:
        pandas.DataFrame: one column named ``yr`` holding the home results
        followed by the away results.
    """
    df_q4_home, df_q4_away = get_matrix(get_year(df, yr=yr), fr=fr)
    samples = 100
    size = df_q4_home.shape[1] - 1

    # Build all default models first, then fit them, keyed by category count.
    models = {n: regModel(n, size, samples) for n in range(3, 8)}
    for n in range(3, 8):
        models[n].fit(plot=False)

    lbs_home = _year_test_row_bounds(df_q4_home, models, size, samples)
    lbs_away = _year_test_row_bounds(df_q4_away, models, size, samples)

    lbs = np.append(np.array(lbs_home), np.array(lbs_away))
    return pd.DataFrame(lbs, columns=[yr])


def _year_test_row_bounds(matrix, models, size, samples):
    """Return the ``h_inverse`` value of every usable row of ``matrix``.

    ``models`` maps a category count to a fitted ``regModel``; it is extended
    in place when a row needs a count that has not been fitted yet.
    """
    # Cast once instead of re-casting the whole frame for every row.
    rows = matrix.astype(int)

    bounds = []
    for i in range(rows.shape[0]):
        uncomp_numbers = get_first_diff(rows.iloc[i])
        n = max(uncomp_numbers) + 1
        print(n)

        # Rows containing a negative value are skipped entirely.
        if np.any(np.asarray(uncomp_numbers) < 0):
            continue

        model = models.get(n)
        if model is None:
            # Category count outside the pre-fitted range: fit and cache it.
            model = regModel(n, size, samples)
            model.fit(plot=False)
            models[n] = model

        ratio = lzw_compression_ratio(uncomp_numbers, n)
        ent = model.get_entropy(ratio, "a multinomial sequence", False)
        bounds.append(h_inverse(ent, n, a=0.001))
    return bounds
def test2(size):
    """Plot a fitted binary ``regModel`` with two example sequences marked.

    A ``regModel(2, size, 100)`` is fitted and its ``entropy``/``ratio``
    samples are scattered together with the prediction of ``reg_inv``.  Two
    sequences of length ``size`` are then generated -- a multinomial one
    with ``p = [0.8, 0.2]`` and a Markov one with a fixed 2x2 matrix -- and
    their estimated entropies are marked on the plot next to the values
    given by ``entropy`` / ``entropy_rate``.  The figure is written to
    ``result/example_2_<size>.pdf`` and shown.

    Args:
        size: sequence length used for the model and both test sequences
            (the original comment suggests 1024).
    """
    import os

    n = 2
    samples = 100

    model = regModel(n, size, samples)
    model.fit()

    # Fitted samples as dots.
    plt.scatter(model.entropy, model.ratio, marker='.')
    # Regression prediction evaluated at the sampled ratios.
    predicted = model.reg_inv.predict(np.array(model.ratio).reshape(-1, 1))
    plt.plot(predicted, model.ratio, label="regression", color="orange")

    # Test sequence 1: multinomial with fixed probabilities
    # (random_p(n) is the randomised alternative).
    p = [0.8, 0.2]
    uncomp_numbers = multinomial(size, p)
    multi_ratio = lzw_compression_ratio(uncomp_numbers, n)
    multi_ent = model.get_entropy(multi_ratio, "a multinomial sequence",
                                  False)
    multi_ent_true = entropy(p)

    # Test sequence 2: Markov chain with a fixed matrix
    # (random_P(n) is the randomised alternative).
    P = np.array([[0.7, 0.3], [0.6, 0.4]])
    uncomp_numbers = markov(size, P)
    markov_ratio = lzw_compression_ratio(uncomp_numbers, n)
    markov_ent = model.get_entropy(markov_ratio, "a Markov process", False)
    markov_ent_true = entropy_rate(P)

    # Multinomial marker with grey cross-hair lines.
    plt.axvline(multi_ent, color="grey", alpha=0.5)
    plt.axhline(multi_ratio, color="grey", alpha=0.5)
    plt.scatter(multi_ent, multi_ratio, zorder=10, color="red",
                label="(multinomial entropy) est={}, true={}".format(
                    multi_ent.round(3), np.round(multi_ent_true, 3)))

    # Markov marker with grey cross-hair lines.
    plt.axvline(markov_ent, color="grey", alpha=0.5)
    plt.axhline(markov_ratio, color="grey", alpha=0.5)
    plt.scatter(markov_ent, markov_ratio, zorder=10, marker="X", color="red",
                label="(Markov entropy) est={}, true={}".format(
                    markov_ent.round(3), np.round(markov_ent_true, 3)))

    plt.title("Estimated entropy of {} of size {}".format(
        "a sequence", model.size))
    plt.xlabel("entropy")
    plt.ylabel("compression ratio")
    plt.legend(loc="lower right")

    # savefig does not create missing directories, so make sure the output
    # folder exists before writing the figure.
    os.makedirs("result", exist_ok=True)
    plt.savefig("result/example_2_{}.pdf".format(size), format='pdf')
    plt.show()