def simple_CV_test(bins, N, i):
    # Split the sample in half, histogram each half, and return the
    # sum of squared differences between the two estimated distributions.
    random.seed(i)
    sampled_data = get_data(N)
    sampled_data_exchange_1 = sampled_data[0:int(N / 2)]
    sampled_data_exchange_1.sort()
    sampled_data_exchange_2 = sampled_data[int(N / 2):N]
    sampled_data_exchange_2.sort()
    pred_distribution_1 = Hist_new(int(N / 2), sampled_data_exchange_1, bins)
    pred_distribution_2 = Hist_new(int(N / 2), sampled_data_exchange_2, bins)
    return sum_of_squire(pred_distribution_1, pred_distribution_2,
                         len(pred_distribution_1))

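# simple_CV_test and simple_CV both rely on a sum_of_squire helper that is not
# shown in this listing. A minimal sketch consistent with how it is called
# (an assumption, not the original implementation):
def sum_of_squire_sketch(a, b, n):
    # Sum of squared differences over the first n entries of a and b.
    return sum((a[i] - b[i]) ** 2 for i in range(n))
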
def requirement1():
    global min_range
    global max_range
    ds = [100, 500, 1000, 10000]
    b = 100
    h = 0.1
    k = 10
    xs = np.linspace(min_range, max_range, 200)

    # Histogram as example
    legends = []
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds:
        data = get_data(d)
        plt.hist(data, density=True, bins=b, alpha=0.4)
        legends.append('#bin = ' + str(b) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.title('Requirement 1-1')
    plt.savefig('req1-1', dpi=300)
    plt.show()

    # KDE as example
    plt.figure()
    legends = []
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds:
        data = get_data(d)
        density = kde(data)
        density.set_bandwidth(h)
        plt.plot(xs, density(xs))
        legends.append('h = ' + str(h) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.title('Requirement 1-2')
    plt.savefig('req1-2', dpi=300)
    plt.show()

    # KNN as example
    plt.figure()
    legends = []
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds:
        data = get_data(d)
        density = knn(data, k)
        plt.plot(xs, density(xs))
        legends.append('k = ' + str(k) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.ylim([0, 0.4])
    plt.title('Requirement 1-3')
    plt.savefig('req1-3', dpi=300)
    plt.show()

def KDE(NUM, h, c, l):
    sampled_data = get_data(NUM)
    minvalue = min(sampled_data) - 3
    maxvalue = max(sampled_data) + 3
    bins = 500  # alternatively: int((maxvalue - minvalue) / h)
    x = np.linspace(minvalue, maxvalue, bins)
    y = np.zeros(x.shape, dtype=float)  # np.float is removed in recent NumPy
    # Sum a Gaussian kernel centred on each sample.
    for i in sampled_data:
        y = y + (1 / NUM) * (1 / (2 * math.pi * h * h) ** 0.5) \
            * math.e ** (-(x - i) ** 2 / (2 * h ** 2))
    plt.plot(x, y, color=c, label=l)

def show_k_influence():
    N = 200
    sampled_data = get_data(N)
    # Ks = [2, 5, 10, 20, 30, 40, 50, 60, 80, 100]
    Ks = [2, 5, 20, 30]
    fig = plt.figure(figsize=(12, 6))
    for i, K in enumerate(Ks):
        plt.subplot(4, 1, i + 1)
        plt.ylim(0, 0.35)
        plt.ylabel('K = {}'.format(K))
        draw_nearest(N, K)
        gm_plot(gm1d, N)
    plt.show()

def KDE(num_sample, h):
    sample_data = get_data(num_sample)
    xlist = np.linspace(min(sample_data), max(sample_data), 2 * num_sample)
    ylist = np.zeros_like(xlist)
    for i, x in enumerate(xlist):
        total = 0  # avoid shadowing the built-in sum()
        for x_compare in sample_data:
            total += Gaussian(x, x_compare, h)
        ylist[i] = total / num_sample
    plt.plot(xlist, ylist)
    plt.xlabel("x")
    plt.ylabel("y")

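# KDE and KFold_cross_validation_KDE call a Gaussian(x, mu, h) helper that is
# not shown in this listing. A minimal sketch consistent with its call sites
# (an assumption, not the original): the Gaussian kernel density at x, centred
# on mu, with bandwidth h.
import math

def Gaussian_sketch(x, mu, h):
    return math.exp(-(x - mu) ** 2 / (2 * h * h)) / math.sqrt(2 * math.pi * h * h)
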
def kde(num_data, h):
    sampled_data = get_data(num_data)
    xs = np.linspace(
        min(sampled_data) - 3 * np.std(sampled_data),
        max(sampled_data) + 3 * np.std(sampled_data),
        2000)
    ys = np.zeros_like(xs)
    for i, x in enumerate(xs):
        for xi in sampled_data:
            ys[i] += exp(-pow(x - xi, 2) / (2 * h * h)) / (sqrt(2 * pi * h * h) * num_data)
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()

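# Sanity check for the Gaussian KDE above (self-contained; synthetic two-mode
# data stands in for get_data, which is assumed external): the estimated curve
# should carry total probability mass close to 1.
import numpy as np

def _kde_mass_check(h=0.3, n=200):
    rng = np.random.default_rng(0)
    samples = np.concatenate([rng.normal(25, 2, n // 2),
                              rng.normal(35, 3, n - n // 2)])
    xs = np.linspace(samples.min() - 3, samples.max() + 3, 2000)
    ys = np.exp(-(xs[None, :] - samples[:, None]) ** 2 / (2 * h * h))
    ys = ys.sum(axis=0) / (np.sqrt(2 * np.pi) * h * n)
    return np.trapz(ys, xs)  # expect a value close to 1.0
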
def kNN_matrix(N=200, K=10):
    assert K > 0
    assert K <= N
    data = get_data(N)
    x = np.linspace(min(data), max(data), 100)
    # Pairwise |x - x_n| distances, shape (N, 100).
    distance = np.abs(x - np.reshape(data, (N, 1)))
    # p(x) = K / (N * 2 * d_K(x)), where d_K is the distance to the K-th neighbour.
    px = K / N * 0.5 / np.sort(distance, axis=0)[K - 1, :]
    plt.plot(x, px, label="kNN_matrix")
    plt.legend()
    plt.title("N = %d, K = %d" % (N, K))
    gm1d.plot()

def KFold_cross_validation_KDE(num_sample):
    sample_data = get_data(num_sample)
    kf = KFold(n_splits=3)
    h_test = 0
    minCV = 10000000
    h_ideal = 0
    for i in range(0, 100):
        h_test += 0.01  # candidate bandwidths 0.01, 0.02, ..., 1.00
        CV = 0
        for train_index, test_index in kf.split(sample_data):
            train = [sample_data[idx] for idx in train_index]
            test = [sample_data[idx] for idx in test_index]
            xlist = np.linspace(min(sample_data), max(sample_data), 200)
            y_train = np.zeros_like(xlist)
            y_test = np.zeros_like(xlist)
            for j, x in enumerate(xlist):
                total = 0
                for x_compare in train:
                    total += Gaussian(x, x_compare, h_test)
                y_train[j] = total / len(train)
            for j, x in enumerate(xlist):
                total = 0
                for x_compare in test:
                    total += Gaussian(x, x_compare, h_test)
                y_test[j] = total / len(test)
            # Mean squared error between the train-fold and test-fold estimates.
            MSE = 0
            for j in range(0, len(xlist)):
                MSE += math.pow(y_train[j] - y_test[j], 2)
            CV += MSE / len(xlist)
        if CV < minCV:
            minCV = CV
            h_ideal = h_test
    print(h_ideal)
    KDE(num_sample, h_ideal)

def kernel_density_estimate(N=100, h=0.35, show=True):
    assert h > 0
    data = get_data(N)
    x = np.linspace(min(data), max(data), 1000)
    # Gaussian-kernel estimate: p(x) = (1/N) * sum_n N(x | x_n, h^2).
    px = np.sum(
        np.exp(np.square(x - np.reshape(data, (N, 1))) / (-2 * h**2)),
        axis=0) / (np.sqrt(2 * np.pi) * h) / N
    plt.plot(x, px, label="kernel density estimate")
    plt.legend()
    plt.title("N = %d, h = %f" % (N, h))
    plot_gm1d()
    if show:
        plt.show()

def task1(bins: int = 50, para_h: float = 0.2, k: int = 20):
    sample_data1 = get_data(100)
    sample_data2 = get_data(500)
    sample_data3 = get_data(1000)
    sample_data4 = get_data(10000)
    plt.subplot(3, 2, 1)
    plt.title("num_data=100")
    show_all(sample_data1, bins, para_h, k)
    plt.subplot(3, 2, 2)
    plt.title("num_data=500")
    show_all(sample_data2, bins, para_h, k)
    plt.subplot(3, 2, 5)
    plt.title("num_data=1000")
    show_all(sample_data3, bins, para_h, k)
    plt.subplot(3, 2, 6)
    plt.title("num_data=10000")
    show_all(sample_data4, bins, para_h, k)
    plt.show()

def nnde(num_data, k):
    sampled_data = get_data(num_data)
    xs = np.linspace(
        min(sampled_data) - 3 * np.std(sampled_data),
        max(sampled_data) + 3 * np.std(sampled_data),
        2000)
    ys = np.zeros_like(xs)
    for i, x in enumerate(xs):
        dist = sorted(abs(x - xi) for xi in sampled_data)
        # After sorting, the distance to the k-th nearest sample is dist[k - 1];
        # the epsilon guards against division by zero when x hits a sample.
        ys[i] = k / (num_data * 2 * (dist[k - 1] + 1e-9))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()

def Kernel(num, N, h):
    np.random.seed(0)
    output_data = []
    h_2 = h ** 2
    para = 1 / (float(N) * mt.sqrt(2 * mt.pi * h_2))
    sampled_data = get_data(N)
    for x in np.linspace(0, 50, num):
        output_data.append(KernelGaussian(x, sampled_data, h_2, para))
    plt.plot(np.linspace(0, 50, num), output_data)
    gm1d = GaussianMixture1D(mode_range=(0, 50))
    gm1d.plot(200)
    plt.show()
    return output_data

def kNN_kdtree(N=200, K=10, show=True):
    assert K > 0
    assert K <= N
    data = get_data(N)
    tree = KDTree(np.reshape(data, (N, 1)))
    x = np.linspace(min(data), max(data), 100).reshape((100, 1))
    # query() returns (distances, indices); take the K-th neighbour's index.
    matrix = tree.query(x, k=K, p=1)
    px = K / N * 0.5 / np.abs(tree.data[matrix[1][:, K - 1]] - x)
    plt.plot(x, px, label="kNN_kdtree")
    plt.legend()
    plt.title("N = %d, K = %d" % (N, K))
    plot_gm1d()
    if show:
        plt.show()

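# A self-contained sketch of the same KD-tree idea (assumes scipy; synthetic
# arrays stand in for get_data and the plotting helpers):
import numpy as np
from scipy.spatial import KDTree

def _kdtree_knn_density(samples, query, K=10):
    tree = KDTree(samples.reshape(-1, 1))
    dist, _ = tree.query(query.reshape(-1, 1), k=K)  # (distances, indices)
    d_K = dist[:, K - 1]                  # distance to the K-th neighbour
    return K / (len(samples) * 2 * d_K)  # p(x) = K / (2 N d_K(x))
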
def kNN(num_data=200, K=10):
    data = sorted(get_data(num_data))
    x = np.linspace(min(data), max(data), 100)
    px = []
    left = 0
    right = K - 1
    # Slide a window of K consecutive sorted samples along the data; advance it
    # while the next candidate on the right is closer to xi than the current
    # leftmost sample (data[right+1] + data[left] < 2*xi).
    for xi in x:
        while right < num_data - 1 and data[right + 1] + data[left] < 2 * xi:
            right = right + 1
            left = left + 1
        px.append(0.5 / max(data[right] - xi, xi - data[left]))
    px = np.array(px) * K / num_data
    plt.plot(x, px, label="K = %d" % (K))
    plt.legend()
    plt.title("N = %d, K = %d" % (num_data, K))
    plt.savefig("img/knn_sample_" + str(num_data) + "_k_" + str(K) + ".png")

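# Why the sliding window in kNN above is valid: on sorted 1-D data the K
# nearest neighbours of any query are K consecutive samples, so d_K(x) is the
# smallest half-width over all K-point windows. A brute-force cross-check
# (a verification sketch, not part of the original code):
import numpy as np

def _knn_window_check(data, K, xi):
    data = np.sort(np.asarray(data))
    d_K_brute = np.sort(np.abs(data - xi))[K - 1]
    widths = [max(data[l + K - 1] - xi, xi - data[l])
              for l in range(len(data) - K + 1)]
    return np.isclose(min(widths), d_K_brute)
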
def show_bin_method():
    N = 200
    sampled_data = get_data(N)
    stdev = np.std(sampled_data)
    # Sturges' Rule:  k = 1 + log2(N)
    # Scott's Rule:   h = 3.49 * sigma * N^(-1/3)   (a bin width, not a count)
    # Rice's Rule:    k = 2 * N^(1/3)
    names = ['Sturges’ Rule', 'Scott’s Rule', 'Rice’s Rule', '', '', '', '']
    bin_num = [
        int(1 + np.ceil(np.log2(N))),
        int(np.ceil((max(sampled_data) - min(sampled_data))
                    / (3.49 * stdev / np.power(N, 1.0 / 3.0)))),
        int(np.ceil(np.power(N, 1.0 / 3.0) * 2)),
        20, 25, 30, 50]
    print(bin_num)
    fig = plt.figure(figsize=(6, 10))
    for i, bins in enumerate(bin_num):
        plt.subplot(3, 3, i + 1)
        plt.title(names[i])
        plt.ylabel(bins)
        draw_hist(N, bins)
    plt.show()

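# Worked example of the three rules for N = 200, matching show_bin_method:
#   Sturges: 1 + ceil(log2(200)) = 1 + 8 = 9 bins
#   Rice:    ceil(2 * 200^(1/3)) = ceil(11.70) = 12 bins
#   Scott:   bin width 3.49 * sigma / 200^(1/3); the resulting bin count
#            depends on the sample range, so it varies from draw to draw.
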
def show_h_influence():
    N = 100
    sampled_data = get_data(N)
    sampled_data = sorted(sampled_data)
    # Average spacing between consecutive sorted samples.
    distance = 0
    for i, sample in enumerate(sampled_data[1:]):
        distance += sample - sampled_data[i]  # sample is sampled_data[i + 1]
    distance /= (N - 1)
    print(distance)
    # Choose sqrt(average_interval) * 2 as the data-driven bandwidth.
    hs = [0.05, 0.1, 0.2, 0.3, 0.5, 1.0, 1.5, np.power(distance, 0.5) * 2]
    fig = plt.figure(figsize=(12, 6))
    for i, h in enumerate(hs):
        plt.subplot(4, 2, i + 1)
        plt.ylabel(h)
        if i == 7:
            plt.xlabel('sqrt(average_interval) * 2')
        draw_kernel(N, h)
    plt.show()

def gauss_kernel(num_data=100, h=None, ptype="varh", num_inter=2000):
    sampled_data = get_data(num_data)
    mini, maxi = min(sampled_data), max(sampled_data)
    interval = maxi - mini
    x_list = np.linspace(mini - interval * 0.05, maxi + interval * 0.05, num_inter)
    if h is None:
        h = find_maxli(sampled_data)
    p_list = []
    for x in x_list:
        p = calc_density(x, sampled_data, h)
        p_list.append(p)
    if ptype == "varh":
        plt.title("gauss h={:.2f}".format(h))
    else:
        plt.title("gauss n={}".format(num_data))
    plt.plot(x_list, p_list)

def compare(min_range=10000, max_range=50001):
    xT = range(min_range, max_range, 2000)
    yT_kernel = []
    yT_IFGT = []
    xs = np.linspace(min_range, max_range, 10000)
    for x in xT:
        sampled_data = get_data(x)
        T = time.time()
        kernel(sampled_data, xs, h=0.1389)
        yT_kernel.append(time.time() - T)
        T = time.time()
        IFGT(sampled_data, xs, h=0.1389, K=100)
        yT_IFGT.append(time.time() - T)
    plt.title('Time Comparison')
    plt.plot(xT, yT_kernel, color='blue')
    plt.plot(xT, yT_IFGT, color='red')
    plt.xlabel("x")
    plt.ylabel("Time")
    plt.show()

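# A small helper compare() could use instead of raw time.time();
# time.perf_counter() is the steadier clock for benchmarks (a suggested
# refinement, not part of the original code):
import time

def _timed(fn, *args, **kwargs):
    t0 = time.perf_counter()
    fn(*args, **kwargs)
    return time.perf_counter() - t0
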
def Kernel_Density_Estimation(num_data, h):
    sampled_data = get_data(num_data)
    min_range = min(sampled_data) - 3
    max_range = max(sampled_data) + 3
    xs = np.linspace(min_range, max_range, 2000)
    ys = np.zeros_like(xs)
    for index, x in enumerate(xs):
        tmp = 0
        for xn in sampled_data:
            tmp += m.exp(-(m.pow(x - xn, 2)) / (2 * m.pow(h, 2))) \
                / (m.sqrt(2 * m.pi * m.pow(h, 2)))
        ys[index] = tmp / num_data
    plt.title("num_data = %d & h = %f" % (num_data, h))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()

def task4():
    sample_data = get_data(200)
    # Alternative: choose k by 2-fold cross-validation.
    # kf = KFold(n_splits=2)
    # mincv = 1000000.0
    # mink = 1
    # for k in range(1, 31):
    #     cv = 0.0
    #     for train, test in kf.split(sample_data):
    #         cv += task4_corss_validation(train, test, k,
    #                                      min(sample_data), max(sample_data))
    #     if cv < mincv:
    #         mincv = cv
    #         mink = k
    # k = mink
    # Rule of thumb: k = sqrt(N).
    k = int(math.sqrt(len(sample_data)))
    # k = 15
    plt.title("k={}".format(k))
    knn_method(sample_data, k)

def histogram_bins_selection():
    num_data = 200
    sample_data = get_data(num_data)

    num_bins = int(square_root_choice(sample_data))
    title = "square_root_choice: bins=" + str(num_bins) + ", num_sd=200"
    histogram_estimation(num_bins=num_bins, sample_data=sample_data,
                         status=True, title=title)

    num_bins = int(sturges_formula(sample_data))
    title = "sturges_formula: bins=" + str(num_bins) + ", num_sd=200"
    histogram_estimation(num_bins=num_bins, sample_data=sample_data,
                         status=True, title=title)

    num_bins = int(rice_rule(sample_data))
    title = "rice_rule: bins=" + str(num_bins) + ", num_sd=200"
    histogram_estimation(num_bins=num_bins, sample_data=sample_data,
                         status=True, title=title)

    num_bins = int(scotts_normal_reference_rule(sample_data))
    title = "scotts_normal_reference_rule: bins=" + str(num_bins) + ", num_sd=200"
    histogram_estimation(num_bins=num_bins, sample_data=sample_data,
                         status=True, title=title)

    num_bins = int(shimazaki_and_shinomoto(sample_data))
    title = "shimazaki_and_shinomoto: bins=" + str(num_bins) + ", num_sd=200"
    plt.cla()
    histogram_estimation(num_bins=num_bins, sample_data=sample_data,
                         status=True, title=title)

def requirement2():
    global min_range
    global max_range
    data = get_data(200)
    bs = [2, 10, 30]
    xs = np.linspace(min_range, max_range, 200)
    legends = []
    plot_true_distribution()
    legends.append('True Distribution')
    # Plot histograms with different bin counts.
    for b in bs:
        plt.hist(data, density=True, bins=b, alpha=0.4)
        legends.append('bins = ' + str(b))
    plt.title('Requirement 2')
    plt.legend(legends)
    plt.show()

def knn(num_sample, K):
    sample_data = get_data(num_sample)
    xlist = np.linspace(min(sample_data), max(sample_data), 2 * num_sample)
    ylist = np.zeros_like(xlist)
    integration = 0
    for i, x in enumerate(xlist):
        dis_list = np.zeros_like(sample_data)
        for j, x_compare in enumerate(sample_data):
            dis_list[j] = abs(x_compare - x)
        dis_list.sort()
        # p(x) = K / (N * 2 * d_K(x)); clamp d_K to avoid division by zero.
        ylist[i] = K / (num_sample * 2 * max(dis_list[K - 1], 0.001))
        # Riemann-sum check of total probability mass. Note the kNN estimate
        # is not a proper density (its tails decay like 1/|x|), so this sum
        # drifts above 1 as the evaluation range grows.
        integration += (max(sample_data) - min(sample_data)) * ylist[i] / (2 * num_sample)
    print(integration)
    plt.plot(xlist, ylist)
    plt.xlabel("x")
    plt.ylabel("y")

def simple_CV(bins, N):
    # Return the result of the CV test.
    test_num = int(N / 2)
    sampled_data = get_data(N)
    sampled_data_exchange = sampled_data[test_num:N]
    sampled_data_exchange.sort()
    size = 20 / float(bins)
    pred_distribution = Hist_new(N - test_num, sampled_data_exchange, bins)
    # Draw test_num samples from the predicted histogram distribution by
    # inverting its cumulative sum (bins cover [20, 40] with width `size`).
    pred = []
    for i in range(0, test_num):
        s = random.random()
        j = 0
        while s > 0 and j < len(pred_distribution):
            s = s - pred_distribution[j]
            j = j + 1
        pred.append(j * size + 20)
    return sum_of_squire(pred, sampled_data[0:len(pred)], test_num)

def KDE_CV(NUM, h):
    sampled_data = get_data(NUM)
    train_data = sampled_data[0:NUM * 8 // 10]
    valid_data = np.asarray(sampled_data[NUM * 8 // 10:NUM])
    train_data_size = NUM * 8 // 10
    valid_data_size = NUM // 5
    # Evaluate the KDE built from the training split at each validation point.
    y = np.zeros(valid_data_size, dtype=float)  # np.float is removed in recent NumPy
    for i in train_data:
        y = y + (1 / train_data_size) * (1 / (2 * math.pi * h * h) ** 0.5) \
            * math.e ** (-(valid_data - i) ** 2 / (2 * h ** 2))
    # Negative log-likelihood of the validation split.
    loss = 0
    for i in range(0, valid_data_size):
        loss = loss - math.log(y[i])
    return loss

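# Hypothetical driver for KDE_CV: sweep candidate bandwidths and keep the one
# with the lowest held-out negative log-likelihood. Note that KDE_CV resamples
# via get_data on every call, so each h is scored on a different draw.
def _select_h(NUM=200, hs=(0.1, 0.2, 0.35, 0.5, 1.0)):
    return min(hs, key=lambda h: KDE_CV(NUM, h))
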
def histogram_exploration():
    num_data = 200
    sample_data = get_data(num_data)
    title = "bins=10, num_sd=200"
    histogram_estimation(num_bins=10, sample_data=sample_data,
                         status=False, title=title)
    title = "bins=25, num_sd=200"
    histogram_estimation(num_bins=25, sample_data=sample_data,
                         status=False, title=title)
    title = "bins=100, num_sd=200"
    histogram_estimation(num_bins=100, sample_data=sample_data,
                         status=False, title=title)

def Nearest_Neighbor_Estimation(num_data, k):
    sampled_data = get_data(num_data)
    min_range = min(sampled_data) - 3
    max_range = max(sampled_data) + 3
    xs = np.linspace(min_range, max_range, 2000)
    ys = np.zeros_like(xs)
    for index, x in enumerate(xs):
        data_list = sorted(abs(x - xn) for xn in sampled_data)
        # After sorting, the distance to the k-th nearest sample is
        # data_list[k - 1]; the epsilon guards against division by zero.
        ys[index] = k / (num_data * 2 * (data_list[k - 1] + 1e-10))
    plt.title("num_data = %d & k = %d" % (num_data, k))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()

def KernelDensity(num_data, h):
    sampled_data = get_data(num_data)
    min_range = min(sampled_data) - 3
    max_range = max(sampled_data) + 3
    xs = np.linspace(min_range, max_range, 2000)
    ys = np.zeros_like(xs)
    for index, x in enumerate(xs):
        tmp = 0
        for xn in sampled_data:
            tmp += np.exp(-(np.power(x - xn, 2)) / (2 * np.power(h, 2))) \
                / (np.sqrt(2 * np.pi * np.power(h, 2)))
        ys[index] = tmp / num_data
    plt.title("num_data = %d & h = %f" % (num_data, h))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.savefig("img/kernel_sample_" + str(num_data) + "_h_" + str(h) + ".png")
    plt.close()

def M_KDE(h):
    sample_data = get_data()
    N = 100
    h_2 = h ** 2
    para = 1 / (float(N) * mt.sqrt(2 * mt.pi * h_2))
    para_i = 1 / (float(N - 1) * mt.sqrt(2 * mt.pi * h_2))

    def KDE_new(x):
        return (KernelGaussian(x, sample_data, h_2, para)) ** 2

    # First term: integral of the squared estimate over the support [20, 40].
    the_first = scipy.integrate.quad(KDE_new, 20, 40)
    # Second term: leave-one-out estimate evaluated at each sample.
    the_second = 0
    for flag, x in enumerate(sample_data):
        tp_sample_data = sample_data[:]
        del tp_sample_data[flag]
        the_second = the_second + KernelGaussian(x, tp_sample_data, h_2, para_i)
    the_second = the_second * 2 / N
    M0 = the_first[0] - the_second
    return M0

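# The score returned by M_KDE is the least-squares cross-validation objective
#   M(h) = integral p_hat(x)^2 dx - (2/N) * sum_n p_hat_{-n}(x_n),
# where p_hat_{-n} is the estimate built without sample n; its minimiser
# approximates the bandwidth minimising integrated squared error.
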
def M_k_NN(K):
    N = 500
    sample_data = get_data(N)
    sample_data.sort()

    def NNM_new(x):
        return (KNN_Pro(x, sample_data, N - 1, K)) ** 2

    # First term: integral of the squared estimate over the support [20, 40].
    the_first = scipy.integrate.quad(NNM_new, 20, 40)
    # Second term: leave-one-out estimate evaluated at each sample.
    the_second = 0
    for flag, x in enumerate(sample_data):
        tp_sample_data = sample_data[:]
        del tp_sample_data[flag]
        the_second = the_second + KNN_Pro(x, tp_sample_data, N - 1, K)
    the_second = the_second * 2 / N
    M0 = the_first[0] - the_second
    return M0

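# Hypothetical driver for the two scores above: smaller is better for both,
# so h and K can be chosen by direct minimisation (each call resamples data
# through get_data, so the scores are noisy across runs).
def _select_params():
    best_h = min([0.1, 0.2, 0.35, 0.5, 1.0], key=M_KDE)
    best_K = min(range(5, 51, 5), key=M_k_NN)
    return best_h, best_K
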