def test_bm3d(data1, data2, domain):
    """Measure how well BM3D denoising recovers noisy marginals.

    For each (marginal, noise scale) case, prints the TVD of the raw noisy
    marginal, the BM3D-denoised marginal, and a BM3D run on the public
    marginal concatenated with the noisy one as extra context.
    """
    cases = [
        [[5, 7, 8], 1000],
        [[2, 3], 800],
        [[6, 7, 9], 1800],
        [[4, 5, 7], 1600],
        [[0, 2], 630],
        [[2, 3, 8], 920],
        [[0, 2, 3], 430],
        [[2, 4, 7], 1200],
        [[3, 5], 1500],
    ]
    for marginal, noise in cases:
        print(marginal)
        public_hist = tools.get_marginal(data1, domain, marginal)
        true_hist = tools.get_marginal(data2, domain, marginal)
        noisy = true_hist + np.random.normal(scale=noise, size=true_hist.shape)
        print(' query TVD: {:.4f}'.format(tools.get_TVD(true_hist, noisy)))
        # denoise the noisy marginal on its own
        denoised = tools.bm3d_denoise(marginal, noisy, noise, 'normal')
        print(' bm3d TVD: {:.4f}'.format(tools.get_TVD(true_hist, denoised)))
        # denoise with the public marginal appended column-wise, then keep
        # only the columns that belong to the noisy part
        joint = np.concatenate([public_hist, noisy], axis=1)
        joint_denoised = tools.bm3d_denoise(marginal, joint, noise, 'normal')
        joint_denoised = joint_denoised[:, true_hist.shape[1]:]
        print(' concatenated bm3d TVD: {:.4f}'.format(
            tools.get_TVD(true_hist, joint_denoised)))
def test_data_TVD(data1, data2, domain, test_num=10, k=4, attr_num=10):
    """Estimate the average total variation distance (TVD) between two
    datasets over randomly sampled k-way marginals.

    Args:
        data1, data2: record arrays; data2's histogram is rescaled to
            data1's record count before comparison.
        domain: project Domain object consumed by tools.get_marginal.
        test_num: number of random marginals to sample.
        k: arity of each sampled marginal.
        attr_num: number of attributes to draw marginals from.
    """
    data_num1 = len(data1)
    data_num2 = len(data2)
    print(f"data num1: {data_num1}, data num2: {data_num2}")
    random_marginal = list(itertools.combinations(list(range(attr_num)), k))
    random.shuffle(random_marginal)
    random_marginal = random_marginal[:test_num]
    average_TVD = 0
    for marginal in random_marginal:
        hist1 = tools.get_marginal(data1, domain, marginal)
        hist2 = tools.get_marginal(data2, domain, marginal)
        # rescale so both histograms describe the same number of records
        hist2 = hist2 * data_num1 / data_num2
        TVD = np.sum(np.abs(hist1 - hist2)) / 2 / data_num1
        print(" {} TVD: {:.4f}".format(marginal, TVD))
        average_TVD += TVD
    # BUG FIX: divide by the number of marginals actually evaluated.
    # C(attr_num, k) can be smaller than test_num, in which case the old
    # division by test_num silently underestimated the average. (The old
    # manual `i` counter / break was redundant: the list is pre-sliced.)
    evaluated = len(random_marginal)
    if evaluated > 0:
        print("average TVD: {:.4f}".format(average_TVD / evaluated))
def plot_miles(df):
    """Plot the log10 histogram of trip miles (last column), capped at 400.

    Writes the plot to ./evaluate/trip_miles.pdf.
    """
    data = df.to_numpy()
    data = data[:, -1].reshape((-1, 1))  # trip miles is the last column
    # data[data==0] = 1
    data[data > 400] = 400  # cap extreme trips so the histogram stays readable
    # data = (np.log10(data) * 10).astype(int)
    # BUG FIX: use ndarray.max() instead of builtin max(). Builtin max()
    # iterates over the rows of a 2-D array and int() of a size-1 array is
    # deprecated in modern NumPy. (Also removed the unused `columns` local.)
    temp = int(data.max())
    print('max mile', temp)
    temp_dict = {0: temp + 1}
    domain = Domain(temp_dict, [
        0,
    ])
    hist = tools.get_marginal(data, domain, (0, ))
    hist[hist == 0] = 1  # avoid log10(0)
    hist = np.log10(hist)
    ptools.plot_list(hist,
                     './evaluate/trip_miles.pdf',
                     size=(20.0, 2.5),
                     zero_line=True)
def plot_marginal2(data, domain):
    """Plot the log10 one-way marginal of each selected attribute to
    ./evaluate/{attr}.pdf, printing the raw and logged counts.
    """
    for attr in (2, 5, 7, 8, 9):
        counts = tools.get_marginal(data, domain, (attr, ))
        counts[counts == 0] = 1  # avoid log10(0)
        print(attr)
        print(counts)
        counts = np.log10(counts)
        print(counts)
        ptools.plot_list(counts, f'./evaluate/{attr}.pdf')
def check_marginal(data0, data1, data2, domain):
    """Print sparsity/probability diagnostics for a few marginals and
    compute the per-cell TVD of (0, 2, 3) across the three datasets.
    """
    sparse1 = tools.get_marginal(data1, domain, (0, 2))
    sparse2 = tools.get_marginal(data2, domain, (0, 2))
    print(np.sum(sparse1 < 10))
    print(np.sum(sparse2 < 10))
    # normalized distribution over attribute 3
    prob = tools.get_marginal(data2, domain, (3, ))
    prob = prob / np.sum(prob)
    print(np.where(prob > 0.06))
    feature_pos = [30, 36, 37, 42, 50, 51, 52]
    print(prob[feature_pos])
    for marginal in [
            # (0, 1, 2),
            (0, 2, 3),
    ]:
        hist0 = tools.get_marginal(data0, domain, marginal)  # 2019
        hist1 = tools.get_marginal(data1, domain, marginal)  # 2018
        hist2 = tools.get_marginal(data2, domain, marginal)  # dp
        print(hist1.shape, hist2.shape)
        head_attrs = [0, 2]
        cell_tvd = tools.get_cell_TVD(domain, marginal, hist0, hist1, hist2,
                                      prob, head_attrs)
def plot_low_p_cell_data(data, domain):
    """Split records by whether their attr-2 cell has a low count, then
    overlay the attr-1 distributions of both splits on one plot
    (./temp/company.pdf).
    """
    cell_hist = tools.get_marginal(data, domain, (2, ))
    low_p_cells = np.where(cell_hist < 30000)[0]
    print(low_p_cells)
    is_low = np.isin(data[:, 2], low_p_cells)
    low_data = data[is_low]
    high_data = data[~is_low]
    print('low cell: ', low_data.shape)
    print('high cell: ', high_data.shape)
    # (disabled) mutual-information / correlation plots:
    # atools.cal_MI(low_data, domain, path='./temp/low_p_edge.json', max_length=2)
    # ptools.plot_correlation(low_data, domain, './temp/low_p_edge.json', path='./temp/low_p_correlations.png')
    # atools.cal_MI(high_data, domain, path='./temp/high_p_edge.json', max_length=2)
    # ptools.plot_correlation(high_data, domain, './temp/high_p_edge.json', path='./temp/high_p_correlations.png')
    dist_all = tools.get_marginal(data, domain, (1, ))
    dist_low = tools.get_marginal(low_data, domain, (1, ))
    dist_high = tools.get_marginal(high_data, domain, (1, ))
    dist_all = dist_all / np.sum(dist_all)
    dist_low = dist_low / np.sum(dist_low)
    dist_high = dist_high / np.sum(dist_high) + 0.02  # offset so curves separate
    plt.rcParams['figure.figsize'] = (11.0, 2.5)
    plt.rcParams['savefig.dpi'] = 200
    plt.locator_params(nbins=10)
    fig = plt.figure()
    plt.plot(dist_all, color='black')
    plt.plot(dist_low, color='green')
    plt.plot(dist_high, color='yellow')
    plt.plot([0] * len(dist_all), 'r')
    plt.savefig('./temp/company.pdf', bbox_inches='tight')
def check_marginal_distribution(path1, path2, attr_list):
    """Compare one-way marginals of two data files attribute by attribute,
    printing absolute and relative discrepancies.

    NOTE(review): relies on module-level `domain_path` and `anchor_attr`.
    Only the anchor attribute of the second file is populated; every other
    column is zeroed before comparison -- confirm this is intended.
    """
    data1, domain, columns = tools.read_data(path1, domain_path)
    raw_data2, domain, columns = tools.read_data(path2, domain_path,
                                                 dtype=None)
    data2 = np.zeros(shape=raw_data2.shape)
    data2[:, anchor_attr] = raw_data2[:, anchor_attr]
    for attr in attr_list:
        marginal1 = tools.get_marginal(data1, domain, (attr, ))
        marginal2 = tools.get_marginal(data2, domain, (attr, ))
        print(f'attr {attr}')
        print(marginal1[:20])
        print(marginal2[:20])
        diff = marginal1 - marginal2
        print(diff[:20])
        # relative difference; the +1 guards against empty cells
        rel = (marginal1 - marginal2) / (marginal1 + 1)
        print(rel[:20])
        abs_rel = np.abs(rel)
        print(f'{len(rel)}: {np.sum(abs_rel>0.01)} {np.sum(abs_rel>0.05)} {np.sum(abs_rel>0.10)}\n')
def compare_data_marginal(data1, data2, domain):
    """Print the TVD (and, where applicable, the head-conditioned per-cell
    TVD) between the two datasets for a fixed list of marginals.
    """
    marginal_list = [(0, 2), (0, 1, 2), (0, 2, 3), (2, 3), (0, 2, 3, 5),
                     (0, 2, 3, 6), (0, 2, 3, 7), (0, 2, 3, 8)]
    for marginal in marginal_list:
        print(marginal)
        hist1 = tools.get_marginal(data1, domain, marginal)
        hist2 = tools.get_marginal(data2, domain, marginal)
        print(" tvd: {:.4f}".format(tools.get_TVD(hist1, hist2)))
        # choose the head attributes for the cell-conditioned TVD, if any
        head_attrs = None
        if 0 in marginal:
            head_attrs = [0, 2]
        elif 3 in marginal:
            head_attrs = [2, 3]
        if head_attrs is not None:
            cell_tvd = tools.get_cell_TVD(domain, marginal, hist1, hist2,
                                          head_attrs)
            print(" cell tvd: {:.4f}".format(cell_tvd))
def truncate_and_plot(data_path, domain_path, prefix):
    """Truncate heavy-tailed attributes, then plot every attribute listed
    in the module-level `common_attrs`.

    NOTE(review): a second function with this exact name is defined later
    in the file and shadows this one at import time.
    """
    data, domain, headings = tools.read_data(data_path, domain_path)
    caps = {
        'fare': 100,
        'tips': 20,
        'trip_miles': 40,
        'trip_total': 100,
        'trip_seconds': 10000
    }
    for name, cap in caps.items():
        data, domain = atools.truncate_data(data, domain,
                                            headings.index(name), cap)
    for name in common_attrs:
        attr_id = headings.index(name)
        ptools.plot_attr(attr_id, data, domain,
                         path=f'./info/{prefix}_{name}_{attr_id}.pdf')
    print(tools.get_marginal(data, domain,
                             (headings.index('payment_type'), )))
def truncate_and_plot(data_path, domain_path, prefix):
    """Truncate heavy-tailed attributes, then plot every attribute named in
    the schema of ./data/parameters.json.

    NOTE(review): this redefinition shadows the earlier function of the
    same name; only this version is visible after the module is imported.
    """
    data, domain, headings = tools.read_data(data_path, domain_path)
    caps = {
        'fare': 100,
        'tips': 20,
        'trip_miles': 40,
        'trip_total': 100,
        'trip_seconds': 10000
    }
    for name, cap in caps.items():
        data, domain = atools.truncate_data(data, domain,
                                            headings.index(name), cap)
    with open('./data/parameters.json', 'r') as fp:
        parameters_json = json.load(fp)
    for name in list(parameters_json['schema'].keys()):
        attr_id = headings.index(name)
        ptools.plot_attr(attr_id, data, domain,
                         path=f'./info/{prefix}_{name}_{attr_id}.pdf')
    print(tools.get_marginal(data, domain,
                             (headings.index('payment_type'), )))
def train_MRF(root, weights, public_trip_data, public_taxi_id, trip_data,
              trip_domain, taxi_id, epsilon, budget, downsample_data_dict):
    """Train two MarkovRandomField models (one over high-count attr-2 cells,
    one over low-count cells), synthesize data from both, and return the
    combined records with a taxi-id column prepended.

    NOTE(review): relies on module-level names not visible in this block:
    `base_config`, `submit`, `MarkovRandomField`, `NestedGraphicalModel`,
    `assign_taxi_id`, `tools`, `nx`, `np`, `pd`, `json`, `sys`.

    Args:
        root: path prefix for all temp/model artifacts.
        weights: forwarded to MarkovRandomField.
        public_trip_data / public_taxi_id: public reference dataset.
        trip_data / trip_domain / taxi_id: private dataset being modeled.
        epsilon: privacy budget; bucketed into `model_epsilon` to pick
            which cached marginal lists / graphs to load.
        budget: stored as config['beta0'].
        downsample_data_dict: attached to the first model before init().

    Returns:
        int32 array; column 0 holds assigned taxi ids, columns 1.. hold the
        synthetic trip attributes.
    """
    print('training MRF')
    print(trip_domain)
    config = base_config.copy()
    config['epsilon'] = epsilon
    config['beta0'] = budget
    if not submit:
        print(sys.argv[1])
    # bucket epsilon into the model_epsilon used in cached file names
    if epsilon < 0.5:
        model_epsilon = 0.1
    elif epsilon < 4.0:
        model_epsilon = 1.0
    else:
        model_epsilon = 10.0
    print(public_trip_data.shape, trip_data.shape)
    # atools.cal_MI(trip_data, trip_domain, path='./temp/trip_correlations.json', max_length=2)
    # ptools.plot_correlation(trip_data, trip_domain, './temp/trip_correlations.json', path='./temp/trip_correlations.png')
    marginal_list = []  # dead assignment -- immediately overwritten below
    marginal_list = json.load(
        open(root + 'temp/trip' + str(model_epsilon) +
             '_marginal_list_save.json'))
    # marginal_list = json.load(open(root+'temp/trip'+str(epsilon)+'_marginal_list_save_miles.json'))
    # ensure every attribute is covered by at least one marginal
    in_attr_set = set()
    for marginal in marginal_list:
        in_attr_set |= set(marginal)
    for attr in set(range(trip_data.shape[1])) - in_attr_set:
        marginal_list.append([
            attr,
        ])
    bins_map = {}
    re_bins_map = {}
    hierarchy_marginals = []
    mask_marginal = {
        # (2, 3): 5,
        # (0, 2, 3): 0,
        # (5, 7): 10,
    }
    marginal_mask = {}
    bm3d_marginal = []
    gpu = True
    noisy_taxi_num = int(2.8e5)
    test_marginal_list = [(2, 3), (0, 1, 2), (0, 2, 3)]
    if config['load_graph']:
        # graph = nx.node_link_graph(json.load(open(root+'temp/'+config['exp_name']+'_'+str(epsilon)+'_miles_graph.json', 'r')))
        graph = nx.node_link_graph(
            json.load(
                open(
                    root + 'temp/' + config['exp_name'] + '_' +
                    str(model_epsilon) + '_graph.json', 'r')))
    else:
        init_model = NestedGraphicalModel(trip_data, trip_domain, config,
                                          config['data_name'])
        graph, entropy = init_model.construct_model()
        json.dump(
            nx.node_link_data(graph),
            open(
                root + 'temp/' + config['exp_name'] + '_' + str(epsilon) +
                '_graph.json', 'w'))
        # json.dump(nx.node_link_data(graph), open(root+'temp/'+config['exp_name']+'_'+str(epsilon)+'_miles_graph.json', 'w'))
    # build boolean masks for each entry of mask_marginal (currently empty)
    marginal_mask = {}
    for marginal in mask_marginal:
        threshold = mask_marginal[marginal]
        public_hist = tools.get_marginal(public_trip_data, trip_domain,
                                         marginal)
        mask = public_hist <= threshold
        marginal_mask[marginal] = mask
        print("generate mask {}, threshold: {:.2f}, mask ratio: {:.4f}".format(
            marginal, threshold,
            np.sum(mask) / mask.size))
    # split the 78 attr-2 cells into low/high count groups by public counts
    hist = tools.get_marginal(public_trip_data, trip_domain, (2, ))
    low_p_cells = np.where(hist < 30000)[0]
    print('low_p', len(low_p_cells), low_p_cells)
    high_p_cells = np.array(
        list(set(list(range(78))) - set(list(low_p_cells))))
    print('high_p', len(high_p_cells), high_p_cells)
    # vectorized maps from compacted cell index back to the original cell id
    low_p_map = {i: low_p_cells[i] for i in range(len(low_p_cells))}
    low_f = np.vectorize(lambda x: low_p_map.get(x, -1))
    high_p_map = {i: high_p_cells[i] for i in range(len(high_p_cells))}
    high_f = np.vectorize(lambda x: high_p_map.get(x, -1))
    # ---- model 1: trained with p_cells=high_p_cells ----
    config['marginal_coefficent'] = 1
    # NOTE(review): 'low_p' here while model 1 receives p_cells=high_p_cells
    # (and vice versa below) -- confirm the intended semantics of
    # config['divide_data'].
    config['divide_data'] = 'low_p'
    config['estimation_iter_num'] = 5000
    config['final_iter_num'] = 8000
    if epsilon < 5.0:
        config['total_marginal_num'] = 13
        config['ed_step_num'] = 2
    else:
        config['total_marginal_num'] = 14
        config['ed_step_num'] = 2
    config['theta1'] = 2.0
    config['theta2'] = 2.0
    model = MarkovRandomField(public_trip_data, trip_data, trip_domain, graph, marginal_list, \
        config, bins_map, re_bins_map, hierarchy_marginals, \
        mask_marginal, marginal_mask, bm3d_marginal, taxi_id, gpu=gpu, \
        noisy_taxi_num=noisy_taxi_num, weights=weights, p_cells=high_p_cells)
    # restrict masks to the high-probability cells along the attr-2 axis
    p_marginal_mask = {}
    for marginal in mask_marginal:
        if 2 in marginal:
            idx = list(marginal).index(2)
            print(idx, marginal, marginal_mask[marginal].shape)
            p_marginal_mask[marginal] = np.take(marginal_mask[marginal],
                                                high_p_cells,
                                                axis=idx)
        else:
            p_marginal_mask[marginal] = marginal_mask[marginal]
    model.marginal_mask = p_marginal_mask
    model.public_marginal_list = [
        # (2, 3, 5),
        # (0, 2, 3, 5),
    ]
    model.downsample_data_dict = downsample_data_dict
    model.init()
    if not config['load_high_p_model']:
        model.entropy_descent([1] * config['ed_step_num'])
        json.dump(
            model.measure_list,
            open(
                root + 'temp/' + config['exp_name'] +
                str(config['epsilon']) + '_marginal_list.json', 'w'))
        model.save_parameters(root + 'temp/model1' +
                              str(config['exp_name']) + '_high.mdl')
    else:
        model.load_parameters(root + 'temp/model1' +
                              str(config['exp_name']) + '_high.mdl')
    model.test_TVD(test_marginal_list)
    # data = model.synthetic_data(root+'temp/'+str(config['exp_name'])+'.csv', total=model.noisy_trip_num)
    # ---- model 2: trained with p_cells=low_p_cells ----
    config['marginal_coefficent'] = 1
    config['divide_data'] = 'high_p'
    config['estimation_iter_num'] = 2000
    config['final_iter_num'] = 4000
    if epsilon < 5.0:
        config['total_marginal_num'] = 13
        config['ed_step_num'] = 1
    else:
        config['total_marginal_num'] = 14
        config['ed_step_num'] = 1
    config['theta1'] = 1.0
    config['theta2'] = 1.0
    # marginal_list = marginal_list[:10] # orginal
    # marginal_list = marginal_list[:13] # trip miles
    marginal_list = json.load(
        open(root + 'temp/trip' + str(model_epsilon) +
             '_marginal_list_save_low_p.json'))
    in_attr_set = set()
    for marginal in marginal_list:
        in_attr_set |= set(marginal)
    for attr in set(range(trip_data.shape[1])) - in_attr_set:
        marginal_list.append([
            attr,
        ])
    model2 = MarkovRandomField(public_trip_data, trip_data, trip_domain, graph, marginal_list, \
        config, bins_map, re_bins_map, hierarchy_marginals, \
        mask_marginal, marginal_mask, bm3d_marginal, taxi_id, gpu=gpu, \
        noisy_taxi_num=noisy_taxi_num, weights=weights, p_cells=low_p_cells)
    model2.downsample_data_dict = model.downsample_data_dict
    p_marginal_mask = {}
    for marginal in mask_marginal:
        if 2 in marginal:
            idx = list(marginal).index(2)
            p_marginal_mask[marginal] = np.take(marginal_mask[marginal],
                                                low_p_cells,
                                                axis=idx)
        else:
            p_marginal_mask[marginal] = marginal_mask[marginal]
    model2.marginal_mask = p_marginal_mask
    if epsilon > 5.0:
        model2.public_marginal_list = [
            # (2, 3, 5),
            (0, 2, 3, 5),
            (2, 3, 7),
        ]
    else:
        model2.public_marginal_list = [
            # (2, 3, 5),
            (0, 2, 3, 5),
            (2, 3, 7),
            (5, 7, 8)
        ]
    model2.init()
    if not config['load_low_p_model']:
        model2.entropy_descent([1] * config['ed_step_num'])
        json.dump(
            model2.measure_list,
            open(
                root + 'temp/' + config['exp_name'] +
                str(config['epsilon']) + '_marginal_list.json', 'w'))
        model2.save_parameters(root + 'temp/model2' +
                               str(config['exp_name']) + '_low.mdl')
    else:
        model2.load_parameters(root + 'temp/model2' +
                               str(config['exp_name']) + '_low.mdl')
    model2.test_TVD(test_marginal_list)
    # synthesize from both models and map compacted attr-2 ids back to the
    # original cell ids
    high_p_data = model.synthetic_data(total=int(config['data_num_ratio'] *
                                                 model.noisy_trip_num))
    low_p_data = model2.synthetic_data(total=int(config['data_num_ratio'] *
                                                 model2.noisy_trip_num))
    low_p_data[:, 2] = low_f(low_p_data[:, 2])
    high_p_data[:, 2] = high_f(high_p_data[:, 2])
    temp_marginal = tools.get_marginal(high_p_data, trip_domain, (2, ))
    # ptools.plot_list(temp_marginal, f'./evaluate/(2,)_p_syn.pdf')
    if not submit:
        config['temp_save_data_name'] = sys.argv[1]
    print(
        'write csv',
        root + config['temp_save_data_name'] + '_' + str(epsilon) + '.csv')
    data = np.concatenate([low_p_data, high_p_data], axis=0)
    df = pd.DataFrame(data, columns=list(range(10)))
    df.to_csv(root + config['temp_save_data_name'] + '_' + str(epsilon) +
              '.csv',
              index=False)
    # prepend a taxi-id column assigned from the public data
    dp_trip_data = np.zeros(shape=(data.shape[0], data.shape[1] + 1),
                            dtype=np.int32)
    dp_trip_data[:, 1:] = data
    taxi_id = assign_taxi_id(data, public_trip_data, public_taxi_id)
    dp_trip_data[:, 0] = taxi_id
    return dp_trip_data
def evaluate_public():
    """Evaluate a synthetic ('dp') dataset against the 2014 data, overall
    and separately for low/high-count attr-2 cell groups.

    Side effect: redirects sys.stdout to a log file and never restores it.
    """
    sys.stdout = open('./evaluate/evaluate_log_2014_gt.txt', 'w')
    df_2018 = pd.read_csv('./preprocess/2014.csv')
    df_public = pd.read_csv('./preprocess/ground_truth.csv')
    # drop id/time columns so the remaining columns align with the domain
    df_2018 = df_2018.drop(
        columns=['taxi_id', 'trip_day_of_week', 'trip_hour_of_day'])
    df_public = df_public.drop(
        columns=['taxi_id', 'trip_day_of_week', 'trip_hour_of_day'])
    data_2018 = df_2018.to_numpy()
    data_public = df_public.to_numpy()
    domain_dict = json.load(open('./preprocess/domain.json', 'r'))
    # re-key the domain by column position instead of column name
    domain_dict = {
        i: domain_dict[df_2018.columns[i]]
        for i in range(len(df_2018.columns))
    }
    domain = Domain(domain_dict, list(range(len(domain_dict))))
    print(str(domain))
    # split the 78 attr-2 cells by their counts in the public data
    hist = tools.get_marginal(data_public, domain, (2, ))
    low_p_cells = np.where(hist < 30000)[0]
    print('low p:', low_p_cells)
    high_p_cells = np.array(
        list(set(list(range(78))) - set(list(low_p_cells))))
    print('high p:', high_p_cells)
    df_dp = pd.read_csv('./save_2_10.0.csv')
    data = df_dp.to_numpy()
    data_dp = np.zeros(shape=(data.shape[0], data.shape[1] + 1), dtype=int)
    data_dp[:, 1:] = data
    for i in range(len(data_dp)):
        data_dp[i, 0] = int(1e6 + i / 60)
    # NOTE(review): this overwrite discards the id column built just above,
    # making the zeros array and the loop dead work -- confirm intended.
    data_dp = df_dp.to_numpy()
    print('evaluate overall')
    evaluate(data_2018, data_dp, domain)
    for p_cells in [low_p_cells, high_p_cells]:
        if p_cells is low_p_cells:
            print('evaluate low p')
        else:
            print('evaluate high p')
        # keep only records whose attr-2 cell belongs to this group, then
        # compact the cell ids to 0..len(p_cells)-1
        mask_2018 = np.isin(data_2018[:, 2], p_cells)
        p_2018 = data_2018[mask_2018]
        mask_dp = np.isin(data_dp[:, 2], p_cells)
        p_dp = data_dp[mask_dp]
        p_map = {p_cells[i]: i for i in range(len(p_cells))}
        f = np.vectorize(lambda x: p_map.get(x, -1))
        p_2018[:, 2] = f(p_2018[:, 2])
        p_dp[:, 2] = f(p_dp[:, 2])
        # every filtered record must map to a compacted id
        assert (not (p_2018[:, 2] == -1).any())
        assert (not (p_dp[:, 2] == -1).any())
        print(np.max(p_2018[:, 2]))
        print(np.max(p_dp[:, 2]))
        p_domain_dict = domain.dict.copy()
        p_domain_dict[2] = len(p_cells) + 1
        p_domain = Domain(p_domain_dict, domain.attr_list)
        evaluate(p_2018, p_dp, p_domain)
def test_marginal(data1, data2, domain):
    """Compare per-shift marginals against their shift-group average.

    For each marginal, extends it with attribute 0 (shift) in front, sums
    the histogram over each listed shift group, and prints the per-cell
    TVD of every individual shift against the group total.

    NOTE(review): the trailing `break` statements stop after the first
    dataset and the first marginal -- they look like debugging leftovers.
    """
    # (disabled) direct two-dataset sweeps:
    # for marginal in [(2, 3, 5), (2, 3, 7)]: cell TVD with head attrs [2]
    # for marginal in [(0, 2, 3), (0, 2, 3, 5), (0, 2, 3, 7)]: head [0, 2]
    for marginal in [(2, 3, 5), (2, 3, 7), (2, 3, 8)]:
        print(marginal)
        # for data in [data1, data2]:
        for data in [
                data1,
        ]:
            # std_hist = tools.get_marginal(data, domain, marginal).astype(int)
            # prepend the shift attribute so axis 0 indexes shifts
            temp_marginal = [
                0,
            ]
            temp_marginal.extend(marginal)
            hist = tools.get_marginal(data, domain, temp_marginal)
            # earlier grouping was [0,3,...,18], [4,...,16], [5,...,17]
            for shifts in [
                    [0, 3, 6, 9, 12, 15, 18],  # all nights
                    [5, 8, 11, 14, 17],  # weekday moring
                    [4, 7, 10, 13, 16],  # weekday afternoon
                    [1, 2, 19, 20]
            ]:
                tvd_array = np.full(shape=len(shifts),
                                    fill_value=-1,
                                    dtype=float)
                # reference histogram: the sum over the whole shift group
                std_hist = np.sum(hist[shifts], axis=0)
                for i in range(len(shifts)):
                    _, cell_tvd = tools.get_cell_TVD(
                        domain, marginal, std_hist, hist[shifts[i]], [
                            2,
                        ])
                    tvd_array[i] = cell_tvd
                    # print(cell_tvd)
                print(np.mean(tvd_array), tvd_array)
            break
        break
def get_mask(data, domain, marginal, threshold=0):
    """Return a 0/1 mask over the marginal's cells.

    A cell is 1 when its count exceeds `threshold`, 0 otherwise; the mask
    keeps the dtype of the histogram returned by tools.get_marginal.
    """
    hist = tools.get_marginal(data, domain, marginal)
    keep = hist > threshold
    mask = hist.copy()
    mask[~keep] = 0
    mask[keep] = 1
    return mask
def plot_marginal(data1, data2, domain):
    """Plot the cumulative log10 distribution of cell counts for the
    selected marginal of data1, then plot the raw attr-1 marginals.

    Outputs ./{marginal}_counts_dist.pdf and ./(1,)_dist_dp.pdf.
    """
    for marginal in [
            # (0, 2),
            # (0, 2, 3),
            (2, 5, 7, 8, 9)
            # (2,),
            # (0,)
    ]:
        hist1 = tools.get_marginal(data1, domain, marginal)
        # distribution of distinct cell-count values
        unique, cnt = np.unique(hist1, return_counts=True)
        print(marginal)
        print(cnt)
        cnt = np.log10(cnt)
        print(cnt)
        # unique = np.log10(unique)
        # in-place running sum turns cnt into a cumulative curve
        cdf = 0
        for i in range(len(cnt)):
            cdf += cnt[i]
            cnt[i] = cdf
        plt.rcParams['figure.figsize'] = (11.0, 2.5)
        plt.rcParams['savefig.dpi'] = 200
        plt.locator_params(nbins=10)
        fig = plt.figure()
        plt.plot(unique, cnt, 'o', color='blue')
        # (disabled) same curve for data2:
        # hist2 = tools.get_marginal(data2, domain, marginal)
        # unique, cnt = np.unique(hist2, return_counts=True)
        # unique = np.log10(unique)
        # cdf = 0
        # for i in range(len(cnt)):
        #     cdf += cnt[i]
        #     cnt[i] = cdf
        # plt.plot(unique, cnt, 'o', color='green')
        plt.savefig(f'./{marginal}_counts_dist.pdf', bbox_inches='tight')
        # (disabled) per-cell TVD plot:
        # tvd_array, cell_tvd = tools.get_cell_TVD(domain, marginal, hist1, hist2, [0, 2], path=f'{marginal}_cell_tvd.pdf')
        # tvd_array = tvd_array.reshape((21, 78))
        # tvd_array = np.sum(tvd_array, axis=0)
        # tvd_array /= 21
        # p_hist = tools.get_marginal(data2, domain, (2,))
        # print(np.sum(p_hist > 50000))
        # mask = p_hist > 50000
        # print(np.sum(p_hist[mask]))
        # p_hist /= np.max(p_hist)
        # plt.rcParams['figure.figsize'] = (11.0, 2.5)
        # plt.rcParams['savefig.dpi'] = 200
        # plt.locator_params(nbins=10)
        # fig = plt.figure()
        # plt.plot(tvd_array)
        # plt.plot(p_hist)
        # plt.savefig('./p_score.pdf', bbox_inches='tight')
        # plt.plot(unique, [0]*len(unique), 'r')
        # plt.savefig(f'./{marginal}_counts_dist.pdf', bbox_inches='tight')
    # plot the attr-1 marginal of data1 (data2's curve is disabled)
    plt.rcParams['figure.figsize'] = (11.0, 2.5)
    plt.rcParams['savefig.dpi'] = 200
    plt.locator_params(nbins=10)
    fig = plt.figure()
    hist1 = tools.get_marginal(data1, domain, (1, )).reshape((-1, ))
    hist2 = tools.get_marginal(data2, domain, (1, )).reshape((-1, ))
    plt.plot(hist1, 'o', color='blue')
    # plt.plot(hist2, 'o', color='green')
    plt.savefig(f'./(1,)_dist_dp.pdf', bbox_inches='tight')
def test_mask(gt_trip_data, trip_data_list, trip_domain, taxi_id_list,
              domain):
    """Evaluate count-threshold masks built from ground-truth marginals.

    For every (marginal, threshold, noise) triple in the module-level
    `test_mask_list`, builds a mask from the ground-truth data, then
    reports how applying that mask affects the TVD of noisy marginals for
    the ground truth and each dataset in trip_data_list, plus diagnostic
    plots and counts.
    """
    data_num = len(gt_trip_data)
    # (disabled) shift-downsampled variants:
    # shift_downsample_num = 15
    # shift_downsample_data_list = []
    # for i in range(len(trip_data_list)):
    #     data = np.concatenate([trip_data_list[i], taxi_id_list[i]], axis=1)
    #     shift_downsample_data = tools.downsample_data(data, [0, -1], shift_downsample_num)
    #     shift_downsample_data_list.append(shift_downsample_data)
    # trip_s_downsample_num = 30
    # trip_s_downsample_data_list = []
    # for i in range(len(trip_data_list)):
    #     data = np.concatenate([trip_data_list[i], taxi_id_list[i]], axis=1)
    #     trip_s_downsample_data = tools.downsample_data(data, [8, -1], trip_s_downsample_num)
    #     trip_s_downsample_data_list.append(trip_s_downsample_data)
    for marginal, threshold, noise in test_mask_list:
        mask = get_mask(gt_trip_data,
                        trip_domain,
                        marginal,
                        threshold=threshold)
        domain_size = trip_domain.project(marginal).size()
        print("{} mask ratio: {:.2f}, domain size: {}, average num: {}".format(
            marginal, 1 - np.sum(mask) / mask.size, domain_size,
            data_num / domain_size))
        if len(marginal) == 2:
            ptools.plot_img(mask, f'./temp/{marginal}_mask.pdf')
        # noisy ground-truth marginal, clipped at zero
        gt_hist = tools.get_marginal(gt_trip_data, trip_domain, marginal)
        noisy_hist = gt_hist + np.random.normal(scale=noise,
                                                size=gt_hist.shape)
        noisy_hist[noisy_hist < 0] = 0
        print(' gt query TVD: {:.4f}'.format(
            tools.get_TVD(gt_hist, noisy_hist)))
        masked_hist = noisy_hist.copy()
        masked_hist[mask == 0] = 0
        print(' gt masked TVD: {:.4f}'.format(
            tools.get_TVD(gt_hist, masked_hist)))
        for i in range(len(trip_data_list)):
            trip_data = trip_data_list[i]
            # choose head attributes for the cell-conditioned TVD, if any
            cell = False
            if 0 in marginal:
                head_attrs = [0, 2]
                cell = True
            elif 3 in marginal:
                head_attrs = [2, 3]
                cell = True
            test_hist = tools.get_marginal(trip_data, trip_domain, marginal)
            noisy_hist = test_hist + np.random.normal(scale=noise,
                                                      size=test_hist.shape)
            noisy_hist[noisy_hist < 0] = 0
            print(' test query TVD: {:.4f}'.format(
                tools.get_TVD(test_hist, noisy_hist)))
            if cell:
                print(' test query cell TVD: {:.4f}'.format(
                    tools.get_cell_TVD(domain, marginal, test_hist,
                                       noisy_hist, head_attrs)))
            # (disabled) downsampled queries:
            # if 0 in marginal:
            #     downsample_noise = noise/200 * shift_downsample_num * (21 ** 0.5)
            #     noisy_hist = tools.get_marginal(shift_downsample_data_list[i], trip_domain, marginal)
            #     noisy_hist = noisy_hist + np.random.normal(scale=downsample_noise, size=noisy_hist.shape)
            #     print(' shift query TVD: {:.4f}'.format(tools.get_TVD(test_hist, noisy_hist)))
            # elif 8 in marginal:
            #     downsample_noise = noise/200 * trip_s_downsample_num * (11 ** 0.5)
            #     noisy_hist = tools.get_marginal(trip_s_downsample_data_list[i], trip_domain, marginal)
            #     noisy_hist = noisy_hist + np.random.normal(scale=downsample_noise, size=noisy_hist.shape)
            #     print(' trip s query TVD: {:.4f}'.format(tools.get_TVD(test_hist, noisy_hist)))
            masked_test_hist = test_hist.copy()
            masked_test_hist[mask == 0] = 0
            masked_hist = noisy_hist.copy()
            masked_hist[mask == 0] = 0
            if cell:
                print(' test inner cell TVD: {:.4f}, head attrs: {}'.
                      format(
                          tools.get_cell_TVD(domain, marginal, test_hist,
                                             masked_test_hist, head_attrs),
                          head_attrs))
            print(' test inner TVD: {:.4f}'.format(
                tools.get_TVD(test_hist, masked_test_hist)))
            print(' test masked TVD: {:.4f}'.format(
                tools.get_TVD(test_hist, masked_hist)))
            if cell:
                print(' test masked cell TVD: {:.4f}'.format(
                    tools.get_cell_TVD(domain, marginal, test_hist,
                                       masked_hist, head_attrs)))
            # stats over the cells the ground-truth mask zeroes out
            print(' test avearge num: {:.4f}'.format(
                np.sum(test_hist[mask == 0]) / np.sum(mask == 0)))
            print(' test large cell num: {:.4f}'.format(
                np.sum(test_hist[mask == 0] > 100)))
            # compare against a mask built from this dataset instead
            mask2 = get_mask(trip_data, trip_domain, marginal, threshold=10)
            print(np.sum((mask2 == 0) & (mask == 0)))
            print(np.sum(mask == 0))
            if len(marginal) == 2:
                ptools.plot_img(mask, f'./info/{marginal}_mask.pdf')
                ptools.plot_img(mask2, f'./info/{marginal}_mask_dp.pdf')
                ptools.plot_img(gt_hist, f'./info/{marginal}_marginal.pdf')
                ptools.plot_img(test_hist,
                                f'./info/{marginal}_marginal_test.pdf')
            uniques, cnts = np.unique(test_hist[mask == 0],
                                      return_counts=True)
            print([(uniques[i], cnts[i]) for i in range(len(uniques))])
            ptools.plot_x_y(uniques,
                            cnts,
                            f'./info/{marginal}_masked_counts.pdf',
                            zero_line=True)
            if marginal == (0, 2, 3):
                # inspect masked-out diagonal cells (same pickup/dropoff)
                temp_hist = test_hist.copy()
                temp_hist[mask != 0] = 0
                for j in range(78):
                    pos = np.where(temp_hist[:, j, j] > 0)[0]
                    if len(pos) != 0:
                        print(j, pos, len(pos))
                        for idx in pos:
                            print(temp_hist[idx, j, j])
                        print('')
def evaluate(gt_data, dp_data, domain):
    """Score a synthetic dataset against ground truth.

    First prints the TVD (and head-conditioned cell TVD where applicable)
    for a fixed list of marginals, then evaluates 4-way marginals
    {0, 2, attr1, attr2} with per-(0,2)-cell normalized scores and prints
    the per-attribute and overall averages.
    """
    print(gt_data.shape, dp_data.shape)
    data_num = len(gt_data)
    # for marginal in [(1,), (2,), (3,), (5,), (0, 2, 3), (0, 2), (1, 2), (2, 3), (2, 4), (2, 5), (2, 7), (2, 8), (2, 9)]:
    for marginal in [(0, 1, 2), (0, 2, 3), (0, 2, 5), (2, 3)]:
        gt_marginal = tools.get_marginal(gt_data, domain, marginal)
        dp_marginal = tools.get_marginal(dp_data, domain, marginal)
        temp = np.sum(np.abs(gt_marginal - dp_marginal)) / 2 / data_num
        print("{} TVD: {:.4f}".format(marginal, temp))
        if len(marginal) == 1:
            ptools.plot_list_list([gt_marginal, dp_marginal],
                                  path=f'./evaluate/{marginal}.pdf')
        if set([0, 2]) <= set(marginal):
            # print(marginal, gt_marginal.shape, dp_marginal.shape)
            tvd, cell_tvd = tools.get_cell_TVD(domain, marginal, gt_marginal,
                                               dp_marginal, [0, 2],
                                               f'./{marginal}_cell_tvd.pdf')
            print(" cell TVD: {:.4f}".format(cell_tvd))
        elif marginal == (2, 3):
            tvd, cell_tvd = tools.get_cell_TVD(domain, marginal, gt_marginal,
                                               dp_marginal, [2])
            print(" cell TVD: {:.4f}".format(cell_tvd))
    # build all 4-way marginals of the form (0, 2, attr1, attr2)
    random_attrs = list(range(10))
    random_attrs.remove(0)
    random_attrs.remove(2)
    marginal_list = []
    for attr1, attr2 in itertools.combinations(random_attrs, 2):
        marginal = [0, 2, attr1, attr2]
        marginal_list.append(marginal)
    # marginal_list = []
    # low_score_cells = np.zeros((21, 78))
    result_dict = {attr: [] for attr in range(10)}
    average_score = 0
    for i in range(len(marginal_list)):
        marginal = marginal_list[i]
        temp_domain = Domain(domain.dict, list(marginal))
        gt_marginal, _ = np.histogramdd(gt_data[:, marginal],
                                        bins=temp_domain.edge())
        dp_marginal, _ = np.histogramdd(dp_data[:, marginal],
                                        bins=temp_domain.edge())
        # print(gt_marginal.shape)
        tvd = tools.get_TVD(gt_marginal, dp_marginal)
        # flatten to (cells of attrs 0,2) x (cells of attr1,attr2)
        s1, s2, s3, s4 = gt_marginal.shape
        gt_marginal = gt_marginal.reshape((s1*s2, s3*s4))
        dp_marginal = dp_marginal.reshape((s1*s2, s3*s4))
        # per-row normalization; +1 avoids division by zero on empty rows
        gt_sum = np.sum(gt_marginal, keepdims=True, axis=1) + 1
        dp_sum = np.sum(dp_marginal, keepdims=True, axis=1) + 1
        gt_marginal = gt_marginal / gt_sum
        dp_marginal = dp_marginal / dp_sum
        # per-(0,2)-cell TVD of the conditional distributions; score is
        # 1 minus the mean, so higher is better
        scores = np.sum(np.abs(gt_marginal - dp_marginal), axis=1) / 2
        score = 1 - np.sum(scores) / scores.size
        print(" {} score: {:.4f}, tvd: {:.4f}".format(marginal, score, tvd))
        # low_score_cells[scores.reshape((21, 78)) > 0.5] += 1
        average_score += score
        for attr in marginal:
            result_dict[attr].append(score)
    if len(marginal_list) > 0:
        print("average score: {:.4f}".format(average_score/len(marginal_list)))
    for attr in result_dict:
        print("{} score: {:.4f}".format(
            attr, sum(result_dict[attr])/len(result_dict[attr])))