def generate_dataset_in_random_mode(self, n, description_file, seed=0, minimum=0, maximum=100):
    """Fill ``self.synthetic_dataset`` with ``n`` randomly generated rows.

    One column is produced for every entry of ``attribute_description`` in
    the JSON description:

    * candidate keys are generated by the column object that ``parse_json``
      builds from the attribute description;
    * categorical attributes are drawn from ``distribution_bins``;
    * ``'String'`` attributes get one random string per row, with a length
      drawn per row between the attribute's ``min`` and ``max`` (inclusive);
    * ``'Integer'`` attributes are drawn between ``minimum`` and ``maximum``
      (inclusive); every other data type uses
      ``random.uniform(minimum, maximum, n)``.

    Args:
        n: Number of rows to generate.
        description_file: Path of the JSON dataset description.
        seed: Value forwarded to ``set_random_seed``.
        minimum: Lower bound used for non-categorical numeric attributes.
        maximum: Upper bound used for non-categorical numeric attributes.
    """
    # NOTE(review): `random` is presumably numpy.random (the calls pass a
    # size argument and treat the upper bound as exclusive) -- confirm.
    set_random_seed(seed)
    description = read_json_file(description_file)
    self.synthetic_dataset = DataFrame()

    for attr, attr_info in description['attribute_description'].items():
        datatype = attr_info['data_type']
        is_categorical = attr_info['is_categorical']
        is_candidate_key = attr_info['is_candidate_key']

        if is_candidate_key:
            values = parse_json(attr_info).generate_values_as_candidate_key(n)
        elif is_categorical:
            values = random.choice(attr_info['distribution_bins'], n)
        elif datatype == 'String':
            # Draw one length per row. A single scalar length would give
            # every string the same size and, when this is the first column
            # of the still-empty frame, would create a column with no rows.
            lengths = random.randint(attr_info['min'], attr_info['max'] + 1, n)
            values = [generate_random_string(int(length)) for length in lengths]
        elif datatype == 'Integer':
            values = random.randint(minimum, maximum + 1, n)
        else:
            values = random.uniform(minimum, maximum, n)

        self.synthetic_dataset[attr] = values
def get_plot_data(input_dataset_file, synthetic_dataset_file, description_file):
    """Collect before/after plot data for two CSV datasets and save it as JSON.

    For every column of the input dataset the description decides the chart
    type: categorical attributes get bar-chart data, attributes whose
    ``data_type`` is ``'Integer'`` or ``'Float'`` get histogram data, and all
    other attributes are skipped. Heatmap data is added for both files. The
    result is written next to the input dataset as ``<name>_plot.json``.

    Args:
        input_dataset_file: Path of the original CSV dataset.
        synthetic_dataset_file: Path of the synthetic CSV dataset.
        description_file: Path of the JSON dataset description.
    """
    description = read_json_file(description_file)
    df_before = pd.read_csv(input_dataset_file)
    df_after = pd.read_csv(synthetic_dataset_file)

    plot_data = {'histogram': {}, 'barchart': {}, 'heatmap': {}}
    for attr in df_before:
        attr_info = description['attribute_description'][attr]
        if attr_info['is_categorical']:
            bins_before, counts_before = get_barchart_data(df_before, attr)
            bins_after, counts_after = get_barchart_data(df_after, attr)
            plot_data['barchart'][attr] = {
                'before': {'bins': bins_before, 'counts': counts_before},
                'after': {'bins': bins_after, 'counts': counts_after},
            }
        elif attr_info['data_type'] in {'Integer', 'Float'}:
            plot_data['histogram'][attr] = {
                'before': get_histogram_data(df_before, attr),
                'after': get_histogram_data(df_after, attr),
            }

    plot_data['heatmap']['before'] = get_heatmap_data(input_dataset_file)
    plot_data['heatmap']['after'] = get_heatmap_data(synthetic_dataset_file)

    # Strip only a trailing ".csv". A plain str.replace would also rewrite
    # ".csv" inside directory names and, for a path without ".csv", would
    # return the input path itself so the dataset got overwritten below.
    csv_suffix = ".csv"
    if input_dataset_file.endswith(csv_suffix):
        base_name = input_dataset_file[:-len(csv_suffix)]
    else:
        base_name = input_dataset_file
    plot_file_name = base_name + "_plot.json"

    with open(plot_file_name, 'w') as outfile:
        json.dump(plot_data, outfile, indent=4)
def generate_dataset_in_random_mode(self, n, description_file, seed=0):
    """Fill ``self.synthetic_dataset`` with ``n`` randomly generated rows.

    One column is produced for every entry of ``attribute_description`` in
    the JSON description:

    * categorical attributes are drawn from ``distribution_bins``;
    * ``'string'`` attributes get one random string per row, with a length
      drawn per row between the attribute's ``min`` and ``max`` (inclusive);
    * ``'int'`` attributes are drawn between ``min`` and ``max`` (inclusive);
      every other data type uses ``np.random.uniform(min, max, n)``.

    Args:
        n: Number of rows to generate.
        description_file: Path of the JSON dataset description.
        seed: Value forwarded to ``set_random_seed``.
    """
    set_random_seed(seed)
    description = read_json_file(description_file)
    self.synthetic_dataset = pd.DataFrame()

    for attr, attr_description in description['attribute_description'].items():
        datatype = attr_description['datatype']
        is_categorical = attr_description['is_categorical']

        if is_categorical:
            values = np.random.choice(attr_description['distribution_bins'], n)
        elif datatype == 'string':
            # One length per row, upper bound inclusive (numpy's `high` is
            # exclusive). A scalar length would repeat the same size for
            # every row and leave the column empty when the frame has no
            # rows yet; an exclusive bound raises when min == max.
            lengths = np.random.randint(attr_description['min'],
                                        attr_description['max'] + 1, n)
            values = [generate_random_string(int(length)) for length in lengths]
        else:
            minimum, maximum = attr_description['min'], attr_description['max']
            if datatype == 'int':
                values = np.random.randint(minimum, maximum + 1, n)
            else:
                values = np.random.uniform(minimum, maximum, n)

        self.synthetic_dataset[attr] = values
def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Build ``self.synthetic_dataset`` with ``n`` rows in correlated attribute mode.

    The description is loaded from ``description_file`` and an encoded
    dataset is obtained from ``DataGenerator.generate_encoded_dataset``.
    Each attribute listed in ``meta['all_attributes']`` is then filled from
    one of three sources, checked in this order: the encoded dataset,
    candidate-key generation, or independent attribute sampling.

    Args:
        n: Number of rows to generate.
        description_file: Path of the JSON dataset description.
        seed: Value forwarded to ``set_random_seed``.
    """
    set_random_seed(seed)
    self.n = n
    self.description = read_json_file(description_file)

    meta = self.description['meta']
    all_attributes = meta['all_attributes']
    candidate_keys = set(meta['candidate_keys'])

    self.encoded_dataset = DataGenerator.generate_encoded_dataset(self.n, self.description)
    self.synthetic_dataset = DataFrame(columns=all_attributes)

    for attr in all_attributes:
        column = parse_json(self.description['attribute_description'][attr])

        if attr in self.encoded_dataset:
            binning_indices = self.encoded_dataset[attr]
        elif attr in candidate_keys:
            self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            continue
        else:
            # Attributes that are neither in the encoded dataset nor
            # candidate keys fall back to independent attribute mode.
            binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)

        self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
def generateRanking(current_file, top_K=100):
    """
    Generate a ranking of input data.

    Reads the ranking parameters from "<current_file>_rankings.json" and the
    data from "<current_file>.csv", computes "GeneratedScore" as the weighted
    sum of the ranked attributes (missing values count as 0), and writes the
    data sorted by descending score to "<current_file>_weightsum.csv".

    Attributes:
        current_file: file name that stored the data (with out ".csv" suffix)
        top_K: threshold of returned generated ranking
    Return:  json data (records orientation) of the top_K rows of the
        generated ranking
    """
    ranks_file = current_file + "_rankings.json"
    rankings_paras = read_json_file(ranks_file)
    data = pd.read_csv(current_file + ".csv")

    # before computing the score, replace the NA in the data with 0
    filled_data = data.fillna(value=0)

    chosed_atts = rankings_paras["ranked_atts"]
    filled_data["GeneratedScore"] = 0
    for i, att in enumerate(chosed_atts):
        cur_weight = rankings_paras["ranked_atts_weight"][i]
        filled_data["GeneratedScore"] += cur_weight * filled_data[att]

    # Move the score to the first column. DataFrame.reindex_axis no longer
    # exists in pandas >= 1.0, so use reindex(columns=...) instead.
    ordered_columns = ['GeneratedScore'] + [
        a for a in filled_data.columns if a != 'GeneratedScore'
    ]
    filled_data = filled_data.reindex(columns=ordered_columns)

    # save data with weight sum to a csv on server
    filled_data.sort_values(by="GeneratedScore", ascending=False, inplace=True)
    filled_data.to_csv(current_file + "_weightsum.csv", index=False)

    # only show top_K rows in the UI
    display_data = filled_data.head(top_K)
    return display_data.to_json(orient='records')
def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Generate ``n`` synthetic rows in correlated attribute mode.

    Stores the row count and the loaded description on the instance, asks
    ``DataGenerator.generate_encoded_dataset`` for the encoded dataset and
    then calls ``self.sample_from_encoded_dataset()``.

    Args:
        n: Number of rows to generate.
        description_file: Path of the JSON dataset description.
        seed: Value forwarded to ``set_random_seed``.
    """
    self.n = n
    set_random_seed(seed)

    description = read_json_file(description_file)
    self.description = description

    encoded = DataGenerator.generate_encoded_dataset(self.n, description)
    self.encoded_dataset = encoded

    self.sample_from_encoded_dataset()
def computePvaluePairwise(att_name, att_value, current_file, round_default=2,
                          simulation_dir="/home/ec2-user/dataResponsiblyUI/playdata/rankingfacts"):
    """
    Compute p-value using Pairwise oracle

    The ranking is read from "<current_file>_weightsum.csv". The share of
    rows whose `att_name` equals `att_value` selects the closest entry of the
    pre-computed simulation file, and the value returned by `Cdf` for the
    observed pair count is rounded and returned.

    Attributes:
        att_name: sensitive attribute name
        att_value: value of protected group of above attribute
        current_file: file name that stored the data (with out ".csv" suffix)
        round_default: threshold of round function for the returned p-value
        simulation_dir: directory that holds the pre-computed
            "SimulationPairs_N<rows>_R1000.json" files; defaults to the
            server location that used to be hard-coded
    Return:  rounded p-value
    """
    data = pd.read_csv(current_file + "_weightsum.csv")
    total_N = len(data)

    # proportion of rows that belong to the protected group
    size_vi = len(data[data[att_name] == att_value])
    fair_p_vi = size_vi / total_N

    # get the pre-computed pairwise results from simulation
    simu_data = read_json_file(
        simulation_dir + "/SimulationPairs_N" + str(total_N) + "_R1000.json")

    # use the exact proportion when it was simulated, otherwise the closest
    # simulated proportion (first one wins on ties)
    all_fair_p = list(simu_data.keys())
    if str(fair_p_vi) in all_fair_p:
        cur_pi = str(fair_p_vi)
    else:
        cur_pi = min(all_fair_p, key=lambda pi: abs(float(pi) - fair_p_vi))

    # number of pairs in the input ranking stored in the current file; the
    # other two values returned by computePairN are not needed here
    pair_N_vi, _, _ = computePairN(att_name, att_value, current_file)

    # the value returned by Cdf is used directly as the p-value
    sample_pairs = simu_data[cur_pi]
    cdf_pair = Cdf(sample_pairs, pair_N_vi)

    return round(cdf_pair, round_default)
def describe_dataset_in_random_mode(
        self,
        dataset_file: str,
        attribute_to_datatype: Dict[str, DataType] = None,
        attribute_to_is_categorical: Dict[str, bool] = None,
        attribute_to_is_candidate_key: Dict[str, bool] = None,
        categorical_attribute_domain_file: str = None,
        numerical_attribute_ranges: Dict[str, List] = None,
        seed=0):
    """Describe ``dataset_file`` for random mode and store the result.

    The CSV is loaded and analysed through the instance's own helper
    methods, each column infers its domain (optionally from a supplied
    categorical domain or numerical range), and the JSON form of every
    column is stored under ``self.data_description['attribute_description']``.

    Args:
        dataset_file: Path of the CSV dataset to describe.
        attribute_to_datatype: Optional data types keyed by attribute name;
            each value is converted with ``DataType(...)``.
        attribute_to_is_categorical: Optional categorical flags keyed by
            attribute name.
        attribute_to_is_candidate_key: Optional candidate-key flags keyed by
            attribute name.
        categorical_attribute_domain_file: Optional path of a JSON file that
            maps attribute names to categorical domains.
        numerical_attribute_ranges: Optional ranges keyed by attribute name.
        seed: Value forwarded to ``utils.set_random_seed``.
    """
    # Treat missing (or empty) optional arguments as empty mappings.
    datatype_overrides = attribute_to_datatype or {}
    categorical_flags = attribute_to_is_categorical or {}
    candidate_key_flags = attribute_to_is_candidate_key or {}
    numeric_ranges = numerical_attribute_ranges or {}
    categorical_domains = (
        utils.read_json_file(categorical_attribute_domain_file)
        if categorical_attribute_domain_file else {}
    )

    utils.set_random_seed(seed)
    self.attr_to_datatype = {
        name: DataType(raw_type) for name, raw_type in datatype_overrides.items()
    }
    self.attr_to_is_categorical = categorical_flags
    self.attr_to_is_candidate_key = candidate_key_flags

    self.read_dataset_from_csv(dataset_file)
    self.infer_attribute_data_types()
    self.analyze_dataset_meta()
    self.represent_input_dataset_by_columns()

    # A supplied categorical domain takes precedence over a numerical range;
    # without either the column infers its domain with no arguments.
    for column in self.attr_to_column.values():
        name = column.name
        domain_kwargs = {}
        if name in categorical_domains:
            domain_kwargs['categorical_domain'] = categorical_domains[name]
        elif name in numeric_ranges:
            domain_kwargs['numerical_range'] = numeric_ranges[name]
        column.infer_domain(**domain_kwargs)

    # record attribute information in json format
    described = self.data_description['attribute_description'] = {}
    for attr, column in self.attr_to_column.items():
        described[attr] = column.to_json()
def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
    """Build ``self.synthetic_dataset`` with ``n`` rows in independent attribute mode.

    Every attribute listed in ``meta['all_attributes']`` is generated on its
    own: candidate keys through ``generate_values_as_candidate_key``, all
    other attributes by sampling binning indices and converting them to
    values.

    Args:
        n: Number of rows to generate.
        description_file: Path of the JSON dataset description.
        seed: Value forwarded to ``set_random_seed``.
    """
    set_random_seed(seed)
    self.description = read_json_file(description_file)

    meta = self.description['meta']
    all_attributes = meta['all_attributes']
    candidate_keys = set(meta['candidate_keys'])

    self.synthetic_dataset = DataFrame(columns=all_attributes)
    for attr in all_attributes:
        column = parse_json(self.description['attribute_description'][attr])
        if attr not in candidate_keys:
            indices = column.sample_binning_indices_in_independent_attribute_mode(n)
            values = column.sample_values_from_binning_indices(indices)
        else:
            values = column.generate_values_as_candidate_key(n)
        self.synthetic_dataset[attr] = values
def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Generate ``n`` synthetic rows in correlated attribute mode.

    Stores the row count and the loaded description on the instance, builds
    the encoded dataset with ``DataGenerator.generate_encoded_dataset`` and
    finally calls ``self.sample_from_encoded_dataset()``.

    Args:
        n: Number of rows to generate.
        description_file: Path of the JSON dataset description.
        seed: Value forwarded to ``set_random_seed``.
    """
    self.n = n
    set_random_seed(seed)

    loaded_description = read_json_file(description_file)
    self.description = loaded_description
    self.encoded_dataset = DataGenerator.generate_encoded_dataset(
        self.n, loaded_description)

    # NOTE: an earlier step that sampled bin indices independently for the
    # attributes in meta['attributes_ignored_by_BN'] (non-categorical
    # strings) is disabled and therefore not performed here.
    self.sample_from_encoded_dataset()
def json_piechart_data(request):
    """Return the pie-chart data of the session's dataset as a JSON response.

    The dataset name comes from the session key 'passed_data_name'. When the
    session key "running_data" equals "processed", the "_norm" variant of the
    dataset and of its rankings file is used.

    Args:
        request: Request object whose ``session`` holds the keys above.

    Returns:
        An ``HttpResponse`` with content type 'application/json'.
    """
    session = request.session
    passed_data_name = session.get('passed_data_name')
    is_processed = session.get("running_data") == "processed"

    cur_data_name = passed_data_name + "_norm" if is_processed else passed_data_name
    ranks_file = cur_data_name + "_rankings.json"

    # the parameter file on the server stores all parameter inputs from the user
    checked_cate_atts = read_json_file(ranks_file)["checked_cate_atts"]

    piechart_json_data = get_chart_data(cur_data_name, checked_cate_atts)
    payload = json.dumps(piechart_json_data)
    return HttpResponse(payload, content_type='application/json')
def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
    """Generate ``n`` synthetic rows in independent attribute mode.

    For every attribute in ``meta['attribute_list']`` a bin index is drawn
    per row with the probabilities in ``distribution_probabilities``; the
    indices are stored in ``self.encoded_dataset`` and then
    ``self.sample_from_encoded_dataset()`` is called.

    Args:
        n: Number of rows to generate.
        description_file: Path of the JSON dataset description.
        seed: Value forwarded to ``set_random_seed``.
    """
    set_random_seed(seed)
    self.description = read_json_file(description_file)

    attributes = self.description['meta']['attribute_list']
    row_index = list(range(n))
    self.encoded_dataset = pd.DataFrame(columns=attributes, index=row_index)

    for attr in attributes:
        attr_info = self.description['attribute_description'][attr]
        bin_indices = list(range(len(attr_info['distribution_bins'])))
        probabilities = attr_info['distribution_probabilities']
        self.encoded_dataset[attr] = np.random.choice(
            bin_indices, size=n, p=probabilities)

    self.sample_from_encoded_dataset()
def get_categorical_attributes(plot_json_file):
    """Return the attribute names stored under 'barchart' in a plot JSON file.

    Args:
        plot_json_file: Path of the plot JSON file.

    Returns:
        A list with the keys of the file's 'barchart' entry.
    """
    barchart = read_json_file(plot_json_file)['barchart']
    return [attr for attr in barchart.keys()]
def nutrition_facts(request):
    """Render the ranking-facts widget page for the dataset named in the session.

    The dataset name comes from the session key 'passed_data_name'; when the
    session key "running_data" equals "processed" the "_norm" variant of the
    dataset and of its rankings file is used. The view gathers attribute
    weights, top-10/overall statistics, correlated attributes, score-slope
    stability and fairness results, and passes them to the template
    'rankingfacts/ranking_facts_widget_boot.html'.

    Args:
        request: Request object whose ``session`` holds the keys above.

    Returns:
        The result of ``render`` for the widget template.
    """
    passed_data_name = request.session.get('passed_data_name')
    passed_running_data_flag = request.session.get("running_data")

    # for the previous step, return to the corresponding parameter setting page
    unprocessed_flag = passed_running_data_flag != "processed"
    if unprocessed_flag:
        cur_data_name = passed_data_name
    else:
        cur_data_name = passed_data_name + "_norm"
    ranks_file = cur_data_name + "_rankings.json"

    # the parameter file on the server stores all parameter inputs from the user
    rankings_paras = read_json_file(ranks_file)
    chosed_atts = rankings_paras["ranked_atts"]
    checked_sensi_atts = rankings_paras["checked_sensi_atts"]
    checked_cate_atts = rankings_paras["checked_cate_atts"]

    # map every chosen attribute to its weight in the parameter file
    att_weights = {
        att: rankings_paras["ranked_atts_weight"][position]
        for position, att in enumerate(chosed_atts)
    }

    # size of the uploaded data
    total_n = getSizeOfRanking(cur_data_name)

    # statistics of top 10 and overall in the generated ranking
    att_stats_topTen = compute_statistic_topN(chosed_atts, cur_data_name, 10)
    att_stats_all = compute_statistic_topN(chosed_atts, cur_data_name, total_n)

    # correlated attributes; each entry is read as (coefficient, name)
    att_correlated = compute_correlation(cur_data_name)

    # correlation thresholds
    low_coef_threshold = 0.25
    high_coef_threshold = 0.75

    # json data for the correlation table
    top3_correlated_attts = {}
    for entry in att_correlated:
        coef = abs(entry[0])
        if coef >= high_coef_threshold:
            level = "high"
        elif coef <= low_coef_threshold:
            level = "low"
        else:
            level = "median"
        top3_correlated_attts[entry[1]] = [coef, level]

    # statistics of the correlated attributes for the detailed ingredients widget
    top_corre_atts = [entry[1] for entry in att_correlated]
    corre_att_stats_topTen = compute_statistic_topN(top_corre_atts, cur_data_name, 10)
    corre_att_stats_all = compute_statistic_topN(top_corre_atts, cur_data_name, total_n)

    # slope of the generated scores at a specified top-k, compared against
    # the stability threshold
    slope_threshold = 0.25
    if total_n >= 100:
        slope_top_ten = computeSlopeOfScores(cur_data_name, 10)
        slope_top_hundred = computeSlopeOfScores(cur_data_name, 100)
        slope_overall = "false"
        stable_res = {
            "Top-10": abs(slope_top_ten) <= slope_threshold,
            "Top-100": abs(slope_top_hundred) <= slope_threshold,
        }
    elif total_n >= 10:
        slope_top_ten = computeSlopeOfScores(cur_data_name, 10)
        slope_overall = computeSlopeOfScores(cur_data_name, total_n)
        slope_top_hundred = "NA"
        stable_res = {
            "Top-10": abs(slope_top_ten) <= slope_threshold,
            "Overall": abs(slope_overall) <= slope_threshold,
        }
    else:
        slope_top_ten = "NA"
        slope_top_hundred = "NA"
        slope_overall = "false"
        stable_res = {}

    # run the fairness validation for the oracles
    fair_all_oracles, fair_res_oracles, alpha_default, top_K = runFairOracles(
        checked_sensi_atts, cur_data_name)

    checked_cate_att_ids = [
        "att" + str(position) for position, _ in enumerate(checked_cate_atts)
    ]

    # layout numbers derived from the number of pie charts
    pie_n = len(checked_cate_att_ids)
    row_n = int(np.ceil(pie_n / 2))
    place_n = int(5 + (row_n - 1) * 2)
    split_n = int(row_n * 2 + 1)

    context = {
        'passed_data_name': passed_data_name,
        "passed_att_weights": att_weights,
        "passed_att_stats_topTen": att_stats_topTen,
        "passed_att_stats_all": att_stats_all,
        "passed_att_correlated": top3_correlated_attts,
        "passed_unprocessing_flag": unprocessed_flag,
        "corre_att_stats_topTen": corre_att_stats_topTen,
        "corre_att_stats_all": corre_att_stats_all,
        "passed_fair_all_oracles": fair_all_oracles,
        "passed_fair_res_oracles": fair_res_oracles,
        "passed_slope_ten": slope_top_ten,
        "passed_slope_hundred": slope_top_hundred,
        "passed_stable_res": stable_res,
        "passed_slope_threshold": slope_threshold,
        "passed_alpha_default": alpha_default,
        "passed_coef_high": high_coef_threshold,
        "passed_top_k": top_K,
        "passed_coef_low": low_coef_threshold,
        "passed_slope_overall": slope_overall,
        "passed_pie_att_ids": checked_cate_att_ids,
        "passed_pie_atts": checked_cate_atts,
        "passed_range_place": range(place_n),
        "passed_range_split": range(split_n),
    }
    return render(request, 'rankingfacts/ranking_facts_widget_boot.html', context)
def get_drawable_attributes(plot_json_file):
    """Return the attribute names that have plot data in a plot JSON file.

    Args:
        plot_json_file: Path of the plot JSON file.

    Returns:
        A list with the keys of the file's 'barchart' entry followed by the
        keys of its 'histogram' entry.
    """
    plot_data = read_json_file(plot_json_file)
    barchart_attributes = list(plot_data['barchart'].keys())
    histogram_attributes = list(plot_data['histogram'].keys())
    drawable = []
    drawable.extend(barchart_attributes)
    drawable.extend(histogram_attributes)
    return drawable
def res_json_processing_plot(request):
    """Return the "<data name>_plot.json" content of the session's dataset.

    Args:
        request: Request object whose ``session`` holds 'passed_data_name'.

    Returns:
        An ``HttpResponse`` with content type 'application/json'.
    """
    data_name = request.session.get('passed_data_name')
    payload = json.dumps(read_json_file(data_name + "_plot.json"))
    return HttpResponse(payload, content_type='application/json')
def generate_data(username):
    """Describe a user's dataset and generate a synthetic version of it.

    Reads "<username>_parameters.json" for the configuration and
    "<username>.csv" for the data, writes the dataset description to
    "<username>_description.json" and saves the synthetic data to
    "<username>_synthetic_data.csv". ``configuration['chose_mode']`` selects
    the mode: 'mode1' random, 'mode2' independent attribute, 'mode3'
    correlated attribute. Empty-string configuration values fall back to the
    defaults coded below.

    Args:
        username: Prefix shared by all input and output file names.
    """
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)
    attribute_list = initial_dataset_info['attribute_list']

    attribute_to_is_candidate = {
        attr: attr in configuration['candidate_atts'] for attr in attribute_list
    }
    attribute_to_is_categorical = {
        attr: attr in configuration['categorical_atts'] for attr in attribute_list
    }

    # an empty value means "as many tuples as the input dataset"
    if configuration['tuple_n'] == '':
        n = initial_dataset_info['number_of_tuples']
    else:
        n = int(configuration['tuple_n'])

    seed = 0 if configuration['seed'] == '' else int(configuration['seed'])

    generator = DataGenerator()
    chose_mode = configuration['chose_mode']

    # The seed is passed by keyword in every describer call: the describer
    # signature visible in this file has further optional parameters ahead
    # of `seed`, so a positional seed would be bound to the wrong one.
    # NOTE(review): the independent/correlated describer signatures are not
    # visible here; they are assumed to name the parameter `seed` too.
    if chose_mode == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(
            input_dataset_file, {}, attribute_to_is_categorical,
            attribute_to_is_candidate, seed=seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:
        if configuration['histogram_size'] == '':
            histogram_size = 20
        else:
            histogram_size = int(configuration['histogram_size'])

        if configuration['epsilon'] == '':
            epsilon = 10
        else:
            epsilon = float(configuration['epsilon'])

        attribute_to_datatype = configuration['type_atts']
        describer = DataDescriber(histogram_size)

        if chose_mode == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate,
                seed=seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif chose_mode == 'mode3':
            if configuration['max_degree'] == '':
                max_degree = 3
            else:
                max_degree = int(configuration['max_degree'])

            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon,
                attribute_to_datatype, attribute_to_is_categorical,
                attribute_to_is_candidate, seed=seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
def norm_json_processing_hist(request):
    """Return the "<data name>_norm_plot.json" content of the session's dataset.

    Args:
        request: Request object whose ``session`` holds 'passed_data_name'.

    Returns:
        An ``HttpResponse`` with content type 'application/json'.
    """
    # NOTICE: the "_norm" variant of the plot file is read for processed data
    data_name = request.session.get('passed_data_name')
    norm_plot_file = data_name + "_norm_plot.json"
    payload = json.dumps(read_json_file(norm_plot_file))
    return HttpResponse(payload, content_type='application/json')