def get_plot_data(input_dataset_file, synthetic_dataset_file, description_file):
    """Collect plot data comparing the input and synthetic datasets.

    Builds barchart data for categorical attributes, histogram data for
    Integer/Float attributes, and heatmap data for both datasets, then writes
    everything to ``<input>_plot.json`` next to the input CSV.
    """
    description = read_json_file(description_file)
    df_before = pd.read_csv(input_dataset_file)
    df_after = pd.read_csv(synthetic_dataset_file)

    plot_data = {'histogram': {}, 'barchart': {}, 'heatmap': {}}
    for attr in df_before:
        attr_desc = description['attribute_description'][attr]
        if attr_desc['is_categorical']:
            before_bins, before_counts = get_barchart_data(df_before, attr)
            after_bins, after_counts = get_barchart_data(df_after, attr)
            plot_data['barchart'][attr] = {
                'before': {'bins': before_bins, 'counts': before_counts},
                'after': {'bins': after_bins, 'counts': after_counts},
            }
        elif attr_desc['data_type'] in ('Integer', 'Float'):
            plot_data['histogram'][attr] = {
                'before': get_histogram_data(df_before, attr),
                'after': get_histogram_data(df_after, attr),
            }
        # Other data types (e.g. non-categorical strings) have no plot.

    plot_data['heatmap']['before'] = get_heatmap_data(input_dataset_file)
    plot_data['heatmap']['after'] = get_heatmap_data(synthetic_dataset_file)

    # NOTE(review): str.replace rewrites any ".csv" occurrence in the path,
    # not just the suffix — assumed acceptable for the filenames used here.
    plot_file_name = input_dataset_file.replace(".csv", "_plot.json")
    with open(plot_file_name, 'w') as outfile:
        json.dump(plot_data, outfile, indent=4)
def compare_histograms(mode: str, hospital_ae_df: pd.DataFrame,
                       description_filepath: str, synthetic_data_filepath: str):
    '''
    Makes comparison plots showing the histograms for each column
    in the synthetic data.

    Keyword arguments:
    mode -- what type of synthetic data
    hospital_ae_df -- DataFrame of the original dataset
    description_filepath -- filepath to the data description
    synthetic_data_filepath -- filepath to where synthetic data written
    '''
    synthetic_df = pd.read_csv(synthetic_data_filepath)

    # Attribute description comes from the dataset description file.
    attribute_description = read_json_file(
        description_filepath)['attribute_description']

    inspector = ModelInspector(
        hospital_ae_df, synthetic_df, attribute_description)

    # `plots_dir` is assumed to be a module-level path — TODO confirm
    # (the sibling heatmap function uses `filepaths.plots_dir`).
    for attribute in synthetic_df.columns:
        figure_filepath = os.path.join(
            plots_dir, mode + '_' + attribute + '.png')
        # Whitespace would break Markdown references to the figure.
        figure_filepath = figure_filepath.replace(' ', '_')
        inspector.compare_histograms(attribute, figure_filepath)

    return inspector
def compare_pairwise_mutual_information(mode: str, hospital_ae_df: pd.DataFrame,
                                        description_filepath: str,
                                        synthetic_data_filepath: str):
    '''
    Looks at correlation of attributes by producing a mutual-information
    heatmap for the synthetic data.

    Keyword arguments:
    mode -- what type of synthetic data
    hospital_ae_df -- DataFrame of the original dataset
    description_filepath -- filepath to the data description
    synthetic_data_filepath -- filepath to where synthetic data written
    '''
    synthetic_df = pd.read_csv(synthetic_data_filepath)
    attribute_description = read_json_file(
        description_filepath)['attribute_description']

    inspector = ModelInspector(
        hospital_ae_df, synthetic_df, attribute_description)

    heatmap_name = 'mutual_information_heatmap_' + mode + '.png'
    figure_filepath = os.path.join(filepaths.plots_dir, heatmap_name)
    inspector.mutual_information_heatmap(figure_filepath)
def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Generate ``n`` synthetic rows using the correlated-attribute
    (Bayesian network) description stored in ``description_file``.

    Populates ``self.synthetic_dataset``; attributes absent from the encoded
    dataset and not candidate keys fall back to independent attribute mode.
    """
    set_random_seed(seed)
    self.n = n
    self.description = read_json_file(description_file)
    all_attributes = self.description['meta']['all_attributes']
    candidate_keys = set(self.description['meta']['candidate_keys'])

    self.encoded_dataset = DataGenerator.generate_encoded_dataset(
        self.n, self.description)

    self.synthetic_dataset = DataFrame(columns=all_attributes)
    for attr in all_attributes:
        column = parse_json(self.description['attribute_description'][attr])
        if attr in self.encoded_dataset:
            # Attribute was sampled by the Bayesian network: decode bins.
            values = column.sample_values_from_binning_indices(
                self.encoded_dataset[attr])
        elif attr in candidate_keys:
            values = column.generate_values_as_candidate_key(n)
        else:
            # Not in the BN and not a key: sample independently.
            bin_indices = (
                column.sample_binning_indices_in_independent_attribute_mode(n))
            values = column.sample_values_from_binning_indices(bin_indices)
        self.synthetic_dataset[attr] = values
def generate_dataset_in_random_mode(self, n, description_file, seed=0):
    """Generate ``n`` synthetic rows by sampling each attribute uniformly
    at random from its described domain, independent of the input data.

    Populates ``self.synthetic_dataset``. ``random`` here is assumed to be
    ``numpy.random`` (3-argument ``randint``/``uniform``/``choice``) — the
    integer upper bound is passed as ``maximum + 1`` because numpy's
    ``randint`` excludes the high end.
    """
    set_random_seed(seed)
    description = read_json_file(description_file)
    self.synthetic_dataset = DataFrame()
    for attr, attr_info in description['attribute_description'].items():
        datatype = attr_info['data_type']
        is_categorical = attr_info['is_categorical']
        is_candidate_key = attr_info['is_candidate_key']
        minimum = attr_info['min']
        maximum = attr_info['max']
        # Degenerate domain (min == max) yields a constant column.
        static_num = attr_info['min'] if minimum == maximum else None

        if is_candidate_key:
            self.synthetic_dataset[attr] = parse_json(
                attr_info).generate_values_as_candidate_key(n)
        elif is_categorical:
            self.synthetic_dataset[attr] = random.choice(
                attr_info['distribution_bins'], n)
        elif datatype == 'String':
            # BUG FIX: `static_num or ...` treated a constant length of 0 as
            # falsy and fell through to random sampling; test against None.
            if static_num is None:
                length = random.randint(minimum, maximum)
            else:
                length = static_num
            self.synthetic_dataset[attr] = length
            self.synthetic_dataset[attr] = self.synthetic_dataset[attr].map(
                lambda x: generate_random_string(x))
        elif datatype == 'Integer':
            # Same truthiness fix: a constant value of 0 must stay constant.
            if static_num is None:
                self.synthetic_dataset[attr] = random.randint(
                    minimum, maximum + 1, n)
            else:
                self.synthetic_dataset[attr] = static_num
        else:
            if static_num is None:
                self.synthetic_dataset[attr] = random.uniform(
                    minimum, maximum, n)
            else:
                self.synthetic_dataset[attr] = static_num
def describe_dataset_in_random_mode(
        self,
        dataset_file: str,
        attribute_to_datatype: Dict[str, DataType] = None,
        attribute_to_is_categorical: Dict[str, bool] = None,
        attribute_to_is_candidate_key: Dict[str, bool] = None,
        categorical_attribute_domain_file: str = None,
        numerical_attribute_ranges: Dict[str, List] = None,
        seed=0):
    """Describe ``dataset_file`` for random-mode synthesis.

    Reads the dataset, infers data types and per-attribute domains (using
    caller-supplied categorical domains / numerical ranges when given), and
    records each attribute's description in ``self.data_description``.
    """
    # Normalise every optional mapping to an empty dict.
    attribute_to_datatype = attribute_to_datatype or {}
    attribute_to_is_categorical = attribute_to_is_categorical or {}
    attribute_to_is_candidate_key = attribute_to_is_candidate_key or {}
    numerical_attribute_ranges = numerical_attribute_ranges or {}

    if categorical_attribute_domain_file:
        categorical_attribute_to_domain = utils.read_json_file(
            categorical_attribute_domain_file)
    else:
        categorical_attribute_to_domain = {}

    utils.set_random_seed(seed)
    self.attr_to_datatype = {
        name: DataType(dt) for name, dt in attribute_to_datatype.items()
    }
    self.attr_to_is_categorical = attribute_to_is_categorical
    self.attr_to_is_candidate_key = attribute_to_is_candidate_key

    # Standard describe pipeline: load, infer, analyse, columnise.
    self.read_dataset_from_csv(dataset_file)
    self.infer_attribute_data_types()
    self.analyze_dataset_meta()
    self.represent_input_dataset_by_columns()

    for column in self.attr_to_column.values():
        name = column.name
        if name in categorical_attribute_to_domain:
            column.infer_domain(
                categorical_domain=categorical_attribute_to_domain[name])
        elif name in numerical_attribute_ranges:
            column.infer_domain(
                numerical_range=numerical_attribute_ranges[name])
        else:
            column.infer_domain()

    # Record attribute information in JSON format.
    self.data_description['attribute_description'] = {
        attr: column.to_json()
        for attr, column in self.attr_to_column.items()
    }
def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
    """Generate ``n`` synthetic rows sampling every attribute independently
    from its described distribution; candidate keys get unique values."""
    set_random_seed(seed)
    self.description = read_json_file(description_file)
    all_attributes = self.description['meta']['all_attributes']
    candidate_keys = set(self.description['meta']['candidate_keys'])

    self.synthetic_dataset = DataFrame(columns=all_attributes)
    for attr in all_attributes:
        column = parse_json(self.description['attribute_description'][attr])
        if attr in candidate_keys:
            values = column.generate_values_as_candidate_key(n)
        else:
            bin_indices = (
                column.sample_binning_indices_in_independent_attribute_mode(n))
            values = column.sample_values_from_binning_indices(bin_indices)
        self.synthetic_dataset[attr] = values
def synthetize():
    """End-to-end correlated-attribute-mode demo on the census dataset:
    describe the input data, generate synthetic data, save both, then
    compare input vs. synthetic statistics with ModelInspector."""
    # get_ipython().run_line_magic('matplotlib', 'auto')

    # Input dataset and the two output file locations.
    input_data = './census/adult_data.csv'
    mode = 'correlated_attribute_mode'
    description_file = f'./census/out/{mode}/description.json'
    synthetic_data = f'./census/out/{mode}/sythetic_data.csv'

    # An attribute is categorical if its domain size is below this threshold.
    # 20 accommodates the domain size of "education" (14 in the input data).
    threshold_value = 20

    # Explicitly categorical attributes.
    categorical_attributes = {'education': True, 'native-country': True}

    # Candidate keys of the input dataset.
    candidate_keys = {'ssn': True}

    # Differential-privacy budget: removing one input row changes output
    # probabilities by at most a factor of exp(epsilon). Larger epsilon means
    # less noise; epsilon=0 disables differential privacy.
    epsilon = 0.1

    # Maximum number of parents (incoming edges) in the Bayesian network.
    degree_of_bayesian_network = 2

    # Number of synthetic tuples; 32561 matches the input dataset size.
    num_tuples_to_generate = 32561

    # --- Step 3: DataDescriber -------------------------------------------
    # Instantiate, compute dataset statistics, save the description.
    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=epsilon,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys)
    describer.save_dataset_description_to_file(description_file)
    display_bayesian_network(describer.bayesian_network)

    # --- Step 4: generate the synthetic dataset --------------------------
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate, description_file)
    generator.save_synthetic_data(synthetic_data)

    # --- Step 5: compare input vs. synthetic statistics (optional) -------
    # 5.1 Build a ModelInspector from both datasets and the description.
    input_df = pd.read_csv(input_data, skipinitialspace=True)
    synthetic_df = pd.read_csv(synthetic_data)
    attribute_description = read_json_file(
        description_file)['attribute_description']
    inspector = ModelInspector(input_df, synthetic_df, attribute_description)

    # 5.2 Compare per-attribute histograms.
    for plot_id, attribute in enumerate(synthetic_df.columns):
        inspector.compare_histograms(attribute, plot_id)

    # 5.3 Compare pairwise mutual information.
    inspector.mutual_information_heatmap()
def generate_data(username):
    """Run the per-user synthesis pipeline driven by ``<username>_parameters.json``.

    Reads the user's configuration and input CSV, describes the dataset in
    the configured mode (mode1=random, mode2=independent, mode3=correlated),
    generates the synthetic dataset, and writes description + synthetic CSV
    to ``<username>_description.json`` / ``<username>_synthetic_data.csv``.
    """
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)

    # Flag each attribute as candidate key / categorical per the config.
    attribute_to_is_candidate = {
        attr: attr in configuration['candidate_atts']
        for attr in initial_dataset_info['attribute_list']
    }
    attribute_to_is_categorical = {
        attr: attr in configuration['categorical_atts']
        for attr in initial_dataset_info['attribute_list']
    }

    # Empty config fields fall back to defaults.
    if configuration['tuple_n'] == '':
        n = initial_dataset_info['number_of_tuples']
    else:
        n = int(configuration['tuple_n'])
    if configuration['seed'] == '':
        seed = 0
    else:
        seed = int(configuration['seed'])

    generator = DataGenerator()
    if configuration['chose_mode'] == 'mode1':
        describer = DataDescriber()
        # BUG FIX: `seed` was previously passed as the 5th positional
        # argument, which is `categorical_attribute_domain_file` in
        # describe_dataset_in_random_mode's signature — the seed was
        # silently dropped (and a non-zero seed would be treated as a JSON
        # path). Pass it by keyword so it lands on the `seed` parameter.
        describer.describe_dataset_in_random_mode(
            input_dataset_file, {}, attribute_to_is_categorical,
            attribute_to_is_candidate, seed=seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:
        if configuration['histogram_size'] == '':
            histogram_size = 20
        else:
            histogram_size = int(configuration['histogram_size'])
        if configuration['epsilon'] == '':
            epsilon = 0.1
        else:
            epsilon = float(configuration['epsilon'])
        attribute_to_datatype = configuration['type_atts']
        describer = DataDescriber(histogram_size)
        if configuration['chose_mode'] == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif configuration['chose_mode'] == 'mode3':
            if configuration['max_degree'] == '':
                max_degree = 3
            else:
                max_degree = int(configuration['max_degree'])
            # Positional order assumed to be (dataset_file, k, epsilon, ...)
            # — TODO confirm against DataDescriber's signature.
            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon,
                attribute_to_datatype, attribute_to_is_categorical,
                attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)
    generator.save_synthetic_data(synthetic_dataset_file)
def get_drawable_attributes(plot_json_file):
    """Return the attributes that have barchart or histogram plot data."""
    plot_data = read_json_file(plot_json_file)
    drawable = list(plot_data['barchart'])
    drawable.extend(plot_data['histogram'])
    return drawable
def get_categorical_attributes(plot_json_file):
    """Return the attributes that have barchart (categorical) plot data."""
    return list(read_json_file(plot_json_file)['barchart'])
def res_json_processing_plot(request):
    """Django view: serve the session's dataset plot JSON as an HTTP response."""
    passed_data_name = request.session.get('passed_data_name')
    plot_json = read_json_file(passed_data_name + "_plot.json")
    return HttpResponse(json.dumps(plot_json),
                        content_type='application/json')