# Shared imports for the functions below (DataSynthesizer, the standard library and pandas).
# Project-specific names such as filepaths, model_config, config, colour and the various
# helper functions are assumed to be defined elsewhere in each source project.
import datetime
import json
import os
import time

import pandas as pd

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network


def describe_synthetic_data(mode: str, description_filepath: str):
    '''Describes the de-identified data and saves the description to the data/ directory.

    Keyword arguments:
    mode -- which DataSynthesizer mode to use ('random', 'independent' or 'correlated')
    description_filepath -- filepath the data description is saved to
    '''
    describer = DataDescriber()

    if mode == 'random':
        describer.describe_dataset_in_random_mode(
            filepaths.hospital_ae_data_deidentify,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical)

    elif mode == 'independent':
        describer.describe_dataset_in_independent_attribute_mode(
            filepaths.hospital_ae_data_deidentify,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical)

    elif mode == 'correlated':
        # Increasing epsilon reduces the injected noise. We're not using
        # differential privacy in this tutorial, so we set epsilon=0 to
        # turn it off entirely.
        epsilon = 0
        # The maximum number of parents in the Bayesian network,
        # i.e. the maximum number of incoming edges per node.
        degree_of_bayesian_network = 1
        describer.describe_dataset_in_correlated_attribute_mode(
            dataset_file=filepaths.hospital_ae_data_deidentify,
            epsilon=epsilon,
            k=degree_of_bayesian_network,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical)
            # attribute_to_is_candidate_key=attribute_to_is_candidate_key)

    describer.save_dataset_description_to_file(description_filepath)
def describe_synthetic_data(mode: str, description_filepath: str, data_filepath: str, candidate_keys: object):
    '''Describes the input data and saves the description to the data/ directory.

    Keyword arguments:
    mode -- which DataSynthesizer mode to use ('random', 'independent' or 'correlated')
    description_filepath -- filepath the data description is saved to
    data_filepath -- filepath of the input dataset to describe
    candidate_keys -- mapping of attribute name to whether it is a candidate key
    '''
    describer = DataDescriber()

    if mode == 'random':
        describer.describe_dataset_in_random_mode(
            data_filepath,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical,
            attribute_to_is_candidate_key=candidate_keys)

    elif mode == 'independent':
        describer.describe_dataset_in_independent_attribute_mode(
            data_filepath,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical,
            attribute_to_is_candidate_key=candidate_keys)

    elif mode == 'correlated':
        # Differential-privacy and Bayesian-network settings come from model_config.
        epsilon = model_config.CORRELATED_EPSILON_VALUE
        degree_of_bayesian_network = model_config.CORRELATED_DEGREE_OF_BAYESIAN_NETWORK
        describer.describe_dataset_in_correlated_attribute_mode(
            dataset_file=data_filepath,
            epsilon=epsilon,
            k=degree_of_bayesian_network,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical,
            attribute_to_is_candidate_key=candidate_keys)

    describer.save_dataset_description_to_file(description_filepath)
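# A minimal sketch of the module-level mappings the two describe_synthetic_data variants
# above rely on. The attribute names and types here are illustrative assumptions, not the
# project's real schema; DataSynthesizer accepts datatype strings such as 'Integer',
# 'Float', 'String' and 'DateTime'.
attribute_to_datatype = {
    'Age bracket': 'Integer',
    'Treatment': 'String',
    'Arrival Date': 'DateTime',
}
attribute_is_categorical = {
    'Age bracket': True,
    'Treatment': True,
    'Arrival Date': False,
}
# Example call against those mappings (file paths are placeholders):
# describe_synthetic_data(
#     'correlated',
#     'data/hospital_ae_description_correlated.json',
#     'data/hospital_ae_data_deidentified.csv',
#     candidate_keys={'Attendance ID': True})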
def get_synthetic_data(df):
    # An attribute is treated as categorical if its domain size is less than this threshold.
    threshold_value = 1
    # Specify categorical attributes explicitly (empty: rely on the threshold above).
    categorical_attributes = {}
    # Specify which attributes are candidate keys of the input dataset.
    candidate_keys = {}
    # A parameter in differential privacy. It roughly means that removing a row from the
    # input dataset will not change the probability of getting the same output by more
    # than a multiplicative factor of exp(epsilon). Increase epsilon to reduce the
    # injected noise; set epsilon=0 to turn off differential privacy.
    epsilon = 0.1
    # The maximum number of parents in the Bayesian network, i.e. the maximum number of incoming edges.
    degree_of_bayesian_network = 2
    # Generate 20x as many rows as the input dataset.
    num_tuples_to_generate = len(df) * 20

    # Temporary working files (assumed names; the original may define the output
    # paths at module level instead).
    input_data = 'temp_train.csv'
    description_file = 'temp_description.json'
    synthetic_data = 'temp_synthetic_data.csv'
    df.to_csv(input_data, index=False)

    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=epsilon,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys)
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
    generator.save_synthetic_data(synthetic_data)

    synth_data = pd.read_csv(synthetic_data)
    return synth_data
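# Example usage of get_synthetic_data (a sketch; 'train.csv' is a placeholder file name):
#
#     train_df = pd.read_csv('train.csv')
#     synth_df = get_synthetic_data(train_df)   # roughly 20x the rows of train_df
#     synth_df.to_csv('train_synthetic.csv', index=False)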
def get_dataset_info(file_name):
    d = DataDescriber()
    d.describe_dataset_in_independent_attribute_mode(file_name)

    dataset_info = {
        'candidate_attributes': [],
        'categorical_attributes': [],
        'attribute_datatypes': {},
        'number_of_tuples': d.data_description['meta']['num_tuples'],
        'attribute_list': d.data_description['meta']['all_attributes']
    }

    for attribute in d.data_description['attribute_description']:
        current_attribute_info = d.data_description['attribute_description'][attribute]
        if current_attribute_info['is_candidate_key']:
            dataset_info['candidate_attributes'].append(attribute)
        if current_attribute_info['is_categorical']:
            dataset_info['categorical_attributes'].append(attribute)
        dataset_info['attribute_datatypes'][attribute] = current_attribute_info['data_type']

    return dataset_info
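# For orientation, get_dataset_info returns a plain dict shaped roughly like the example
# below. The attribute names and datatypes are illustrative, not taken from a real file:
#
#     {
#         'candidate_attributes': ['id'],
#         'categorical_attributes': ['education', 'sex'],
#         'attribute_datatypes': {'id': 'Integer', 'age': 'Integer', 'education': 'String'},
#         'number_of_tuples': 32561,
#         'attribute_list': ['id', 'age', 'education', 'sex']
#     }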
def run_trainer(mode_choice):
    prompt = ('\nWhat would you like to call your new table?\n(Will be prefixed with "'
              + table_choice + '_' + mode_choice + '_")\n\n')
    name_choice = input(prompt)
    if name_choice:
        table_name = table_choice + '_' + mode_choice + '_' + name_choice
    else:
        table_name = table_choice + '_' + mode_choice

    print('\nWhat version would you like to call the dataset ' + table_name +
          '?\n(Please check previous version in dataset_processing_meta.csv sheet)\n')
    version_input = input()
    print('\nAny additional notes about this version?\n')
    notes_input = input()

    start = time.time()
    status_update(start)

    if not os.path.exists(temp_file_path):
        results = read_entries(db_input, table_choice)
        generate_csv(db_input, table_choice, results, temp_file_path, ignore_synth_columns)

    db_output = open_database(config.DATABASES['synth'], table_name)
    db_output.drop_table(table_name, with_all_data=True)
    db_output.create_tables()

    describer = DataDescriber()
    data_df = pd.read_csv(temp_file_path)
    num_rows = len(data_df)
    save_file_name = table_choice + '_' + name_choice

    print('describing synthetic data for', mode_choice, 'mode...')
    describe_synthetic_data(
        mode_choice,
        mode_filepaths(mode_choice, 'description', save_file_name),
        temp_file_path,
        candidate_keys)

    print('generating synthetic data for', mode_choice, 'mode...')
    generate_synthetic_data(
        mode_choice,
        num_rows,
        mode_filepaths(mode_choice, 'description', save_file_name),
        mode_filepaths(mode_choice, 'data', save_file_name))

    print('saving synthetic data to database for', mode_choice, 'mode...')
    save_synthetic_data(
        mode_filepaths(mode_choice, 'data', save_file_name),
        db_output,
        table_name)

    print('comparing histograms for', mode_choice, 'mode...')
    inspector = compare_histograms(
        table_name,
        data_df,
        mode_filepaths(mode_choice, 'description', save_file_name),
        mode_filepaths(mode_choice, 'data', save_file_name))

    print('comparing pairwise mutual information for', mode_choice, 'mode...')
    compare_pairwise_mutual_information(table_name, inspector)

    end = time.time()
    elapsed = round(end - start, 2)
    print('done in ' + str(elapsed) + ' seconds.')

    # Count how many columns of each type the chosen data model declares.
    str_cat = 0
    str_not_cat = 0
    int_cat = 0
    int_not_cat = 0
    datetimes = 0
    with open('./data_models/' + attr_choice + '.json') as json_file:
        fields = json.load(json_file)
        for row in fields:
            if fields[row][0] == "str_cat":
                str_cat += 1
            elif fields[row][0] == "str_not_cat":
                str_not_cat += 1
            elif fields[row][0] == "int_cat":
                int_cat += 1
            elif fields[row][0] == "int_not_cat":
                int_not_cat += 1
            elif fields[row][0] == "datetime_not_cat":
                datetimes += 1

    # Metadata row appended to the processing-meta CSV; positions 13 and 14 hold the
    # epsilon and Bayesian-network degree when the correlated mode is used.
    entry_csv = [
        table_name, table_choice, 'synth_' + mode_choice,
        get_table_size(db_output, table_name),
        len(select_columns(db_output, table_name)),
        count_rows(db_output, table_name),
        datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
        elapsed, round(start), round(end),
        config.EC2_INSTANCE_TYPE, version_input, notes_input,
        '', '',
        str_cat, str_not_cat, int_cat, int_not_cat, datetimes
    ]
    if mode_choice == 'correlated':
        entry_csv[13] = model_config.CORRELATED_EPSILON_VALUE
        entry_csv[14] = model_config.CORRELATED_DEGREE_OF_BAYESIAN_NETWORK
    append_list_as_row(config.SYNTH_META_CSV_PATH, entry_csv)

    process_timer.cancel()
    print("\n%s%sProcessed %s%s%s data into %s%s%s table in the %ssynth_datasets%s database!%s\n"
          % (colour.BOLD, colour.GREEN, colour.DARKCYAN, table_choice, colour.GREEN,
             colour.DARKCYAN, table_name, colour.GREEN, colour.DARKCYAN, colour.GREEN, colour.END))
def synthetize():
    # get_ipython().run_line_magic('matplotlib', 'auto')

    # input dataset
    input_data = './census/adult_data.csv'
    # location of two output files
    mode = 'correlated_attribute_mode'
    description_file = f'./census/out/{mode}/description.json'
    synthetic_data = f'./census/out/{mode}/synthetic_data.csv'

    # An attribute is categorical if its domain size is less than this threshold.
    # The threshold is chosen to cover the domain size of "education"
    # (which is 14 in the input dataset).
    threshold_value = 20
    # specify categorical attributes
    categorical_attributes = {'education': True, 'native-country': True}
    # specify which attributes are candidate keys of the input dataset
    candidate_keys = {'ssn': True}
    # A parameter in differential privacy. It roughly means that removing a row from the
    # input dataset will not change the probability of getting the same output by more
    # than a multiplicative factor of exp(epsilon). Increase epsilon to reduce the
    # injected noise; set epsilon=0 to turn off differential privacy.
    epsilon = 0.1
    # The maximum number of parents in the Bayesian network, i.e. the maximum number of incoming edges.
    degree_of_bayesian_network = 2
    # Number of tuples generated in the synthetic dataset.
    # Here 32561 matches the size of the input dataset, but it can be set to another number.
    num_tuples_to_generate = 32561

    # ### Step 3: DataDescriber
    # 1. Instantiate a DataDescriber.
    # 2. Compute the statistics of the dataset.
    # 3. Save the dataset description to a file on the local machine.
    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=epsilon,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys)
    describer.save_dataset_description_to_file(description_file)

    display_bayesian_network(describer.bayesian_network)

    # ### Step 4: generate the synthetic dataset
    # 1. Instantiate a DataGenerator.
    # 2. Generate a synthetic dataset.
    # 3. Save it to the local machine.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
    generator.save_synthetic_data(synthetic_data)

    # ### Step 5: compare the statistics of the input and synthetic data (optional)
    # The synthetic data is already saved to a file by Step 4. The ModelInspector provides
    # a quick test of the similarity between the input and synthetic datasets.

    # #### 5.1 Instantiate a ModelInspector.
    # It needs the input dataset, the synthetic dataset, and the attribute description.

    # Read both datasets using pandas.
    input_df = pd.read_csv(input_data, skipinitialspace=True)
    synthetic_df = pd.read_csv(synthetic_data)
    # Read the attribute description from the dataset description file.
    attribute_description = read_json_file(description_file)['attribute_description']

    inspector = ModelInspector(input_df, synthetic_df, attribute_description)

    # #### 5.2 Compare histograms between the input and synthetic datasets.
    plot_id = 0
    for attribute in synthetic_df.columns:
        inspector.compare_histograms(attribute, plot_id)
        plot_id += 1

    # #### 5.3 Compare pairwise mutual information.
    inspector.mutual_information_heatmap()
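# A minimal driver sketch for synthetize(), not part of the original script: the output
# directory under ./census/out/ has to exist before DataSynthesizer writes the description
# and synthetic CSV there, so it is created first. The entry-point guard is an assumption.
if __name__ == '__main__':
    os.makedirs('./census/out/correlated_attribute_mode', exist_ok=True)
    synthetize()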
def generate_data(username):
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)

    attribute_to_is_candidate = {}
    for attr in initial_dataset_info['attribute_list']:
        if attr in configuration['candidate_atts']:
            attribute_to_is_candidate[attr] = True
        else:
            attribute_to_is_candidate[attr] = False

    attribute_to_is_categorical = {}
    for attr in initial_dataset_info['attribute_list']:
        if attr in configuration['categorical_atts']:
            attribute_to_is_categorical[attr] = True
        else:
            attribute_to_is_categorical[attr] = False

    if configuration['tuple_n'] == '':
        n = initial_dataset_info['number_of_tuples']
    else:
        n = int(configuration['tuple_n'])

    # if configuration['categorical_threshold'] == '':
    #     categorical_threshold = 10
    # else:
    #     categorical_threshold = int(configuration['categorical_threshold'])

    if configuration['seed'] == '':
        seed = 0
    else:
        seed = int(configuration['seed'])

    generator = DataGenerator()

    if configuration['chose_mode'] == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(
            input_dataset_file, {}, attribute_to_is_categorical,
            attribute_to_is_candidate, seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:
        if configuration['histogram_size'] == '':
            histogram_size = 20
        else:
            histogram_size = int(configuration['histogram_size'])

        if configuration['epsilon'] == '':
            epsilon = 0.1
        else:
            epsilon = float(configuration['epsilon'])

        attribute_to_datatype = configuration['type_atts']

        describer = DataDescriber(histogram_size)

        if configuration['chose_mode'] == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(n, description_file, seed)
        elif configuration['chose_mode'] == 'mode3':
            if configuration['max_degree'] == '':
                max_degree = 3
            else:
                max_degree = int(configuration['max_degree'])
            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
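# For reference, generate_data expects a '<username>_parameters.json' file with roughly the
# keys read above. This is an illustrative sketch (the attribute names and values are made
# up), written out once before calling generate_data('alice'); 'alice.csv' must also exist.
example_parameters = {
    'chose_mode': 'mode3',            # 'mode1' random, 'mode2' independent, 'mode3' correlated
    'tuple_n': '',                    # '' -> same number of tuples as the input dataset
    'seed': '0',
    'histogram_size': '20',
    'epsilon': '0.1',
    'max_degree': '2',                # only used in correlated mode
    'candidate_atts': ['id'],
    'categorical_atts': ['education', 'sex'],
    'type_atts': {'age': 'Integer', 'education': 'String'},
}

with open('alice_parameters.json', 'w') as f:
    json.dump(example_parameters, f, indent=2)
# generate_data('alice')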