def get_dataset_info(file_name):
    """Profile *file_name* and summarize its attribute metadata.

    Runs DataDescriber in independent-attribute mode (no correlations needed
    for a profile) and returns a dict with:
      - 'candidate_attributes': attributes flagged as candidate keys
      - 'categorical_attributes': attributes flagged as categorical
      - 'attribute_datatypes': attribute -> detected data type
      - 'number_of_tuples': row count of the dataset
      - 'attribute_list': all attribute names
    """
    describer = DataDescriber()
    describer.describe_dataset_in_independent_attribute_mode(file_name)
    description = describer.dataset_description

    dataset_info = {
        'candidate_attributes': [],
        'categorical_attributes': [],
        'attribute_datatypes': {},
        'number_of_tuples': description['meta']['num_tuples'],
        'attribute_list': description['meta']['all_attributes'],
    }
    # Iterate items() directly instead of re-indexing the dict per attribute.
    for attribute, info in description['attribute_description'].items():
        if info['is_candidate_key']:
            dataset_info['candidate_attributes'].append(attribute)
        if info['is_categorical']:
            dataset_info['categorical_attributes'].append(attribute)
        dataset_info['attribute_datatypes'][attribute] = info['data_type']
    return dataset_info
def test_datasynthesizer():
    """End-to-end sanity check of DataSynthesizer on the tiny adult sample.

    Describes the input in correlated-attribute (PrivBayes) mode, generates
    one correlated and one random synthetic dataset, then asserts that
    (a) per-column distributions of the correlated output stay close to the
    input, and (b) the correlated output preserves pairwise mutual
    information far better than the random baseline.
    """
    base_dir = Path(__file__).parent / 'data'
    source_csv = base_dir / 'adult_tiny.csv'
    description_json = base_dir / 'description.json'
    correlated_csv = base_dir / 'output.csv'
    random_csv = base_dir / 'output_uniform.csv'

    category_threshold = 20
    categorical_attributes = {'education': True}
    privacy_budget = 1
    network_degree = 2
    synthetic_rows = 10000

    profiler = DataDescriber(category_threshold=category_threshold)
    profiler.describe_dataset_in_correlated_attribute_mode(
        dataset_file=source_csv,
        epsilon=privacy_budget,
        k=network_degree,
        attribute_to_is_categorical=categorical_attributes)
    profiler.save_dataset_description_to_file(description_json)

    synthesizer = DataGenerator()
    synthesizer.generate_dataset_in_correlated_attribute_mode(
        synthetic_rows, description_json)
    synthesizer.save_synthetic_data(correlated_csv)
    synthesizer.generate_dataset_in_random_mode(synthetic_rows,
                                                description_json)
    synthesizer.save_synthetic_data(random_csv)

    real = pd.read_csv(source_csv, skipinitialspace=True)
    correlated = pd.read_csv(correlated_csv)
    random_synth = pd.read_csv(random_csv)

    # Numeric 'age' is compared with a KS statistic; everything else with KL.
    for column in real:
        if column == 'age':
            assert ks_test(real, correlated, column) < 0.1
        else:
            assert kl_test(real, correlated, column) < 0.01

    real_mi = pairwise_attributes_mutual_information(real)
    correlated_mi = pairwise_attributes_mutual_information(correlated)
    random_mi = pairwise_attributes_mutual_information(random_synth)

    correlated_gap = (correlated_mi - real_mi).abs().sum().sum()
    random_gap = (random_mi - real_mi).abs().sum().sum()
    assert correlated_gap < 5 * random_gap
def _int_or_default(raw, default):
    """Return *raw* parsed as int, or *default* when it is the empty string."""
    return default if raw == '' else int(raw)


def generate_data(username):
    """Synthesize a dataset for *username* according to their saved parameters.

    Reads '<username>_parameters.json' and '<username>.csv', writes the
    dataset description to '<username>_description.json' and the synthetic
    data to '<username>_synthetic_data.csv'.

    'chose_mode' selects the synthesis strategy:
      - 'mode1': random mode (input statistics not preserved)
      - 'mode2': independent-attribute mode
      - 'mode3': correlated-attribute (PrivBayes) mode
    """
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)

    # Membership flags for every attribute of the input dataset.
    attribute_to_is_candidate = {
        attr: attr in configuration['candidate_atts']
        for attr in initial_dataset_info['attribute_list']}
    attribute_to_is_categorical = {
        attr: attr in configuration['categorical_atts']
        for attr in initial_dataset_info['attribute_list']}

    # Empty configuration fields fall back to sensible defaults.
    n = _int_or_default(configuration['tuple_n'],
                        initial_dataset_info['number_of_tuples'])
    seed = _int_or_default(configuration['seed'], 0)

    generator = DataGenerator()
    if configuration['chose_mode'] == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(input_dataset_file, {},
                                                  attribute_to_is_categorical,
                                                  attribute_to_is_candidate,
                                                  seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:
        histogram_size = _int_or_default(configuration['histogram_size'], 20)
        epsilon = (10 if configuration['epsilon'] == ''
                   else float(configuration['epsilon']))
        attribute_to_datatype = configuration['type_atts']
        describer = DataDescriber(histogram_size)
        if configuration['chose_mode'] == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif configuration['chose_mode'] == 'mode3':
            max_degree = _int_or_default(configuration['max_degree'], 3)
            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon,
                attribute_to_datatype, attribute_to_is_categorical,
                attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
def main():
    """CLI entry point: build a DP synthetic copy of a chosen dataset.

    Selects one of four known datasets (--data), describes it with
    DataSynthesizer in the requested mode (--mode) under the given privacy
    budget (--epsilon), generates synthetic data, re-attaches the original
    'y' labels, and writes both the raw synthetic CSV and the label-modified
    CSV under the dataset's synth directory.

    Fix vs. previous version: the generator now matches --mode; previously
    generate_dataset_in_random_mode was always called, so 'independent' and
    'correlated' runs silently produced random-mode data.
    """
    parser = argparse.ArgumentParser(
        description="DP Data Synthesizer - PriveBayes")
    parser.add_argument(
        "-m", "--mode",
        type=str,
        default="correlated",
        help="Synthesizer Mode: 'independent', 'correlated', 'random'",
    )
    parser.add_argument(
        "-d", "--data",
        type=str,
        default="bank",
        help="Dataset ('bank' for bank dataset, 'adult' for adult dataset)",
    )
    parser.add_argument(
        "-e", "--epsilon",
        type=float,
        default=0.1,
        help="Noise parameter (default = 0.1)",
    )
    args = parser.parse_args()

    if args.data == 'bank':
        input_file = './bank-data/bank-additional-full.csv'
        cols = [
            'age', 'job', 'marital', 'education', 'default', 'housing',
            'loan', 'contact', 'month', 'day_of_week', 'duration',
            'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
            'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
            'y'
        ]
        # NOTE(review): names= with no header= treats any header row as data
        # — confirm the file really has no header.
        df = pd.read_csv(input_file, sep=';', names=cols)
        categorical_columns = [
            'job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'poutcome'
        ]
        for category in categorical_columns:
            df[category] = df[category].astype('object')
        # specify categorical attributes
        categorical_attributes = {
            'age': True, 'job': True, 'marital': True, 'education': True,
            'default': True, 'housing': True, 'loan': True, 'contact': True,
            'month': True, 'day_of_week': True, 'poutcome': True, 'y': True
        }
        output_dir = './bank-data/synth'
        sep = ';'
    elif args.data == 'adult':
        input_file = './adult-data/adult.data'
        cols = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week',
            'native-country', 'y'
        ]
        df = pd.read_csv(input_file, sep=',', names=cols)
        categorical_columns = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country'
        ]
        for category in categorical_columns:
            df[category] = df[category].astype('object')
        categorical_attributes = {
            'workclass': True, 'education': True, 'marital-status': True,
            'occupation': True, 'relationship': True, 'race': True,
            'sex': True, 'native-country': True, 'y': True
        }
        output_dir = './adult-data/synth'
        sep = ','
    elif args.data == 'german':
        input_file = './german-data/german.train'
        cols = [
            'existing_checking', 'duration', 'credit_history', 'purpose',
            'credit_amount', 'savings', 'employment_since',
            'installment_rate', 'status_sex', 'other_debtors',
            'residence_since', 'property', 'age', 'other_installment_plans',
            'housing', 'existing_credits', 'job', 'people_liable',
            'telephone', 'foreign_worker', 'y'
        ]
        df = pd.read_csv(input_file, sep=' ', names=cols)
        categorical_columns = [
            'existing_checking', 'credit_history', 'purpose', 'savings',
            'employment_since', 'status_sex', 'other_debtors', 'property',
            'other_installment_plans', 'housing', 'job', 'telephone',
            'foreign_worker'
        ]
        for category in categorical_columns:
            df[category] = df[category].astype('object')
        categorical_attributes = {
            'existing_checking': True, 'credit_history': True,
            'purpose': True, 'savings': True, 'employment_since': True,
            'status_sex': True, 'other_debtors': True, 'property': True,
            'other_installment_plans': True, 'housing': True, 'job': True,
            'telephone': True, 'foreign_worker': True, 'y': True
        }
        output_dir = './german-data/synth'
        sep = ' '
    elif args.data == 'home':
        input_file = './home-data/hcdf_train.csv'
        df = pd.read_csv(input_file, sep=',', header=0)
        df = df.rename(columns={"TARGET": "y", "CODE_GENDER": "GENDER"})
        categorical_columns = [
            "NAME_CONTRACT_TYPE", "GENDER", "FLAG_OWN_REALTY",
            "NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE",
            "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "OCCUPATION_TYPE",
            "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE",
            "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE",
            "EMERGENCYSTATE_MODE", "y"
        ]
        for category in categorical_columns:
            df[category] = df[category].astype('object')
        categorical_attributes = {
            "NAME_CONTRACT_TYPE": True, "GENDER": True,
            "FLAG_OWN_REALTY": True, "NAME_TYPE_SUITE": True,
            "NAME_INCOME_TYPE": True, "NAME_EDUCATION_TYPE": True,
            "NAME_FAMILY_STATUS": True, "NAME_HOUSING_TYPE": True,
            "OCCUPATION_TYPE": True, "WEEKDAY_APPR_PROCESS_START": True,
            "ORGANIZATION_TYPE": True, "FONDKAPREMONT_MODE": True,
            "HOUSETYPE_MODE": True, "WALLSMATERIAL_MODE": True,
            "EMERGENCYSTATE_MODE": True, "y": True
        }
        output_dir = './home-data/synth'
        sep = ','
    else:
        # Fail fast instead of hitting a NameError on `df` below.
        parser.error("unknown dataset: {!r}".format(args.data))

    # input to DataSynthetizer must be comma separated. Create a temp file.
    df.to_csv('comma_data.csv', sep=',')
    input_data = 'comma_data.csv'

    description_file = output_dir + '/description' + args.mode + '_' + str(
        args.epsilon) + '.json'

    # An attribute is categorical if its domain size is less than this threshold.
    # Here modify the threshold to adapt to the domain size of "education" (which is 14 in input dataset).
    threshold_value = 20
    # specify which attributes are candidate keys of input dataset.
    candidate_keys = {'ssn': True}
    # The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
    degree_of_bayesian_network = 2
    # Number of tuples generated in synthetic dataset.
    num_tuples_to_generate = len(df)

    describer = DataDescriber(category_threshold=threshold_value)
    generator = DataGenerator()

    # Describe AND generate with the mode the user asked for.
    if args.mode == 'independent':
        synthetic_data = output_dir + '/syth_data_independent_' + str(
            args.epsilon) + '.csv'
        save_path = output_dir + '/syth_data_independent_ymod_' + str(
            args.epsilon) + '.csv'
        describer.describe_dataset_in_independent_attribute_mode(
            dataset_file=input_data,
            attribute_to_is_categorical=categorical_attributes,
            attribute_to_is_candidate_key=candidate_keys)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_independent_mode(
            num_tuples_to_generate, description_file)
    elif args.mode == 'correlated':
        synthetic_data = output_dir + '/syth_data_correlated_' + str(
            args.epsilon) + '.csv'
        save_path = output_dir + '/syth_data_correlated_ymod_' + str(
            args.epsilon) + '.csv'
        describer.describe_dataset_in_correlated_attribute_mode(
            dataset_file=input_data,
            epsilon=args.epsilon,
            k=degree_of_bayesian_network,
            attribute_to_is_categorical=categorical_attributes,
            attribute_to_is_candidate_key=candidate_keys)
        describer.save_dataset_description_to_file(description_file)
        print(display_bayesian_network(describer.bayesian_network))
        generator.generate_dataset_in_correlated_attribute_mode(
            num_tuples_to_generate, description_file)
    else:
        synthetic_data = output_dir + '/syth_data_random_' + str(
            args.epsilon) + '.csv'
        save_path = output_dir + '/syth_data_random_ymod_' + str(
            args.epsilon) + '.csv'
        describer.describe_dataset_in_random_mode(input_data)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(num_tuples_to_generate,
                                                  description_file)

    generator.save_synthetic_data(synthetic_data)

    # Delete temporary file (comma separated df)
    if os.path.exists(input_data):
        os.remove(input_data)

    synth_df = pd.read_csv(synthetic_data, sep=',')
    # Replace the synthetic labels with the original ones.
    synth_df['y'] = df['y']
    # NOTE(review): label slice assumes the useful columns run from 'age' to
    # 'y' (drops the index column written by to_csv); for datasets where
    # 'age' is not the first attribute this also drops earlier columns —
    # confirm this is intended.
    save_df = synth_df.loc[:, 'age':'y']
    save_df.to_csv(save_path, sep=sep, index=False, header=None)
def __init__(self, *args, df_in=None, verbose=True, **kwargs):
    """Store wrapper-specific options, then delegate to DataDescriber.

    df_in: optional in-memory DataFrame kept on the instance
           (presumably used instead of re-reading a CSV — confirm at call sites).
    verbose: controls diagnostic output elsewhere in the class.
    All remaining positional/keyword arguments go to DataDescriber.__init__.
    """
    self.verbose, self.df_in = verbose, df_in
    DataDescriber.__init__(self, *args, **kwargs)