Code example #1
def describe_synthetic_data(mode: str, description_filepath: str):
    '''
    Describes the input dataset and saves the description to description_filepath.

    Keyword arguments:
    mode -- which type of synthetic data to describe: 'random', 'independent' or 'correlated'
    description_filepath -- filepath the dataset description is saved to
    '''
    describer = DataDescriber()

    if mode == 'random':
        describer.describe_dataset_in_random_mode(
            filepaths.hospital_ae_data_deidentify,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical)

    elif mode == 'independent':
        describer.describe_dataset_in_independent_attribute_mode(
            filepaths.hospital_ae_data_deidentify,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical)

    elif mode == 'correlated':
        # epsilon controls the differential-privacy noise: larger values
        # mean less injected noise. This tutorial does not use differential
        # privacy, so epsilon=0 turns it off entirely.
        epsilon = 0

        # The maximum number of parents in Bayesian network
        # i.e., the maximum number of incoming edges.
        degree_of_bayesian_network = 1

        describer.describe_dataset_in_correlated_attribute_mode(
            dataset_file=filepaths.hospital_ae_data_deidentify,
            epsilon=epsilon,
            k=degree_of_bayesian_network,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical)

    describer.save_dataset_description_to_file(description_filepath)
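
For context, this first version pulls its inputs from module-level globals (filepaths, attribute_to_datatype, attribute_is_categorical) defined elsewhere in the project. A minimal, hypothetical sketch of how it might be called, with illustrative stand-ins for those globals:

import types
from DataSynthesizer.DataDescriber import DataDescriber

# Illustrative stand-ins for the project's module-level globals (assumptions).
filepaths = types.SimpleNamespace(
    hospital_ae_data_deidentify='data/hospital_ae_deidentified.csv')
attribute_to_datatype = {'Age': 'Integer', 'Arrival Date': 'DateTime'}
attribute_is_categorical = {'Age': True, 'Arrival Date': False}

describe_synthetic_data('independent', 'data/description_independent.json')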
Code example #2
def describe_synthetic_data(mode: str, description_filepath: str,
                            data_filepath: str, candidate_keys: object):
    '''
    Describes the input dataset and saves the description to description_filepath.

    Keyword arguments:
    mode -- which type of synthetic data to describe: 'random', 'independent' or 'correlated'
    description_filepath -- filepath the dataset description is saved to
    data_filepath -- filepath of the input dataset
    candidate_keys -- dict mapping attribute names to whether they are candidate keys
    '''
    describer = DataDescriber()

    if mode == 'random':
        describer.describe_dataset_in_random_mode(
            data_filepath,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical,
            attribute_to_is_candidate_key=candidate_keys)

    elif mode == 'independent':
        describer.describe_dataset_in_independent_attribute_mode(
            data_filepath,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical,
            attribute_to_is_candidate_key=candidate_keys)

    elif mode == 'correlated':
        epsilon = model_config.CORRELATED_EPSILON_VALUE

        degree_of_bayesian_network = model_config.CORRELATED_DEGREE_OF_BAYESIAN_NETWORK

        describer.describe_dataset_in_correlated_attribute_mode(
            dataset_file=data_filepath,
            epsilon=epsilon,
            k=degree_of_bayesian_network,
            attribute_to_datatype=attribute_to_datatype,
            attribute_to_is_categorical=attribute_is_categorical,
            attribute_to_is_candidate_key=candidate_keys)

    describer.save_dataset_description_to_file(description_filepath)
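
This version takes the input path and candidate-key mapping as arguments; correlated mode additionally reads epsilon and the Bayesian-network degree from a project model_config module. A hedged usage sketch with illustrative paths and keys (independent mode, to avoid the model_config dependency; the attribute dictionaries from the previous sketch are still assumed to be in scope):

candidate_keys = {'Hospital ID': True}  # assumed candidate-key mapping
describe_synthetic_data(
    mode='independent',
    description_filepath='data/description_independent.json',
    data_filepath='data/hospital_ae_deidentified.csv',
    candidate_keys=candidate_keys)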
Code example #3
File: CMM.py  Project: sagyome/forest_based_tree
def get_synthetic_data(df):
    # An attribute is categorical if its domain size is less than this threshold.
    # Note: this comment is inherited from the DataSynthesizer tutorial, where the
    # threshold is raised to cover "education" (domain size 14); here it is 1, so
    # the threshold alone marks no attribute as categorical.
    threshold_value = 1

    # specify categorical attributes
    categorical_attributes = {}

    # specify which attributes are candidate keys of input dataset.
    candidate_keys = {}

    # A parameter in differential privacy. It roughly means that removing a row
    # from the input dataset changes the probability of getting the same output
    # by at most a multiplicative factor of exp(epsilon).
    # Increasing epsilon reduces the injected noise; epsilon=0 turns
    # differential privacy off.
    epsilon = 0.1

    # The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
    degree_of_bayesian_network = 2

    num_tuples_to_generate = len(df) * 20  # generate 20x the input rows; any number works
    input_data = 'temp_train.csv'
    df.to_csv(input_data, index=False)
    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                            epsilon=epsilon,
                                                            k=degree_of_bayesian_network,
                                                            attribute_to_is_categorical=categorical_attributes,
                                                            attribute_to_is_candidate_key=candidate_keys)
    describer.save_dataset_description_to_file(description_file)
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
    generator.save_synthetic_data(synthetic_data)
    synth_data = pd.read_csv(synthetic_data)
    return synth_data
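
Note that description_file and synthetic_data are module-level paths defined elsewhere in CMM.py, and pandas is assumed to be imported as pd. A minimal usage sketch under those assumptions:

import pandas as pd

description_file = 'description.json'  # stand-ins for the module-level paths
synthetic_data = 'synthetic.csv'

df = pd.read_csv('train.csv')          # hypothetical input dataset
synth_df = get_synthetic_data(df)      # returns roughly 20x the input rows
print(synth_df.shape)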
Code example #4
def get_dataset_info(file_name):
    d = DataDescriber()
    d.describe_dataset_in_independent_attribute_mode(file_name)

    dataset_info = {
        'candidate_attributes': [],
        'categorical_attributes': [],
        'attribute_datatypes': {},
        'number_of_tuples': d.data_description['meta']['num_tuples'],
        'attribute_list': d.data_description['meta']['all_attributes']
    }

    for attribute in d.data_description['attribute_description']:
        current_attribute_info = d.data_description['attribute_description'][attribute]
        if current_attribute_info['is_candidate_key']:
            dataset_info['candidate_attributes'].append(attribute)
        if current_attribute_info['is_categorical']:
            dataset_info['categorical_attributes'].append(attribute)
        dataset_info['attribute_datatypes'][attribute] = current_attribute_info['data_type']

    return dataset_info
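
get_dataset_info runs a quick independent-attribute pass purely to harvest metadata about the input file. A short usage sketch (the CSV name is a placeholder):

info = get_dataset_info('input.csv')  # placeholder filename
print(info['number_of_tuples'])
print(info['categorical_attributes'])
print(info['attribute_datatypes'])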
Code example #5
def run_trainer(mode_choice):
    prompt = '\nWhat would you like to call your new table?\n(Will be prefixed with "' + table_choice + '_' + mode_choice + '_")\n\n'
    name_choice = input(prompt)
    if name_choice:
        table_name = table_choice + '_' + mode_choice + '_' + name_choice
    else:
        table_name = table_choice + '_' + mode_choice

    print(
        '\nWhat version would you like to call the dataset ' + table_name +
        '?\n(Please check previous version in dataset_processing_meta.csv sheet)\n'
    )
    version_input = input()

    print('\nAny additional notes about this version?\n')
    notes_input = input()

    start = time.time()
    status_update(start)

    if not os.path.exists(temp_file_path):
        results = read_entries(db_input, table_choice)
        generate_csv(db_input, table_choice, results, temp_file_path,
                     ignore_synth_columns)

    db_output = open_database(config.DATABASES['synth'], table_name)
    db_output.drop_table(table_name, with_all_data=True)
    db_output.create_tables()

    data_df = pd.read_csv(temp_file_path)
    num_rows = len(data_df)
    save_file_name = table_choice + '_' + name_choice

    print('describing synthetic data for', mode_choice, 'mode...')
    describe_synthetic_data(
        mode_choice, mode_filepaths(mode_choice, 'description',
                                    save_file_name), temp_file_path,
        candidate_keys)

    print('generating synthetic data for', mode_choice, 'mode...')
    generate_synthetic_data(
        mode_choice, num_rows,
        mode_filepaths(mode_choice, 'description', save_file_name),
        mode_filepaths(mode_choice, 'data', save_file_name))

    print('saving synthetic data to database for', mode_choice, 'mode...')
    save_synthetic_data(mode_filepaths(mode_choice, 'data', save_file_name),
                        db_output, table_name)

    print('comparing histograms for', mode_choice, 'mode...')
    inspector = compare_histograms(
        table_name, data_df,
        mode_filepaths(mode_choice, 'description', save_file_name),
        mode_filepaths(mode_choice, 'data', save_file_name))

    print('comparing pairwise mutual information for', mode_choice, 'mode...')
    compare_pairwise_mutual_information(table_name, inspector)

    end = time.time()
    elapsed = round(end - start, 2)
    print('done in ' + str(elapsed) + ' seconds.')

    str_cat = 0
    str_not_cat = 0
    int_cat = 0
    int_not_cat = 0
    datetimes = 0

    with open('./data_models/' + attr_choice + '.json') as json_file:
        fields = json.load(json_file)
        for row in fields:
            if fields[row][0] == "str_cat":
                str_cat += 1
            elif fields[row][0] == "str_not_cat":
                str_not_cat += 1
            elif fields[row][0] == "int_cat":
                int_cat += 1
            elif fields[row][0] == "int_not_cat":
                int_not_cat += 1
            elif fields[row][0] == "datetime_not_cat":
                datetimes += 1

    entry_csv = [
        table_name, table_choice, 'synth_' + mode_choice,
        get_table_size(db_output, table_name),
        len(select_columns(db_output, table_name)),
        count_rows(db_output, table_name),
        datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), elapsed,
        round(start),
        round(end), config.EC2_INSTANCE_TYPE, version_input, notes_input, '',
        '', str_cat, str_not_cat, int_cat, int_not_cat, datetimes
    ]

    if mode_choice == 'correlated':
        entry_csv[13] = model_config.CORRELATED_EPSILON_VALUE
        entry_csv[14] = model_config.CORRELATED_DEGREE_OF_BAYESIAN_NETWORK

    append_list_as_row(config.SYNTH_META_CSV_PATH, entry_csv)

    process_timer.cancel()

    print(
        "\n%s%sProcessed %s%s%s data into %s%s%s table in the %ssynth_datasets%s database!%s\n"
        % (colour.BOLD, colour.GREEN, colour.DARKCYAN, table_choice,
           colour.GREEN, colour.DARKCYAN, table_name, colour.GREEN,
           colour.DARKCYAN, colour.GREEN, colour.END))
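
run_trainer leans on a project helper, mode_filepaths, to build per-mode description and data paths. Its real definition is not shown on this page; a plausible reconstruction, offered purely as an assumption about its shape:

def mode_filepaths(mode, file_kind, save_file_name):
    # Hypothetical sketch: a description (.json) or data (.csv) path
    # namespaced by synthesis mode.
    extension = 'json' if file_kind == 'description' else 'csv'
    return './data/{}/{}_{}.{}'.format(mode, save_file_name, file_kind, extension)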
Code example #6
def synthetize():
	# input dataset
	input_data = './census/adult_data.csv'
	# location of two output files
	mode = 'correlated_attribute_mode'
	description_file = f'./census/out/{mode}/description.json'
	synthetic_data = f'./census/out/{mode}/synthetic_data.csv'


	# An attribute is categorical if its domain size is less than this threshold.
	# The threshold of 20 here covers "education", whose domain size is 14 in the input dataset.
	threshold_value = 20

	# specify categorical attributes
	categorical_attributes = {'education': True, 'native-country': True}

	# specify which attributes are candidate keys of input dataset.
	candidate_keys = {'ssn': True}

	# A parameter in differential privacy. It roughly means that removing a row
	# from the input dataset changes the probability of getting the same output
	# by at most a multiplicative factor of exp(epsilon).
	# Increasing epsilon reduces the injected noise; epsilon=0 turns differential privacy off.
	epsilon = 0.1

	# The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
	degree_of_bayesian_network = 2

	# Number of tuples generated in the synthetic dataset.
	num_tuples_to_generate = 32561  # matches the input dataset's size, but any number can be used


	# ### Step 3 DataDescriber
	# 
	# 1. Instantiate a DataDescriber.
	# 2. Compute the statistics of the dataset.
	# 3. Save dataset description to a file on local machine.

	describer = DataDescriber(category_threshold=threshold_value)
	describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data, 
	                                                        epsilon=epsilon, 
	                                                        k=degree_of_bayesian_network,
	                                                        attribute_to_is_categorical=categorical_attributes,
	                                                        attribute_to_is_candidate_key=candidate_keys)
	describer.save_dataset_description_to_file(description_file)

	display_bayesian_network(describer.bayesian_network)


	# ### Step 4 generate synthetic dataset
	# 
	# 1. Instantiate a DataGenerator.
	# 2. Generate a synthetic dataset.
	# 3. Save it to local machine.

	generator = DataGenerator()
	generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
	generator.save_synthetic_data(synthetic_data)


	# ### Step 5 compare the statistics of input and synthetic data (optional)
	# 
	# The synthetic data is already saved in a file by step 4. The ModelInspector is for a quick test on the similarity between input and synthetic datasets.
	# 
	# #### 5.1 instantiate a ModelInspector.
	# 
	# It needs input dataset, synthetic dataset, and attribute description.

	# Read both datasets using Pandas.
	input_df = pd.read_csv(input_data, skipinitialspace=True)
	synthetic_df = pd.read_csv(synthetic_data)
	# Read attribute description from the dataset description file.
	attribute_description = read_json_file(description_file)['attribute_description']

	inspector = ModelInspector(input_df, synthetic_df, attribute_description)


	# #### 5.2 compare histograms between input and synthetic datasets.

	plot_id = 0
	for attribute in synthetic_df.columns:
	    inspector.compare_histograms(attribute, plot_id)
	    plot_id += 1


	# #### 5.3 compare pairwise mutual information

	inspector.mutual_information_heatmap()
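
synthetize assumes pandas plus the standard DataSynthesizer imports; for reference, the usual import block from that library's tutorial is:

import pandas as pd
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network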
Code example #7
def generate_data(username):
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)

    attribute_to_is_candidate = {}
    for attr in initial_dataset_info['attribute_list']:
        if attr in configuration['candidate_atts']:
            attribute_to_is_candidate[attr] = True
        else:
            attribute_to_is_candidate[attr] = False

    attribute_to_is_categorical = {}
    for attr in initial_dataset_info['attribute_list']:
        if attr in configuration['categorical_atts']:
            attribute_to_is_categorical[attr] = True
        else:
            attribute_to_is_categorical[attr] = False

    if configuration['tuple_n'] == '':
        n = initial_dataset_info['number_of_tuples']
    else:
        n = int(configuration['tuple_n'])

    # if configuration['categorical_threshold'] == '':
    #     categorical_threshold = 10
    # else:
    #     categorical_threshold = int(configuration['categorical_threshold'])

    if configuration['seed'] == '':
        seed = 0
    else:
        seed = int(configuration['seed'])

    generator = DataGenerator()
    if configuration['chose_mode'] == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(input_dataset_file, {},
                                                  attribute_to_is_categorical,
                                                  attribute_to_is_candidate,
                                                  seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:

        if configuration['histogram_size'] == '':
            histogram_size = 20
        else:
            histogram_size = int(configuration['histogram_size'])

        if configuration['epsilon'] == '':
            epsilon = 0.1
        else:
            epsilon = float(configuration['epsilon'])

        attribute_to_datatype = configuration['type_atts']

        describer = DataDescriber(histogram_size)
        if configuration['chose_mode'] == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif configuration['chose_mode'] == 'mode3':
            if configuration['max_degree'] == '':
                max_degree = 3
            else:
                max_degree = int(configuration['max_degree'])

            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
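
A usage sketch: the function expects <username>.csv and <username>_parameters.json in the working directory, with the configuration keys referenced above (chose_mode, tuple_n, seed, and so on). The username below is a placeholder:

generate_data('alice')  # reads alice.csv and alice_parameters.json,
                        # writes alice_description.json and alice_synthetic_data.csv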