Example 1
from pathlib import Path

import pandas as pd

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import pairwise_attributes_mutual_information


def test_datasynthesizer():
    data_dir = Path(__file__).parent / 'data'
    input_data = data_dir / 'adult_tiny.csv'
    description_file = data_dir / 'description.json'
    output_data = data_dir / 'output.csv'
    uniform_data = data_dir / 'output_uniform.csv'

    threshold_value = 20
    categorical_attributes = {'education': True}
    epsilon = 1
    degree_of_bayesian_network = 2
    num_tuples_to_generate = 10000

    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=epsilon,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes)

    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate, description_file)
    generator.save_synthetic_data(output_data)
    generator.generate_dataset_in_random_mode(num_tuples_to_generate,
                                              description_file)
    generator.save_synthetic_data(uniform_data)

    df_input = pd.read_csv(input_data, skipinitialspace=True)
    df_output = pd.read_csv(output_data)
    df_uniform = pd.read_csv(uniform_data)

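    # Compare each attribute's marginal distribution: KS statistic for the
    # numeric 'age' column, KL divergence for the categorical attributes.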
    for col in df_input:
        if col == 'age':
            assert ks_test(df_input, df_output, col) < 0.1
        else:
            assert kl_test(df_input, df_output, col) < 0.01

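    # Correlated-attribute mode should preserve pairwise mutual information
    # far better than the random (uniform) baseline.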
    df_input_mi = pairwise_attributes_mutual_information(df_input)
    df_output_mi = pairwise_attributes_mutual_information(df_output)
    df_uniform_mi = pairwise_attributes_mutual_information(df_uniform)

    output_diff = (df_output_mi - df_input_mi).abs().sum().sum()
    uniform_diff = (df_uniform_mi - df_input_mi).abs().sum().sum()

    assert output_diff < 5 * uniform_diff
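
The ks_test and kl_test helpers are assumed to be defined in the same test
module. A minimal sketch of what they might look like, using scipy (the real
helpers may differ):

from scipy.stats import entropy, ks_2samp


def ks_test(df_first, df_second, col):
    # Kolmogorov-Smirnov statistic between the two marginal distributions.
    return ks_2samp(df_first[col], df_second[col]).statistic


def kl_test(df_first, df_second, col):
    # KL divergence between the normalized value-count histograms,
    # aligned on the union of observed categories.
    p = df_first[col].astype(str).value_counts(normalize=True)
    q = df_second[col].astype(str).value_counts(normalize=True)
    support = p.index.union(q.index)
    # A small floor avoids infinite divergence for unseen categories.
    p = p.reindex(support, fill_value=1e-12)
    q = q.reindex(support, fill_value=1e-12)
    return entropy(p, q)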
Example 2
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import read_json_file

# get_dataset_info is assumed to be a helper (as in the DataSynthesizer web
# UI) that returns a dict with 'attribute_list' and 'number_of_tuples'.


def generate_data(username):
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)

    attribute_to_is_candidate = {
        attr: attr in configuration['candidate_atts']
        for attr in initial_dataset_info['attribute_list']
    }

    attribute_to_is_categorical = {
        attr: attr in configuration['categorical_atts']
        for attr in initial_dataset_info['attribute_list']
    }

    if configuration['tuple_n'] == '':
        n = initial_dataset_info['number_of_tuples']
    else:
        n = int(configuration['tuple_n'])

    # if configuration['categorical_threshold'] == '':
    #     categorical_threshold = 10
    # else:
    #     categorical_threshold = int(configuration['categorical_threshold'])

    if configuration['seed'] == '':
        seed = 0
    else:
        seed = int(configuration['seed'])

    generator = DataGenerator()
    if configuration['chose_mode'] == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(input_dataset_file, {},
                                                  attribute_to_is_categorical,
                                                  attribute_to_is_candidate,
                                                  seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:
        if configuration['histogram_size'] == '':
            histogram_size = 20
        else:
            histogram_size = int(configuration['histogram_size'])

        if configuration['epsilon'] == '':
            epsilon = 10
        else:
            epsilon = float(configuration['epsilon'])

        attribute_to_datatype = configuration['type_atts']

        describer = DataDescriber(histogram_size)
        if configuration['chose_mode'] == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif configuration['chose_mode'] == 'mode3':
            if configuration['max_degree'] == '':
                max_degree = 3
            else:
                max_degree = int(configuration['max_degree'])

            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
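
The configuration file read at the top of generate_data is keyed by username.
A parameters file consistent with the keys accessed above might look like this
(a sketch; fields beyond those the function reads are not assumed):

example_parameters = {
    "chose_mode": "mode3",     # mode1 = random, mode2 = independent, mode3 = correlated
    "candidate_atts": ["id"],
    "categorical_atts": ["education"],
    "type_atts": {},           # attribute name -> datatype overrides
    "tuple_n": "",             # '' means: use the input dataset's size
    "seed": "0",
    "histogram_size": "",      # '' means: default of 20
    "epsilon": "10",           # '' means: default of 10
    "max_degree": "",          # '' means: default of 3
}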
Example 3
    # An attribute is categorical if its domain size is less than this threshold.
    # The threshold is set above the domain size of "education" (14 in the input dataset) so it is treated as categorical.
    threshold_value = 20

    # Additional strings to recognize as NA/NaN.
    null_values = '<NULL>'

    # Specify which attributes are candidate keys of the input dataset.
    candidate_keys = {'age': False}

    # A parameter in differential privacy.
    # It roughly means that removing one tuple changes the probability of any output by a factor of at most exp(epsilon).
    # Set epsilon=0 to turn off differential privacy.
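    # For example, epsilon = 0.1 bounds that factor by exp(0.1) ~ 1.105, i.e. about a 10% change.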
    epsilon = 0.1

    # The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
    degree_of_bayesian_network = 2

    # Number of tuples generated in synthetic dataset.
    num_tuples_to_generate = 32561  # 32561 matches the size of the input dataset, but any number can be used.

    # describer = DataDescriber(category_threshold=threshold_value, null_values=null_values)
    # describer.describe_dataset_in_correlated_attribute_mode(input_data, epsilon=epsilon, k=degree_of_bayesian_network,
    #                                                         attribute_to_is_candidate_key=candidate_keys)
    # describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate, description_file)
    generator.save_synthetic_data(synthetic_data)

import argparse
import os

import pandas as pd

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network


def main():
    parser = argparse.ArgumentParser(
        description="DP Data Synthesizer - PriveBayes")
    """
    parser.add_argument( # remove/change
        "-i",
        "--input-file",
        type=str,
        default="./bank-data/bank-additional-full.csv",
        help="Path to input data",
    )
    parser.add_argument( # remove/change
        "-o",
        "--output-dir",
        type=str,
        default="./bank-data/synth/",
        help="Path to input data",
    )
    """
    parser.add_argument(
        "-m",
        "--mode",
        type=str,
        default="correlated",
        help="Synthesizer Mode: 'independent', 'correlated', 'random'",
    )
    parser.add_argument(
        "-d",
        "--data",
        type=str,
        default="bank",
        help="Dataset ('bank' for bank dataset, 'adult' for adult dataset)",
    )
    parser.add_argument(
        "-e",
        "--epsilon",
        type=float,
        default=0.1,
        help="Noise parameter (default = 0.1)",
    )
    args = parser.parse_args()
    if args.data == 'bank':
        input_file = './bank-data/bank-additional-full.csv'
        cols = [
            'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
            'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
            'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'
        ]
        df = pd.read_csv(input_file, sep=';', names=cols)
        categorical_columns = [
            'job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'poutcome'
        ]

        for category in categorical_columns:
            df[category] = df[category].astype('object')

        # specify categorical attributes
        categorical_attributes = {
            'age': True,
            'job': True,
            'marital': True,
            'education': True,
            'default': True,
            'housing': True,
            'loan': True,
            'contact': True,
            'month': True,
            'day_of_week': True,
            'poutcome': True,
            'y': True
        }
        output_dir = './bank-data/synth'
        sep = ';'
    elif args.data == 'adult':
        input_file = './adult-data/adult.data'
        cols = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
            'y'
        ]

        df = pd.read_csv(input_file, sep=',', names=cols)
        categorical_columns = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country'
        ]

        for category in categorical_columns:
            df[category] = df[category].astype('object')

        categorical_attributes = {
            'workclass': True,
            'education': True,
            'marital-status': True,
            'occupation': True,
            'relationship': True,
            'race': True,
            'sex': True,
            'native-country': True,
            'y': True
        }
        output_dir = './adult-data/synth'
        sep = ','

    elif args.data == 'german':
        input_file = './german-data/german.train'
        cols = [
            'existing_checking', 'duration', 'credit_history', 'purpose',
            'credit_amount', 'savings', 'employment_since', 'installment_rate',
            'status_sex', 'other_debtors', 'residence_since', 'property',
            'age', 'other_installment_plans', 'housing', 'existing_credits',
            'job', 'people_liable', 'telephone', 'foreign_worker', 'y'
        ]

        df = pd.read_csv(input_file, sep=' ', names=cols)
        categorical_columns = [
            'existing_checking', 'credit_history', 'purpose', 'savings',
            'employment_since', 'status_sex', 'other_debtors', 'property',
            'other_installment_plans', 'housing', 'job', 'telephone',
            'foreign_worker'
        ]
        for category in categorical_columns:
            df[category] = df[category].astype('object')

        categorical_attributes = {
            'existing_checking': True,
            'credit_history': True,
            'purpose': True,
            'savings': True,
            'employment_since': True,
            'status_sex': True,
            'other_debtors': True,
            'property': True,
            'other_installment_plans': True,
            'housing': True,
            'job': True,
            'telephone': True,
            'foreign_worker': True,
            'y': True
        }
        output_dir = './german-data/synth'
        sep = ' '
    elif args.data == 'home':
        input_file = './home-data/hcdf_train.csv'

        df = pd.read_csv(input_file, sep=',', header=0)

        df = df.rename(columns={"TARGET": "y", "CODE_GENDER": "GENDER"})

        categorical_columns = [
            "NAME_CONTRACT_TYPE", "GENDER", "FLAG_OWN_REALTY",
            "NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE",
            "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "OCCUPATION_TYPE",
            "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE",
            "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE",
            "EMERGENCYSTATE_MODE", "y"
        ]

        for category in categorical_columns:
            df[category] = df[category].astype('object')

        categorical_attributes = {
            "NAME_CONTRACT_TYPE": True,
            "GENDER": True,
            "FLAG_OWN_REALTY": True,
            "NAME_TYPE_SUITE": True,
            "NAME_INCOME_TYPE": True,
            "NAME_EDUCATION_TYPE": True,
            "NAME_FAMILY_STATUS": True,
            "NAME_HOUSING_TYPE": True,
            "OCCUPATION_TYPE": True,
            "WEEKDAY_APPR_PROCESS_START": True,
            "ORGANIZATION_TYPE": True,
            "FONDKAPREMONT_MODE": True,
            "HOUSETYPE_MODE": True,
            "WALLSMATERIAL_MODE": True,
            "EMERGENCYSTATE_MODE": True,
            "y": True
        }
        output_dir = './home-data/synth'
        sep = ','
    #df = df.dropna()

    # Input to DataSynthesizer must be comma-separated; create a temp file.
    df.to_csv('comma_data.csv', sep=',')
    input_data = 'comma_data.csv'

    description_file = output_dir + '/description_' + args.mode + '_' + str(
        args.epsilon) + '.json'
    synthetic_data = ''
    save_path = ''
    # An attribute is categorical if its domain size is less than this threshold.
    threshold_value = 20

    # Number of tuples generated in synthetic dataset.
    num_tuples_to_generate = len(df)

    # Specify which attributes are candidate keys of the input dataset.
    candidate_keys = {'ssn': True}

    # The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
    degree_of_bayesian_network = 2

    # Data describer
    describer = DataDescriber(category_threshold=threshold_value)
    if args.mode == 'independent':
        synthetic_data = output_dir + '/syth_data_independent_' + str(
            args.epsilon) + '.csv'
        save_path = output_dir + '/syth_data_independent_ymod_' + str(
            args.epsilon) + '.csv'

        describer.describe_dataset_in_independent_attribute_mode(
            dataset_file=input_data,
            attribute_to_is_categorical=categorical_attributes,
            attribute_to_is_candidate_key=candidate_keys)

        describer.save_dataset_description_to_file(description_file)

    elif args.mode == 'correlated':
        synthetic_data = output_dir + '/syth_data_correlated_' + str(
            args.epsilon) + '.csv'
        save_path = output_dir + '/syth_data_correlated_ymod_' + str(
            args.epsilon) + '.csv'

        describer.describe_dataset_in_correlated_attribute_mode(
            dataset_file=input_data,
            epsilon=args.epsilon,
            k=degree_of_bayesian_network,
            attribute_to_is_categorical=categorical_attributes,
            attribute_to_is_candidate_key=candidate_keys)

        describer.save_dataset_description_to_file(description_file)

        # display_bayesian_network prints the network itself and returns None.
        display_bayesian_network(describer.bayesian_network)

    else:
        synthetic_data = output_dir + '/syth_data_random_' + str(
            args.epsilon) + '.csv'
        save_path = output_dir + '/syth_data_random_ymod_' + str(
            args.epsilon) + '.csv'

        describer.describe_dataset_in_random_mode(input_data)

        describer.save_dataset_description_to_file(description_file)

    # Generate the synthetic dataset, matching the mode used by the describer.
    generator = DataGenerator()
    if args.mode == 'independent':
        generator.generate_dataset_in_independent_mode(num_tuples_to_generate,
                                                       description_file)
    elif args.mode == 'correlated':
        generator.generate_dataset_in_correlated_attribute_mode(
            num_tuples_to_generate, description_file)
    else:
        generator.generate_dataset_in_random_mode(num_tuples_to_generate,
                                                  description_file)
    generator.save_synthetic_data(synthetic_data)
    """
    # Compare the stats of original and synthetic data
    # Read both datasets using Pandas.
    input_df = pd.read_csv(input_data, skipinitialspace=True)
    synthetic_df = pd.read_csv(synthetic_data)
    # Read attribute description from the dataset description file.
    attribute_description = read_json_file(description_file)['attribute_description']

    inspector = ModelInspector(input_df, synthetic_df, attribute_description)
    for attribute in synthetic_df.columns:
        inspector.compare_histograms(attribute)
    """

    # Delete temporary file (comma separated df)
    if os.path.exists(input_data):
        os.remove(input_data)

    synth_df = pd.read_csv(synthetic_data, sep=',')
    synth_df['y'] = df['y']
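    # Keep the columns from 'age' through 'y', dropping the unnamed index
    # column that df.to_csv wrote into the temp file.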
    save_df = synth_df.loc[:, 'age':'y']

    save_df.to_csv(save_path, sep=sep, index=False, header=None)
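
Assuming the script above is saved as synthesize.py (a hypothetical name), it
would be invoked along the lines of:

    python synthesize.py --data adult --mode correlated --epsilon 0.1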
Example 5
    def fauxify(self, df_in=None, *args, **kwargs):

        from DataSynthesizer.DataDescriber import DataDescriber
        from DataSynthesizer.DataGenerator import DataGenerator
        from DataSynthesizer.ModelInspector import ModelInspector
        from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

        if df_in is None:
            warn_text = 'Input data frame is None. This will cause the data describer to fall back to the '
            warn_text += 'file name in input_dataset.'
            warnings.warn(warn_text)
        else:
            self.df_in = df_in

        for key, value in kwargs.items():
            if key == "mode":
                self.mode = value
            elif key == "threshold_value":
                self.threshold_value = value
            elif key == "categorical_attributes":
                self.categorical_attributes = value
            elif key == "candidate_keys":
                self.candidate_keys = value
            elif key == "num_tuples_to_generate":
                self.num_tuples_to_generate = value
            elif key == "save_faux_data_to_file":
                self.save_faux_data_to_file = value
            else:
                if self.verbose:
                    warnings.warn('Keyword argument {} not used'.format(key))

        # for now, override tuples generated to be same as input dataframe
        self.num_tuples_to_generate = len(self.df_in)

        # Below copied from example file
        self.description_file = './out/{}/description.txt'.format(self.mode)
        self.synthetic_data = './out/{}/synthetic_data.csv'.format(self.mode)

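        # KFP_DataDescriber is assumed to be a DataDescriber subclass defined
        # elsewhere in this module that reads from the in-memory dataframe
        # df_in instead of a CSV file.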
        describer = KFP_DataDescriber(df_in=self.df_in,
                                      category_threshold=self.threshold_value)
        generator = DataGenerator()

        # currently can't get correlated_attribute_mode to work, but leaving it here for now
        if self.mode == "correlated_attribute_mode":
            # this block prints a lot to stdout; suppress it in non-verbose mode
            if self.verbose:
                describer.describe_dataset_in_correlated_attribute_mode(
                    describer.df_input,
                    epsilon=self.epsilon,
                    k=self.degree_of_bayesian_network,
                    attribute_to_is_categorical=self.categorical_attributes,
                    attribute_to_is_candidate_key=self.candidate_keys)
            else:
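                # nostdout() is assumed to be a stdout-suppressing context
                # manager defined elsewhere in this module.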
                with nostdout():
                    describer.describe_dataset_in_correlated_attribute_mode(
                        describer.df_input,
                        epsilon=self.epsilon,
                        k=self.degree_of_bayesian_network,
                        attribute_to_is_categorical=self.categorical_attributes,
                        attribute_to_is_candidate_key=self.candidate_keys)
            describer.save_dataset_description_to_file(self.description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                self.num_tuples_to_generate, self.description_file)
        elif self.mode == "independent_attribute_mode":
            describer.describe_dataset_in_independent_attribute_mode(
                describer.df_input,
                attribute_to_is_categorical=self.categorical_attributes,
                attribute_to_is_candidate_key=self.candidate_keys)
            describer.save_dataset_description_to_file(self.description_file)
            generator.generate_dataset_in_independent_mode(
                self.num_tuples_to_generate, self.description_file)
        else:
            raise ValueError('Unsupported mode: {}'.format(self.mode))

        if self.save_faux_data_to_file:
            generator.save_synthetic_data(self.synthetic_data)

        return generator.synthetic_dataset
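
A hypothetical call, assuming this method belongs to a wrapper class (here
called Fauxifier) whose __init__ sets verbose, epsilon and
degree_of_bayesian_network:

# real_df is any pandas DataFrame to synthesize; Fauxifier is a hypothetical
# class exposing the fauxify method above.
faux = Fauxifier()
synthetic_df = faux.fauxify(
    df_in=real_df,
    mode="independent_attribute_mode",
    threshold_value=20,
    categorical_attributes={"education": True},
    candidate_keys={"id": True},
    save_faux_data_to_file=False,
)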