Beispiel #1
0
def get_dataset_info(file_name):
    """Summarise the dataset in ``file_name`` via a DataDescriber run.

    The file is described in independent-attribute mode and the resulting
    ``dataset_description`` is condensed into a plain dict with the keys:

    * ``candidate_attributes``   - attributes flagged ``is_candidate_key``
    * ``categorical_attributes`` - attributes flagged ``is_categorical``
    * ``attribute_datatypes``    - attribute name -> reported ``data_type``
    * ``number_of_tuples``       - ``meta['num_tuples']``
    * ``attribute_list``         - ``meta['all_attributes']``
    """
    describer = DataDescriber()
    describer.describe_dataset_in_independent_attribute_mode(file_name)

    description = describer.dataset_description
    meta = description['meta']
    per_attribute = description['attribute_description']

    return {
        'candidate_attributes': [
            name for name, info in per_attribute.items()
            if info['is_candidate_key']
        ],
        'categorical_attributes': [
            name for name, info in per_attribute.items()
            if info['is_categorical']
        ],
        'attribute_datatypes': {
            name: info['data_type'] for name, info in per_attribute.items()
        },
        'number_of_tuples': meta['num_tuples'],
        'attribute_list': meta['all_attributes'],
    }
Beispiel #2
0
def test_datasynthesizer():
    """End-to-end check of describer + generator on the tiny adult dataset.

    The input is described in correlated-attribute mode, then two synthetic
    datasets are generated from that description: one in correlated mode and
    one in random mode. The correlated output must stay close to the input
    per column (KS statistic for 'age', KL divergence otherwise), and its
    pairwise mutual-information error must be below five times that of the
    random-mode output.
    """
    data_dir = Path(__file__).parent / 'data'
    input_data = data_dir / 'adult_tiny.csv'
    description_file = data_dir / 'description.json'
    output_data = data_dir / 'output.csv'
    uniform_data = data_dir / 'output_uniform.csv'

    # Describe the input dataset and persist the description.
    describer = DataDescriber(category_threshold=20)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=1,
        k=2,
        attribute_to_is_categorical={'education': True})
    describer.save_dataset_description_to_file(description_file)

    # Generate the correlated sample first, then the random-mode baseline.
    sample_size = 10000
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        sample_size, description_file)
    generator.save_synthetic_data(output_data)
    generator.generate_dataset_in_random_mode(sample_size, description_file)
    generator.save_synthetic_data(uniform_data)

    df_input = pd.read_csv(input_data, skipinitialspace=True)
    df_output = pd.read_csv(output_data)
    df_uniform = pd.read_csv(uniform_data)

    # Per-column distribution check: 'age' uses the KS test, the rest KL.
    for column in df_input:
        metric, limit = (ks_test, 0.1) if column == 'age' else (kl_test, 0.01)
        assert metric(df_input, df_output, column) < limit

    # Compare pairwise mutual information against the input's.
    reference_mi = pairwise_attributes_mutual_information(df_input)
    output_mi = pairwise_attributes_mutual_information(df_output)
    uniform_mi = pairwise_attributes_mutual_information(df_uniform)

    output_diff = (output_mi - reference_mi).abs().sum().sum()
    uniform_diff = (uniform_mi - reference_mi).abs().sum().sum()

    assert output_diff < 5 * uniform_diff
Beispiel #3
0
def generate_data(username):
    """Describe ``<username>.csv`` and write ``<username>_synthetic_data.csv``.

    Settings are read from ``<username>_parameters.json``. An empty string
    for a numeric setting selects its default. ``chose_mode`` picks the
    pipeline: 'mode1' is random mode, 'mode2' independent-attribute mode and
    'mode3' correlated-attribute mode. The intermediate dataset description
    is stored in ``<username>_description.json``.
    """
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    dataset_info = get_dataset_info(input_dataset_file)
    all_attributes = dataset_info['attribute_list']

    # Every attribute gets an explicit True/False flag for both properties.
    attribute_to_is_candidate = {
        name: name in configuration['candidate_atts']
        for name in all_attributes
    }
    attribute_to_is_categorical = {
        name: name in configuration['categorical_atts']
        for name in all_attributes
    }

    # Default to the size of the input dataset when no tuple count is given.
    n = (dataset_info['number_of_tuples']
         if configuration['tuple_n'] == ''
         else int(configuration['tuple_n']))

    # The 'categorical_threshold' setting is currently not read.

    seed = 0 if configuration['seed'] == '' else int(configuration['seed'])

    generator = DataGenerator()
    mode = configuration['chose_mode']
    if mode == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(
            input_dataset_file, {}, attribute_to_is_categorical,
            attribute_to_is_candidate, seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:
        histogram_size = (20 if configuration['histogram_size'] == ''
                          else int(configuration['histogram_size']))
        epsilon = (10 if configuration['epsilon'] == ''
                   else float(configuration['epsilon']))
        attribute_to_datatype = configuration['type_atts']

        describer = DataDescriber(histogram_size)
        if mode == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif mode == 'mode3':
            max_degree = (3 if configuration['max_degree'] == ''
                          else int(configuration['max_degree']))
            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon,
                attribute_to_datatype, attribute_to_is_categorical,
                attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
def _load_dataset(name):
    """Load one of the supported datasets and its handling metadata.

    Args:
        name: One of 'bank', 'adult', 'german' or 'home'.

    Returns:
        A tuple ``(df, categorical_attributes, output_dir, sep)`` where
        ``df`` is the loaded DataFrame with its categorical columns cast to
        ``object``, ``categorical_attributes`` maps attribute names to True
        for the describer, ``output_dir`` is the directory for generated
        files and ``sep`` is the separator used for the final output file.

    Raises:
        ValueError: If ``name`` is not a supported dataset.
    """
    if name == 'bank':
        cols = [
            'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
            'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
            'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'
        ]
        df = pd.read_csv('./bank-data/bank-additional-full.csv',
                         sep=';', names=cols)
        categorical_columns = [
            'job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'poutcome'
        ]
        # 'age' and 'y' are flagged categorical for the describer only;
        # their dtype in the DataFrame is left untouched.
        flagged = ['age'] + categorical_columns + ['y']
        output_dir = './bank-data/synth'
        sep = ';'
    elif name == 'adult':
        cols = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
            'y'
        ]
        df = pd.read_csv('./adult-data/adult.data', sep=',', names=cols)
        categorical_columns = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country'
        ]
        flagged = categorical_columns + ['y']
        output_dir = './adult-data/synth'
        sep = ','
    elif name == 'german':
        cols = [
            'existing_checking', 'duration', 'credit_history', 'purpose',
            'credit_amount', 'savings', 'employment_since', 'installment_rate',
            'status_sex', 'other_debtors', 'residence_since', 'property',
            'age', 'other_installment_plans', 'housing', 'existing_credits',
            'job', 'people_liable', 'telephone', 'foreign_worker', 'y'
        ]
        df = pd.read_csv('./german-data/german.train', sep=' ', names=cols)
        categorical_columns = [
            'existing_checking', 'credit_history', 'purpose', 'savings',
            'employment_since', 'status_sex', 'other_debtors', 'property',
            'other_installment_plans', 'housing', 'job', 'telephone',
            'foreign_worker'
        ]
        flagged = categorical_columns + ['y']
        output_dir = './german-data/synth'
        sep = ' '
    elif name == 'home':
        df = pd.read_csv('./home-data/hcdf_train.csv', sep=',', header=0)
        df = df.rename(columns={"TARGET": "y", "CODE_GENDER": "GENDER"})
        categorical_columns = [
            "NAME_CONTRACT_TYPE", "GENDER", "FLAG_OWN_REALTY",
            "NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE",
            "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "OCCUPATION_TYPE",
            "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE",
            "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE",
            "EMERGENCYSTATE_MODE", "y"
        ]
        # 'y' is already part of the categorical columns for this dataset.
        flagged = categorical_columns
        output_dir = './home-data/synth'
        sep = ','
    else:
        raise ValueError("Unsupported dataset: {!r}".format(name))

    for column in categorical_columns:
        df[column] = df[column].astype('object')

    return df, dict.fromkeys(flagged, True), output_dir, sep


def main():
    """Command-line entry point: synthesize a dataset with DataSynthesizer.

    Loads the dataset selected with ``--data``, describes it in the mode
    selected with ``--mode`` (any value other than 'independent' or
    'correlated' is treated as random mode), generates as many synthetic
    tuples as the input has rows using the generator that matches that mode,
    and finally writes a copy of the synthetic data whose 'y' column is
    replaced by the original 'y' column.
    """
    parser = argparse.ArgumentParser(
        description="DP Data Synthesizer - PriveBayes")
    parser.add_argument(
        "-m",
        "--mode",
        type=str,
        default="correlated",
        help="Synthesizer Mode: 'independent', 'correlated', 'random'",
    )
    parser.add_argument(
        "-d",
        "--data",
        type=str,
        default="bank",
        # Reject unknown datasets up front instead of failing later with a
        # NameError on variables that no branch assigned.
        choices=("bank", "adult", "german", "home"),
        help="Dataset: 'bank', 'adult', 'german' or 'home'",
    )
    parser.add_argument(
        "-e",
        "--epsilon",
        type=float,
        default=0.1,
        help="Noise parameter (default = 0.1)",
    )
    args = parser.parse_args()

    df, categorical_attributes, output_dir, sep = _load_dataset(args.data)

    # Anything that is not 'independent' or 'correlated' runs in random mode.
    mode = args.mode if args.mode in ('independent', 'correlated') else 'random'
    epsilon_tag = str(args.epsilon)

    # The description and CSV files below are written into this directory.
    os.makedirs(output_dir, exist_ok=True)
    description_file = (output_dir + '/description' + args.mode + '_' +
                        epsilon_tag + '.json')
    synthetic_data = (output_dir + '/syth_data_' + mode + '_' +
                      epsilon_tag + '.csv')
    save_path = (output_dir + '/syth_data_' + mode + '_ymod_' +
                 epsilon_tag + '.csv')

    # An attribute is categorical if its domain size is less than this threshold.
    threshold_value = 20

    # Specify which attributes are candidate keys of the input dataset.
    candidate_keys = {'ssn': True}

    # The maximum number of parents in the Bayesian network, i.e. the maximum
    # number of incoming edges.
    degree_of_bayesian_network = 2

    # Number of tuples generated in the synthetic dataset.
    num_tuples_to_generate = len(df)

    # Input to DataSynthesizer must be comma separated. Create a temp file.
    input_data = 'comma_data.csv'
    df.to_csv(input_data, sep=',')
    try:
        describer = DataDescriber(category_threshold=threshold_value)
        generator = DataGenerator()

        if mode == 'independent':
            describer.describe_dataset_in_independent_attribute_mode(
                dataset_file=input_data,
                # Forward the requested epsilon so the description matches
                # the epsilon embedded in the output file names.
                epsilon=args.epsilon,
                attribute_to_is_categorical=categorical_attributes,
                attribute_to_is_candidate_key=candidate_keys)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                num_tuples_to_generate, description_file)
        elif mode == 'correlated':
            describer.describe_dataset_in_correlated_attribute_mode(
                dataset_file=input_data,
                epsilon=args.epsilon,
                k=degree_of_bayesian_network,
                attribute_to_is_categorical=categorical_attributes,
                attribute_to_is_candidate_key=candidate_keys)
            describer.save_dataset_description_to_file(description_file)
            print(display_bayesian_network(describer.bayesian_network))
            generator.generate_dataset_in_correlated_attribute_mode(
                num_tuples_to_generate, description_file)
        else:
            describer.describe_dataset_in_random_mode(input_data)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_random_mode(
                num_tuples_to_generate, description_file)

        generator.save_synthetic_data(synthetic_data)
    finally:
        # Delete the temporary comma-separated file even if a step failed.
        if os.path.exists(input_data):
            os.remove(input_data)

    synth_df = pd.read_csv(synthetic_data, sep=',')
    # The "ymod" output carries the original labels instead of synthetic ones.
    synth_df['y'] = df['y']
    # Keep the columns from 'age' through 'y'.
    # NOTE(review): this slice presumably exists to drop the index column that
    # to_csv wrote above, but for the german dataset 'age' is not the first
    # column, so the columns before it are dropped as well, and the home
    # dataset may not have an 'age' column at all - confirm the intent.
    save_df = synth_df.loc[:, 'age':'y']

    save_df.to_csv(save_path, sep=sep, index=False, header=None)
Beispiel #5
0
    def __init__(self, *args, df_in=None, verbose=True, **kwargs):
        """Record the extra options, then run the DataDescriber initialiser.

        ``df_in`` and ``verbose`` are keyword-only and stored on the instance
        as-is; all remaining positional and keyword arguments are passed
        through unchanged to ``DataDescriber.__init__``.
        """
        self.df_in = df_in
        self.verbose = verbose
        DataDescriber.__init__(self, *args, **kwargs)