def get_plot_data(input_dataset_file, synthetic_dataset_file,
                  description_file):
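    # Build before/after plot data for every attribute and write it to
    # '<input>_plot.json' alongside the input file.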
    description = read_json_file(description_file)
    df_before = pd.read_csv(input_dataset_file)
    df_after = pd.read_csv(synthetic_dataset_file)
    plot_data = {'histogram': {}, 'barchart': {}, 'heatmap': {}}
    for attr in df_before:
        if description['attribute_description'][attr]['is_categorical']:
            bins_before, counts_before = get_barchart_data(df_before, attr)
            bins_after, counts_after = get_barchart_data(df_after, attr)
            plot_data['barchart'][attr] = {
                'before': {
                    'bins': bins_before,
                    'counts': counts_before
                },
                'after': {
                    'bins': bins_after,
                    'counts': counts_after
                }
            }
        elif description['attribute_description'][attr]['data_type'] in {
                'Integer', 'Float'
        }:
            plot_data['histogram'][attr] = {
                'before': get_histogram_data(df_before, attr),
                'after': get_histogram_data(df_after, attr)
            }

    plot_data['heatmap']['before'] = get_heatmap_data(input_dataset_file)
    plot_data['heatmap']['after'] = get_heatmap_data(synthetic_dataset_file)
    plot_file_name = input_dataset_file.replace(".csv", "_plot.json")
    with open(plot_file_name, 'w') as outfile:
        json.dump(plot_data, outfile, indent=4)
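
# Hedged usage sketch; the file names are hypothetical, and the helpers called
# above (read_json_file, get_barchart_data, get_histogram_data,
# get_heatmap_data) are assumed to be defined alongside this function.
get_plot_data('alice.csv', 'alice_synthetic_data.csv', 'alice_description.json')
# -> writes the combined plot data to 'alice_plot.json'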
def compare_histograms(mode: str, hospital_ae_df: pd.DataFrame,
                       description_filepath: str,
                       synthetic_data_filepath: str):
    '''
    Makes comparison plots showing the histograms for each column in the
    synthetic data.

    Keyword arguments:
    mode -- what type of synthetic data
    hospital_ae_df -- DataFrame of the original dataset
    description_filepath -- filepath to the data description
    synthetic_data_filepath -- filepath where the synthetic data was written
    '''

    synthetic_df = pd.read_csv(synthetic_data_filepath)

    # Read attribute description from the dataset description file.
    attribute_description = read_json_file(
        description_filepath)['attribute_description']

    inspector = ModelInspector(hospital_ae_df, synthetic_df,
                               attribute_description)

    for attribute in synthetic_df.columns:
        figure_filepath = os.path.join(filepaths.plots_dir,
                                       mode + '_' + attribute + '.png')
        # need to replace whitespace in filepath for Markdown reference
        figure_filepath = figure_filepath.replace(' ', '_')
        inspector.compare_histograms(attribute, figure_filepath)

    return inspector
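
# Hedged usage sketch; the CSV/JSON paths and the 'correlated' mode label are
# illustrative assumptions, not files from the original tutorial.
original_df = pd.read_csv('hospital_ae_data.csv')
inspector = compare_histograms('correlated', original_df,
                               'description.json', 'synthetic_data.csv')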
def compare_pairwise_mutual_information(mode: str,
                                        hospital_ae_df: pd.DataFrame,
                                        description_filepath: str,
                                        synthetic_data_filepath: str):
    '''
    Looks at correlations between attributes by producing a pairwise mutual
    information heatmap.

    Keyword arguments:
    mode -- what type of synthetic data
    hospital_ae_df -- DataFrame of the original dataset
    description_filepath -- filepath to the data description
    synthetic_data_filepath -- filepath where the synthetic data was written
    '''

    synthetic_df = pd.read_csv(synthetic_data_filepath)

    attribute_description = read_json_file(
        description_filepath)['attribute_description']

    inspector = ModelInspector(hospital_ae_df, synthetic_df,
                               attribute_description)

    figure_filepath = os.path.join(
        filepaths.plots_dir, 'mutual_information_heatmap_' + mode + '.png')

    inspector.mutual_information_heatmap(figure_filepath)
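
# Hedged usage sketch mirroring compare_histograms above; the paths and mode
# label are illustrative assumptions.
compare_pairwise_mutual_information(
    'correlated', pd.read_csv('hospital_ae_data.csv'),
    'description.json', 'synthetic_data.csv')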
Example #4
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        set_random_seed(seed)
        self.n = n
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in self.encoded_dataset:
                self.synthetic_dataset[
                    attr] = column.sample_values_from_binning_indices(
                        self.encoded_dataset[attr])
            elif attr in candidate_keys:
                self.synthetic_dataset[
                    attr] = column.generate_values_as_candidate_key(n)
            else:
                # for attributes not in BN or candidate keys, use independent attribute mode.
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(
                    n)
                self.synthetic_dataset[
                    attr] = column.sample_values_from_binning_indices(
                        binning_indices)
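
# Hedged sketch: with a description file produced by a matching DataDescriber
# run, the generator above can be driven like this (file names are illustrative).
generator = DataGenerator()
generator.generate_dataset_in_correlated_attribute_mode(
    1000, 'description.json', seed=42)
generator.save_synthetic_data('synthetic_data.csv')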
Example #5
    def generate_dataset_in_random_mode(self, n, description_file, seed=0):
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = DataFrame()
        for attr in description['attribute_description'].keys():
            attr_info = description['attribute_description'][attr]
            datatype = attr_info['data_type']
            is_categorical = attr_info['is_categorical']
            is_candidate_key = attr_info['is_candidate_key']
            minimum = attr_info['min']
            maximum = attr_info['max']
            # When the domain collapses to a single value, reuse that constant.
            static_num = minimum if minimum == maximum else None
            if is_candidate_key:
                # Candidate keys must stay unique, so delegate to the parsed column.
                self.synthetic_dataset[attr] = parse_json(
                    attr_info).generate_values_as_candidate_key(n)
            elif is_categorical:
                # 'random' is assumed to be numpy.random here, whose choice/
                # randint/uniform accept a size argument.
                self.synthetic_dataset[attr] = random.choice(
                    attr_info['distribution_bins'], n)
            elif datatype == 'String':
                # Draw one string length, broadcast it down the column, then
                # replace each cell with a random string of that length.
                length = static_num or random.randint(minimum, maximum)
                self.synthetic_dataset[attr] = length
                self.synthetic_dataset[attr] = self.synthetic_dataset[
                    attr].map(generate_random_string)
            elif datatype == 'Integer':
                self.synthetic_dataset[attr] = static_num or random.randint(
                    minimum, maximum + 1, n)
            else:
                self.synthetic_dataset[attr] = static_num or random.uniform(
                    minimum, maximum, n)
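
# Hedged sketch of random-mode generation; 'description.json' is assumed to
# come from describe_dataset_in_random_mode below.
generator = DataGenerator()
generator.generate_dataset_in_random_mode(500, 'description.json', seed=0)
generator.save_synthetic_data('random_synthetic_data.csv')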
    def describe_dataset_in_random_mode(
            self,
            dataset_file: str,
            attribute_to_datatype: Dict[str, DataType] = None,
            attribute_to_is_categorical: Dict[str, bool] = None,
            attribute_to_is_candidate_key: Dict[str, bool] = None,
            categorical_attribute_domain_file: str = None,
            numerical_attribute_ranges: Dict[str, List] = None,
            seed=0):
        attribute_to_datatype = attribute_to_datatype or {}
        attribute_to_is_categorical = attribute_to_is_categorical or {}
        attribute_to_is_candidate_key = attribute_to_is_candidate_key or {}
        numerical_attribute_ranges = numerical_attribute_ranges or {}

        if categorical_attribute_domain_file:
            categorical_attribute_to_domain = utils.read_json_file(
                categorical_attribute_domain_file)
        else:
            categorical_attribute_to_domain = {}

        utils.set_random_seed(seed)
        self.attr_to_datatype = {
            attr: DataType(datatype)
            for attr, datatype in attribute_to_datatype.items()
        }
        self.attr_to_is_categorical = attribute_to_is_categorical
        self.attr_to_is_candidate_key = attribute_to_is_candidate_key
        self.read_dataset_from_csv(dataset_file)
        self.infer_attribute_data_types()
        self.analyze_dataset_meta()
        self.represent_input_dataset_by_columns()

        for column in self.attr_to_column.values():
            attr_name = column.name
            if attr_name in categorical_attribute_to_domain:
                column.infer_domain(
                    categorical_domain=categorical_attribute_to_domain[
                        attr_name])
            elif attr_name in numerical_attribute_ranges:
                column.infer_domain(
                    numerical_range=numerical_attribute_ranges[attr_name])
            else:
                column.infer_domain()

        # record attribute information in json format
        self.data_description['attribute_description'] = {}
        for attr, column in self.attr_to_column.items():
            self.data_description['attribute_description'][
                attr] = column.to_json()
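
# Hedged sketch: the keyword names match the signature above; the file names
# and attribute choices are illustrative assumptions.
describer = DataDescriber()
describer.describe_dataset_in_random_mode(
    'input.csv',
    attribute_to_is_categorical={'education': True},
    attribute_to_is_candidate_key={'ssn': True},
    seed=0)
describer.save_dataset_description_to_file('description.json')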
Example #7
    def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
        set_random_seed(seed)
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
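
# Hedged sketch of independent-attribute-mode generation, with illustrative
# file names.
generator = DataGenerator()
generator.generate_dataset_in_independent_mode(1000, 'description.json', seed=0)
generator.save_synthetic_data('independent_synthetic_data.csv')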
def synthetize():
	# get_ipython().run_line_magic('matplotlib', 'auto')
	# input dataset
	input_data = './census/adult_data.csv'
	# location of the two output files
	mode = 'correlated_attribute_mode'
	description_file = f'./census/out/{mode}/description.json'
	synthetic_data = f'./census/out/{mode}/synthetic_data.csv'


	# An attribute is categorical if its domain size is less than this threshold.
	# Here we modify the threshold to adapt to the domain size of "education" (which is 14 in the input dataset).
	threshold_value = 20

	# specify categorical attributes
	categorical_attributes = {'education': True, 'native-country': True}

	# specify which attributes are candidate keys of input dataset.
	candidate_keys = {'ssn': True}

	# A parameter in Differential Privacy. It roughly means that removing a row from the input dataset will not
	# change the probability of getting the same output by more than a multiplicative factor of exp(epsilon).
	# Increase the epsilon value to reduce the injected noise. Set epsilon=0 to turn off differential privacy.
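	# For example, with epsilon=0.1, removing one row changes the probability of
	# any given output by at most a factor of exp(0.1) ≈ 1.105.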
	epsilon = 0.1

	# The maximum number of parents in the Bayesian network, i.e., the maximum number of incoming edges.
	degree_of_bayesian_network = 2

	# Number of tuples generated in synthetic dataset.
	num_tuples_to_generate = 32561 # Here 32561 matches the size of the input dataset, but it can be set to any other number.


	# ### Step 3 DataDescriber
	# 
	# 1. Instantiate a DataDescriber.
	# 2. Compute the statistics of the dataset.
	# 3. Save dataset description to a file on local machine.

	describer = DataDescriber(category_threshold=threshold_value)
	describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data, 
	                                                        epsilon=epsilon, 
	                                                        k=degree_of_bayesian_network,
	                                                        attribute_to_is_categorical=categorical_attributes,
	                                                        attribute_to_is_candidate_key=candidate_keys)
	describer.save_dataset_description_to_file(description_file)

	display_bayesian_network(describer.bayesian_network)


	# ### Step 4 generate synthetic dataset
	# 
	# 1. Instantiate a DataGenerator.
	# 2. Generate a synthetic dataset.
	# 3. Save it to local machine.

	generator = DataGenerator()
	generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
	generator.save_synthetic_data(synthetic_data)


	# ### Step 5 compare the statistics of input and synthetic data (optional)
	# 
	# The synthetic data is already saved in a file by step 4. The ModelInspector provides a quick check of the similarity between the input and synthetic datasets.
	# 
	# #### 5.1 instantiate a ModelInspector.
	# 
	# It needs input dataset, synthetic dataset, and attribute description.

	# Read both datasets using Pandas.
	input_df = pd.read_csv(input_data, skipinitialspace=True)
	synthetic_df = pd.read_csv(synthetic_data)
	# Read attribute description from the dataset description file.
	attribute_description = read_json_file(description_file)['attribute_description']

	inspector = ModelInspector(input_df, synthetic_df, attribute_description)


	# #### 5.2 compare histograms between input and synthetic datasets.

	# Save one comparison plot per attribute. compare_histograms expects a
	# figure filepath, so the numbered file names below are an assumption.
	for plot_id, attribute in enumerate(synthetic_df.columns):
	    inspector.compare_histograms(
	        attribute, f'./census/out/{mode}/histogram_{plot_id}.png')


	# #### 5.3 compare pairwise mutual information

	inspector.mutual_information_heatmap()
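
# With the assumed './census' directory layout above in place, the whole
# pipeline can be run end to end:
if __name__ == '__main__':
    synthetize()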
def generate_data(username):
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)

    attribute_to_is_candidate = {
        attr: attr in configuration['candidate_atts']
        for attr in initial_dataset_info['attribute_list']
    }

    attribute_to_is_categorical = {
        attr: attr in configuration['categorical_atts']
        for attr in initial_dataset_info['attribute_list']
    }

    if configuration['tuple_n'] == '':
        n = initial_dataset_info['number_of_tuples']
    else:
        n = int(configuration['tuple_n'])

    # if configuration['categorical_threshold'] == '':
    #     categorical_threshold = 10
    # else:
    #     categorical_threshold = int(configuration['categorical_threshold'])

    if configuration['seed'] == '':
        seed = 0
    else:
        seed = int(configuration['seed'])

    generator = DataGenerator()
    if configuration['chose_mode'] == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(input_dataset_file, {},
                                                  attribute_to_is_categorical,
                                                  attribute_to_is_candidate,
                                                  seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:

        if configuration['histogram_size'] == '':
            histogram_size = 20
        else:
            histogram_size = int(configuration['histogram_size'])

        if configuration['epsilon'] == '':
            epsilon = 0.1
        else:
            epsilon = float(configuration['epsilon'])

        attribute_to_datatype = configuration['type_atts']

        describer = DataDescriber(histogram_size)
        if configuration['chose_mode'] == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif configuration['chose_mode'] == 'mode3':
            if configuration['max_degree'] == '':
                max_degree = 3
            else:
                max_degree = int(configuration['max_degree'])

            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
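
# Hedged sketch of the '{username}_parameters.json' layout that generate_data
# reads; the keys are taken from the code above, the values are illustrative
# (empty strings fall back to the defaults handled above):
#
# {
#     "chose_mode": "mode3",
#     "candidate_atts": ["ssn"],
#     "categorical_atts": ["education"],
#     "type_atts": {},
#     "tuple_n": "",
#     "seed": "",
#     "histogram_size": "",
#     "epsilon": "",
#     "max_degree": ""
# }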
def get_drawable_attributes(plot_json_file):
    plot_data = read_json_file(plot_json_file)
    return list(plot_data['barchart'].keys()) + list(
        plot_data['histogram'].keys())
def get_categorical_attributes(plot_json_file):
    plot_data = read_json_file(plot_json_file)
    return list(plot_data['barchart'].keys())
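
# Hedged usage sketch against a hypothetical plot file written by get_plot_data
# above:
#   get_drawable_attributes('alice_plot.json')     # categorical + numerical
#   get_categorical_attributes('alice_plot.json')  # categorical only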
Example #12
def res_json_processing_plot(request):
    passed_data_name = request.session.get('passed_data_name')
    plot_file = passed_data_name + "_plot.json"
    plot_json = read_json_file(plot_file)
    return HttpResponse(json.dumps(plot_json), content_type='application/json')
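
# An equivalent, slightly more idiomatic Django sketch for the same payload:
#   from django.http import JsonResponse
#   return JsonResponse(plot_json)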