def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Generate a synthetic dataset that preserves attribute correlations via the
    Bayesian network stored in the description file."""
    set_random_seed(seed)
    self.n = n
    self.description = read_json_file(description_file)
    all_attributes = self.description['meta']['all_attributes']
    candidate_keys = set(self.description['meta']['candidate_keys'])

    # Sample binning indices for the attributes covered by the Bayesian network.
    self.encoded_dataset = DataGenerator.generate_encoded_dataset(self.n, self.description)

    self.synthetic_dataset = DataFrame(columns=all_attributes)
    for attr in all_attributes:
        attr_info = self.description['attribute_description'][attr]
        column = parse_json(attr_info)

        if attr in self.encoded_dataset:
            # Attribute is in the Bayesian network: decode its sampled binning indices into values.
            self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(self.encoded_dataset[attr])
        elif attr in candidate_keys:
            self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
        else:
            # Attributes neither in the Bayesian network nor candidate keys fall back to
            # independent attribute mode.
            binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
            self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
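
# A minimal sketch of the description-file layout assumed above, reconstructed only from the
# keys these methods read ('meta', 'attribute_description', and per-attribute fields). Attribute
# names and values are hypothetical; real description files written by DataSynthesizer's
# DataDescriber contain additional fields (e.g. the Bayesian network used by correlated mode).
EXAMPLE_DESCRIPTION = {
    'meta': {
        'all_attributes': ['id', 'age'],   # hypothetical attribute names
        'candidate_keys': ['id'],
    },
    'attribute_description': {
        'id': {'data_type': 'Integer', 'is_categorical': False, 'is_candidate_key': True,
               'min': 0, 'max': 9999, 'distribution_bins': []},
        'age': {'data_type': 'Integer', 'is_categorical': True, 'is_candidate_key': False,
                'min': 18, 'max': 90, 'distribution_bins': [20, 30, 40, 50, 60]},
    },
}
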
def generate_dataset_in_random_mode(self, n, description_file, seed=0, minimum=0, maximum=100):
    """Generate a synthetic dataset by drawing every attribute uniformly at random,
    using only data types and category/length bounds from the description file."""
    set_random_seed(seed)
    description = read_json_file(description_file)

    self.synthetic_dataset = DataFrame()
    for attr in description['attribute_description']:
        attr_info = description['attribute_description'][attr]
        datatype = attr_info['data_type']
        is_categorical = attr_info['is_categorical']
        is_candidate_key = attr_info['is_candidate_key']

        if is_candidate_key:
            self.synthetic_dataset[attr] = parse_json(attr_info).generate_values_as_candidate_key(n)
        elif is_categorical:
            # Pick categories uniformly from the observed bins.
            self.synthetic_dataset[attr] = random.choice(attr_info['distribution_bins'], n)
        elif datatype == 'String':
            # Draw a random length per row, then generate a random string of that length.
            lengths = random.randint(attr_info['min'], attr_info['max'] + 1, n)
            self.synthetic_dataset[attr] = lengths
            self.synthetic_dataset[attr] = self.synthetic_dataset[attr].map(generate_random_string)
        elif datatype == 'Integer':
            self.synthetic_dataset[attr] = random.randint(minimum, maximum + 1, n)
        else:
            self.synthetic_dataset[attr] = random.uniform(minimum, maximum, n)
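
# A minimal usage sketch for random mode. The package import path and file names below are
# assumptions for illustration, not part of the method above; synthetic_dataset itself is a
# plain pandas DataFrame, so standard pandas I/O applies.
def _example_random_mode():
    from DataSynthesizer.DataGenerator import DataGenerator

    generator = DataGenerator()
    generator.generate_dataset_in_random_mode(n=1000,
                                              description_file='dataset_description.json',
                                              seed=42)
    generator.synthetic_dataset.to_csv('synthetic_random.csv', index=False)
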
def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
    """Generate a synthetic dataset by sampling each attribute from its own
    distribution, ignoring correlations between attributes."""
    set_random_seed(seed)
    self.description = read_json_file(description_file)
    all_attributes = self.description['meta']['all_attributes']
    candidate_keys = set(self.description['meta']['candidate_keys'])

    self.synthetic_dataset = DataFrame(columns=all_attributes)
    for attr in all_attributes:
        attr_info = self.description['attribute_description'][attr]
        column = parse_json(attr_info)
        if attr in candidate_keys:
            self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
        else:
            binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
            self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
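
# A minimal sketch contrasting the two description-driven modes, assuming the description file
# was produced in correlated attribute mode (so it contains the Bayesian network that
# generate_encoded_dataset needs). Import path and file name are illustrative assumptions.
def _example_compare_modes():
    from DataSynthesizer.DataGenerator import DataGenerator

    description_file = 'dataset_description.json'

    correlated = DataGenerator()
    correlated.generate_dataset_in_correlated_attribute_mode(1000, description_file, seed=0)

    independent = DataGenerator()
    independent.generate_dataset_in_independent_mode(1000, description_file, seed=0)

    # Per-attribute marginals should look similar in both modes; only cross-attribute
    # correlations differ, since independent mode samples each attribute on its own.
    print(correlated.synthetic_dataset.describe())
    print(independent.synthetic_dataset.describe())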