Exemple #1
0
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        set_random_seed(seed)
        self.n = n
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in self.encoded_dataset:
                self.synthetic_dataset[
                    attr] = column.sample_values_from_binning_indices(
                        self.encoded_dataset[attr])
            elif attr in candidate_keys:
                self.synthetic_dataset[
                    attr] = column.generate_values_as_candidate_key(n)
            else:
                # for attributes not in BN or candidate keys, use independent attribute mode.
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(
                    n)
                self.synthetic_dataset[
                    attr] = column.sample_values_from_binning_indices(
                        binning_indices)
Exemple #2
0
    def generate_dataset_in_random_mode(self,
                                        n,
                                        description_file,
                                        seed=0,
                                        minimum=0,
                                        maximum=100):
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = DataFrame()
        for attr in description['attribute_description'].keys():
            attr_info = description['attribute_description'][attr]
            datatype = attr_info['data_type']
            is_categorical = attr_info['is_categorical']
            is_candidate_key = attr_info['is_candidate_key']
            if is_candidate_key:
                self.synthetic_dataset[attr] = parse_json(
                    attr_info).generate_values_as_candidate_key(n)
            elif is_categorical:
                self.synthetic_dataset[attr] = random.choice(
                    attr_info['distribution_bins'], n)
            elif datatype == 'String':
                length = random.randint(attr_info['min'], attr_info['max'] + 1)
                self.synthetic_dataset[attr] = length
                self.synthetic_dataset[attr] = self.synthetic_dataset[
                    attr].map(lambda x: generate_random_string(x))
            else:
                if datatype == 'Integer':
                    self.synthetic_dataset[attr] = random.randint(
                        minimum, maximum + 1, n)
                else:
                    self.synthetic_dataset[attr] = random.uniform(
                        minimum, maximum, n)
    def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
        set_random_seed(seed)
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)