# Example #1
    def generate_dataset_in_random_mode(self, n, description_file, seed=0):
        """Generate a synthetic dataset of ``n`` rows by sampling every
        attribute independently at random, ignoring learned correlations.

        Args:
            n: Number of rows to generate.
            description_file: Path to the JSON dataset description produced
                by the describer; must contain ``attribute_description``.
            seed: Random seed for reproducibility.

        Side effects:
            Stores the result in ``self.synthetic_dataset`` (a DataFrame).
        """
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = pd.DataFrame()
        for attr in description['attribute_description'].keys():
            attr_description = description['attribute_description'][attr]
            datatype = attr_description['datatype']
            is_categorical = attr_description['is_categorical']
            if is_categorical:
                # Draw n values uniformly from the observed category bins.
                self.synthetic_dataset[attr] = np.random.choice(
                    attr_description['distribution_bins'], n)
            elif datatype == 'string':
                # BUGFIX: draw one length per row instead of a single scalar.
                # The old scalar assignment produced a zero-row column when
                # this was the first attribute (empty DataFrame) and gave
                # every string the identical length otherwise. Also use
                # max + 1 so 'max' is inclusive, matching the int branch.
                lengths = np.random.randint(attr_description['min'],
                                            attr_description['max'] + 1, n)
                self.synthetic_dataset[attr] = [
                    generate_random_string(length) for length in lengths
                ]
            else:
                minimum, maximum = attr_description['min'], attr_description[
                    'max']
                if datatype == 'int':
                    # randint's upper bound is exclusive; +1 makes it inclusive.
                    self.synthetic_dataset[attr] = np.random.randint(
                        minimum, maximum + 1, n)
                else:
                    self.synthetic_dataset[attr] = np.random.uniform(
                        minimum, maximum, n)
# Example #2
    def sample_from_encoded_dataset(self):
        """Decode ``self.encoded_dataset`` into concrete values.

        Each cell holds a bin index; it is replaced by a value drawn
        uniformly from that bin. Integer columns are cast back to int and
        non-categorical string columns are regenerated as random strings,
        skipping nulls in both cases. Columns are finally reordered to
        match the description's attribute list.
        """
        self.synthetic_dataset = self.encoded_dataset.copy()
        attr_descriptions = self.description['attribute_description']
        for attr in self.synthetic_dataset:
            info = attr_descriptions[attr]
            dtype = info['datatype']
            is_cat = info['is_categorical']
            # Default-bind attr so the lambda is self-contained per column.
            self.synthetic_dataset[attr] = self.synthetic_dataset[attr].apply(
                lambda code, a=attr: self.sample_uniformly_for_attribute(
                    a, int(code)))
            non_null = ~self.synthetic_dataset[attr].isnull()
            if dtype == 'integer':
                self.synthetic_dataset[attr] = (
                    self.synthetic_dataset[non_null][attr].astype(int))
            elif dtype == 'string' and not is_cat:
                self.synthetic_dataset[attr] = (
                    self.synthetic_dataset[non_null][attr].map(
                        lambda code: generate_random_string(int(code))))

        # Keep only known attributes, in the description's declared order.
        ordered = [
            attr for attr in self.description['meta']['attribute_list']
            if attr in self.synthetic_dataset
        ]
        self.synthetic_dataset = self.synthetic_dataset.loc[:, ordered]
    def sample_values_from_binning_indices(self, binning_indices):
        """Sample concrete values for ``binning_indices`` via the parent
        class, then — for non-categorical attributes — replace each
        non-null sampled code with a random string of that length."""
        column = super().sample_values_from_binning_indices(binning_indices)
        if self.is_categorical:
            return column
        mask = ~column.isnull()
        column[mask] = column[mask].apply(
            lambda code: utils.generate_random_string(int(code)))
        return column
# Example #4
    def generate_dataset_in_random_mode(self,
                                        n,
                                        description_file,
                                        seed=0,
                                        minimum=0,
                                        maximum=100):
        """Generate a synthetic dataset of ``n`` rows, sampling every
        attribute independently at random (no learned correlations).

        Args:
            n: Number of rows to generate.
            description_file: Path to the JSON dataset description; must
                contain ``attribute_description``.
            seed: Random seed for reproducibility.
            minimum: Lower bound for non-categorical numeric attributes.
            maximum: Upper bound (inclusive for ints) for numeric attributes.

        Side effects:
            Stores the result in ``self.synthetic_dataset`` (a DataFrame).
        """
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = DataFrame()
        for attr in description['attribute_description'].keys():
            attr_info = description['attribute_description'][attr]
            datatype = attr_info['data_type']
            is_categorical = attr_info['is_categorical']
            is_candidate_key = attr_info['is_candidate_key']
            if is_candidate_key:
                # Candidate keys must stay unique; delegate generation to
                # the attribute object parsed from its description.
                self.synthetic_dataset[attr] = parse_json(
                    attr_info).generate_values_as_candidate_key(n)
            elif is_categorical:
                self.synthetic_dataset[attr] = random.choice(
                    attr_info['distribution_bins'], n)
            elif datatype == 'String':
                # BUGFIX: draw one length per row instead of a single scalar.
                # The old scalar assignment produced a zero-row column when
                # this was the first attribute (empty DataFrame) and gave
                # every string the identical length otherwise.
                lengths = random.randint(attr_info['min'],
                                         attr_info['max'] + 1, n)
                self.synthetic_dataset[attr] = [
                    generate_random_string(length) for length in lengths
                ]
            else:
                if datatype == 'Integer':
                    # randint's upper bound is exclusive; +1 includes maximum.
                    self.synthetic_dataset[attr] = random.randint(
                        minimum, maximum + 1, n)
                else:
                    self.synthetic_dataset[attr] = random.uniform(
                        minimum, maximum, n)
 def generate_values_as_candidate_key(self, n):
     """Return ``n`` unique values for a candidate-key attribute.

     One prefix length is drawn from [self.min, self.max); each value is a
     fresh random string of that length with the row index appended, so
     uniqueness is guaranteed by the index suffix.
     """
     prefix_length = np.random.randint(self.min, self.max)
     make_value = np.vectorize(
         lambda idx: '{}{}'.format(
             utils.generate_random_string(prefix_length), idx))
     return make_value(np.arange(n))