Example #1
    def describe_dataset_in_independent_attribute_mode(
            self,
            dataset_file,
            epsilon=0.1,
            attribute_to_datatype={},
            attribute_to_is_categorical={},
            attribute_to_is_candidate_key={},
            seed=0):

        utils.set_random_seed(seed)
        self.attribute_to_datatype = {
            attr: DataType(data_type)
            for attr, data_type in attribute_to_datatype.items()
        }
        self.attribute_to_is_categorical = dict(attribute_to_is_categorical)
        self.attribute_to_is_candidate_key = dict(
            attribute_to_is_candidate_key)
        self.read_dataset_from_csv(dataset_file)
        self.infer_attribute_data_types()
        self.get_dataset_meta_info()
        self.convert_input_dataset_into_a_dict_of_columns()
        self.infer_domains()
        self.inject_laplace_noise_into_distribution_per_attribute(epsilon)
        # record attribute information in JSON format
        self.dataset_description['attribute_description'] = {}
        for attr, column in self.input_dataset_as_column_dict.items():
            assert isinstance(column, AbstractAttribute)
            self.dataset_description['attribute_description'][attr] = column.to_json()
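
A minimal usage sketch for this method. The DataDescriber class name, the input file, and the attribute names are illustrative assumptions; the excerpt only shows the method defined on self.

    describer = DataDescriber()  # hypothetical host class for the method above
    describer.describe_dataset_in_independent_attribute_mode(
        'input.csv',                                   # illustrative file name
        epsilon=0.1,
        attribute_to_is_categorical={'education': True},
        seed=42)
    # the per-attribute descriptions are now JSON-ready dicts
    print(describer.dataset_description['attribute_description'].keys())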
Example #2
    def generate_dataset_in_random_mode(self, n, description_file, seed=0):
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = pd.DataFrame()
        for attr in description['attribute_description'].keys():
            attr_description = description['attribute_description'][attr]
            datatype = attr_description['datatype']
            is_categorical = attr_description['is_categorical']
            if is_categorical:
                self.synthetic_dataset[attr] = np.random.choice(
                    attr_description['distribution_bins'], n)
            elif datatype == 'string':
                # draw one length per row; assigning the original scalar length
                # would broadcast a single length to every row and yield an
                # empty column when the DataFrame has no rows yet
                lengths = np.random.randint(attr_description['min'],
                                            attr_description['max'], n)
                self.synthetic_dataset[attr] = [
                    generate_random_string(length) for length in lengths
                ]
            else:
                minimum, maximum = attr_description['min'], attr_description['max']
                if datatype == 'int':
                    self.synthetic_dataset[attr] = np.random.randint(
                        minimum, maximum + 1, n)
                else:
                    self.synthetic_dataset[attr] = np.random.uniform(
                        minimum, maximum, n)
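
The branching above is easy to exercise in isolation. Below is a self-contained sketch of the same per-attribute logic against a toy attribute description; all names in it are illustrative, not taken from the excerpt.

    import numpy as np
    import pandas as pd

    def sample_attribute(attr_description, n):
        # mirrors the branching above for a single attribute
        if attr_description['is_categorical']:
            return pd.Series(np.random.choice(attr_description['distribution_bins'], n))
        if attr_description['datatype'] == 'int':
            return pd.Series(np.random.randint(attr_description['min'],
                                               attr_description['max'] + 1, n))
        return pd.Series(np.random.uniform(attr_description['min'],
                                           attr_description['max'], n))

    toy = {'is_categorical': False, 'datatype': 'int', 'min': 18, 'max': 65}
    print(sample_attribute(toy, 5))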
Example #3
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        set_random_seed(seed)
        self.n = n
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in self.encoded_dataset:
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(
                    self.encoded_dataset[attr])
            elif attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                # attributes outside the Bayesian network that are not candidate
                # keys fall back to independent attribute mode
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
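
sample_values_from_binning_indices is not shown in these excerpts, but the decode step it implies can be sketched standalone: draw bin indices from a distribution, then map each index back to its bin value (toy data, illustrative names).

    import numpy as np

    bins = ['HS', 'BSc', 'MSc', 'PhD']
    probs = [0.4, 0.3, 0.2, 0.1]
    n = 6
    indices = np.random.choice(len(bins), size=n, p=probs)  # encoded rows
    values = np.asarray(bins)[indices]                      # decoded values
    print(indices, values)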
Example #4
    def generate_dataset_in_random_mode(self,
                                        n,
                                        description_file,
                                        seed=0,
                                        minimum=0,
                                        maximum=100):
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = DataFrame()
        for attr in description['attribute_description'].keys():
            attr_info = description['attribute_description'][attr]
            datatype = attr_info['data_type']
            is_categorical = attr_info['is_categorical']
            is_candidate_key = attr_info['is_candidate_key']
            if is_candidate_key:
                self.synthetic_dataset[attr] = parse_json(
                    attr_info).generate_values_as_candidate_key(n)
            elif is_categorical:
                self.synthetic_dataset[attr] = random.choice(
                    attr_info['distribution_bins'], n)
            elif datatype == 'String':
                # draw one length per row; a single scalar length would repeat
                # the same length on every row
                lengths = random.randint(attr_info['min'], attr_info['max'] + 1, n)
                self.synthetic_dataset[attr] = [
                    generate_random_string(length) for length in lengths
                ]
            else:
                if datatype == 'Integer':
                    self.synthetic_dataset[attr] = random.randint(
                        minimum, maximum + 1, n)
                else:
                    self.synthetic_dataset[attr] = random.uniform(
                        minimum, maximum, n)
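
Note that random.choice(..., n) and random.randint(..., n) in this example only match NumPy's numpy.random module; the standard-library random module has different signatures. A quick sketch of the distinction (the import alias is an assumption; the excerpt presumably does from numpy import random):

    from numpy import random as np_random
    import random as std_random

    bins = ['a', 'b', 'c']
    print(np_random.choice(bins, 4))    # NumPy: array of 4 draws
    print(std_random.choice(bins))      # stdlib: exactly one draw
    print(np_random.randint(0, 10, 4))  # NumPy: high exclusive, 4 values
    print(std_random.randint(0, 10))    # stdlib: both ends inclusive, one value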
Example #5
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        self.n = n
        set_random_seed(seed)
        self.description = read_json_file(description_file)
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)
        self.sample_from_encoded_dataset()
Example #6
    def describe_dataset_in_independent_attribute_mode(self,
                                                       dataset_file,
                                                       epsilon=0.1,
                                                       attribute_to_datatype={},
                                                       attribute_to_is_categorical={},
                                                       seed=0):
        utils.set_random_seed(seed)
        self.attribute_to_datatype = dict(attribute_to_datatype)
        self.attribute_to_is_categorical = dict(attribute_to_is_categorical)
        self.read_dataset_from_csv(dataset_file)
        self.get_dataset_meta_info()
        self.infer_attribute_datatypes()
        self.infer_domains()
        self.inject_laplace_noise_into_distribution_per_attribute(epsilon)
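
inject_laplace_noise_into_distribution_per_attribute is not shown in these excerpts. As a rough self-contained sketch of the differential-privacy idea behind it: perturb each attribute's empirical distribution with Laplace noise, then clip and renormalize. The noise scale below is illustrative only; the library's exact calibration may differ.

    import numpy as np

    def noisy_distribution(probs, n_rows, epsilon):
        scale = 1.0 / (n_rows * epsilon)  # illustrative scale, not the library's formula
        noisy = np.asarray(probs) + np.random.laplace(0, scale, len(probs))
        noisy = np.clip(noisy, 0, None)   # keep probabilities non-negative
        return noisy / noisy.sum()        # renormalize to sum to 1

    print(noisy_distribution([0.5, 0.3, 0.2], n_rows=1000, epsilon=0.1))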
Example #7
    def describe_dataset_in_random_mode(
            self,
            dataset_file: str,
            attribute_to_datatype: Dict[str, DataType] = None,
            attribute_to_is_categorical: Dict[str, bool] = None,
            attribute_to_is_candidate_key: Dict[str, bool] = None,
            categorical_attribute_domain_file: str = None,
            numerical_attribute_ranges: Dict[str, List] = None,
            seed=0):
        attribute_to_datatype = attribute_to_datatype or {}
        attribute_to_is_categorical = attribute_to_is_categorical or {}
        attribute_to_is_candidate_key = attribute_to_is_candidate_key or {}
        numerical_attribute_ranges = numerical_attribute_ranges or {}

        if categorical_attribute_domain_file:
            categorical_attribute_to_domain = utils.read_json_file(
                categorical_attribute_domain_file)
        else:
            categorical_attribute_to_domain = {}

        utils.set_random_seed(seed)
        self.attr_to_datatype = {
            attr: DataType(datatype)
            for attr, datatype in attribute_to_datatype.items()
        }
        self.attr_to_is_categorical = attribute_to_is_categorical
        self.attr_to_is_candidate_key = attribute_to_is_candidate_key
        self.read_dataset_from_csv(dataset_file)
        self.infer_attribute_data_types()
        self.analyze_dataset_meta()
        self.represent_input_dataset_by_columns()

        for column in self.attr_to_column.values():
            attr_name = column.name
            if attr_name in categorical_attribute_to_domain:
                column.infer_domain(
                    categorical_domain=categorical_attribute_to_domain[attr_name])
            elif attr_name in numerical_attribute_ranges:
                column.infer_domain(
                    numerical_range=numerical_attribute_ranges[attr_name])
            else:
                column.infer_domain()

        # record attribute information in JSON format
        self.data_description['attribute_description'] = {}
        for attr, column in self.attr_to_column.items():
            self.data_description['attribute_description'][attr] = column.to_json()
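
A hedged usage sketch of the override precedence this method implements: an explicit categorical domain file wins, then explicit numerical ranges, then plain inference. The describer class, file names, and attribute names are illustrative assumptions.

    describer = DataDescriber()  # hypothetical host class for the method above
    describer.describe_dataset_in_random_mode(
        'input.csv',                                    # illustrative file name
        attribute_to_is_categorical={'education': True},
        categorical_attribute_domain_file='domains.json',
        numerical_attribute_ranges={'age': [0, 100]},
        seed=42)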
Example #8
    def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
        set_random_seed(seed)
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
Example #9
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        self.n = n
        set_random_seed(seed)
        self.description = read_json_file(description_file)
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)

        # # use independent attribute mode for attributes ignored by BN, which are non-categorical strings.
        # for attr in self.description['meta']['attributes_ignored_by_BN']:
        #     attr_info = self.description['attribute_description'][attr]
        #     bins = attr_info['distribution_bins']
        #     probs = attr_info['distribution_probabilities']
        #     self.encoded_dataset[attr] = np.random.choice(list(range(len(bins))), size=n, p=probs)

        self.sample_from_encoded_dataset()
Example #10
    def generate_dataset_in_independent_mode(self,
                                             n,
                                             description_file,
                                             seed=0):
        set_random_seed(seed)
        self.description = read_json_file(description_file)

        attributes = self.description['meta']['attribute_list']
        self.encoded_dataset = pd.DataFrame(columns=attributes,
                                            index=list(range(n)))
        for attr in attributes:
            attr_info = self.description['attribute_description'][attr]
            bins = attr_info['distribution_bins']
            probs = attr_info['distribution_probabilities']
            self.encoded_dataset[attr] = np.random.choice(
                list(range(len(bins))), size=n, p=probs)

        self.sample_from_encoded_dataset()
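
The loop in Example #10 can be exercised end to end with a toy description dict; everything below is illustrative scaffolding around the same np.random.choice call.

    import numpy as np
    import pandas as pd

    description = {
        'meta': {'attribute_list': ['education']},
        'attribute_description': {
            'education': {
                'distribution_bins': ['HS', 'BSc', 'MSc'],
                'distribution_probabilities': [0.5, 0.3, 0.2],
            }
        }
    }
    n = 5
    encoded = pd.DataFrame(index=range(n))
    for attr in description['meta']['attribute_list']:
        info = description['attribute_description'][attr]
        encoded[attr] = np.random.choice(len(info['distribution_bins']),
                                         size=n,
                                         p=info['distribution_probabilities'])
    print(encoded)  # bin indices; the real generator decodes these to values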