def describe_dataset_in_correlated_attribute_mode(
            self,
            dataset_file,
            k=0,
            epsilon=0.1,
            attribute_to_datatype_dict=None,
            categorical_attributes=None,
            seed=0):
        """Generate dataset description using correlated attribute mode.

        Users only need to call this function. It first runs the independent
        attribute mode description, then encodes the dataset, builds a Bayesian
        network with ``greedy_bayes`` and stores the network together with the
        noisy conditional distributions in ``self.dataset_description``.

        Args:
            dataset_file: string, directory and file name of the sensitive dataset as input in csv format.
            k: int, forwarded to ``greedy_bayes`` (maximum number of parents in the Bayesian network).
            epsilon: float, a parameter in differential privacy.
            attribute_to_datatype_dict: Dict, mappings of {column_name: data_type}, e.g., {"gender": "string"}.
                ``None`` (the default) is treated as an empty dict.
            categorical_attributes: Set or List, e.g., {"gender", "nationality"}.
                ``None`` (the default) is treated as an empty container.
            seed: int or float, seeding the randomness.
        """
        # Build fresh containers on every call: a literal `{}` default would be
        # one shared object that any callee mutation leaks into later calls.
        if attribute_to_datatype_dict is None:
            attribute_to_datatype_dict = {}
        if categorical_attributes is None:
            categorical_attributes = {}

        self.describe_dataset_in_independent_attribute_mode(
            dataset_file, epsilon, attribute_to_datatype_dict,
            categorical_attributes, seed)
        self.encoded_dataset = self.encode_dataset_into_interval_indices()

        # The network is learned from the raw input columns that survived
        # encoding, while the conditional distributions use the encoded data.
        # NOTE(review): the full `epsilon` is handed to both steps below --
        # confirm whether the privacy budget is meant to be split between them.
        self.bayesian_network = greedy_bayes(
            self.input_dataset[self.encoded_dataset.columns], k, epsilon)
        self.dataset_description['bayesian_network'] = self.bayesian_network
        self.dataset_description[
            'conditional_probabilities'] = construct_noisy_conditional_distributions(
                self.bayesian_network, self.encoded_dataset, epsilon)
Exemple #2
0
    def describe_dataset_in_correlated_attribute_mode(self,
                                                      dataset_file,
                                                      k=0,
                                                      epsilon=0.1,
                                                      attribute_to_datatype: Dict[str, DataType] = None,
                                                      attribute_to_is_categorical: Dict[str, bool] = None,
                                                      attribute_to_is_candidate_key: Dict[str, bool] = None,
                                                      categorical_attribute_domain_file: str = None,
                                                      numerical_attribute_ranges: Dict[str, List] = None,
                                                      seed=0,
                                                      bayesian_network: List[tuple] = None,
                                                      user_pool: int = 1):
        """Generate dataset description using correlated attribute mode.

        Runs the independent attribute mode description first, encodes the dataset into binning indices, obtains a
        Bayesian network (learned with ``greedy_bayes`` or supplied by the caller) and stores the network and the noisy
        conditional distributions in ``self.data_description``.

        Parameters
        ----------
        dataset_file : str
            File name (with directory) of the sensitive dataset as input in csv format.
        k : int
            Maximum number of parents in Bayesian network.
        epsilon : float
            A parameter in Differential Privacy. Increase epsilon value to reduce the injected noises. Set epsilon=0 to turn
            off Differential Privacy.
        attribute_to_datatype : dict
            Dictionary of {attribute: datatype}, e.g., {"age": "Integer", "gender": "String"}.
        attribute_to_is_categorical : dict
            Dictionary of {attribute: boolean}, e.g., {"gender":True, "age":False}.
        attribute_to_is_candidate_key: dict
            Dictionary of {attribute: boolean}, e.g., {"id":True, "name":False}.
        categorical_attribute_domain_file: str
            File name of a JSON file of some categorical attribute domains.
        numerical_attribute_ranges: dict
            Dictionary of {attribute: [min, max]}, e.g., {"age": [25, 65]}
        seed : int or float
            Seed the random number generator.
        bayesian_network : list of (attribute, parents) tuples, or dict of {attribute: parents}
            Optional pre-configured Bayesian network. When omitted, None or empty, the network is learned with
            ``greedy_bayes``.
        user_pool : int
            Forwarded unchanged to ``greedy_bayes`` as its fourth argument (presumably a worker/pool size -- confirm
            against ``greedy_bayes``).

        Raises
        ------
        Exception
            If the encoded dataset has fewer than 2 columns.
        """
        self.describe_dataset_in_independent_attribute_mode(dataset_file,
                                                            epsilon,
                                                            attribute_to_datatype,
                                                            attribute_to_is_categorical,
                                                            attribute_to_is_candidate_key,
                                                            categorical_attribute_domain_file,
                                                            numerical_attribute_ranges,
                                                            seed)
        self.df_encoded = self.encode_dataset_into_binning_indices()
        if self.df_encoded.shape[1] < 2:
            raise Exception("Correlated Attribute Mode requires at least 2 attributes(i.e., columns) in dataset.")

        # Truthiness covers the default None as well as an empty list/dict; calling len() on the None default used to
        # raise TypeError before any network could be built.
        if not bayesian_network:
            # Half of the budget goes to structure learning, the other half to the conditional distributions below.
            self.bayesian_network = greedy_bayes(self.df_encoded, k, epsilon / 2, user_pool)
        else:
            print('Pre-configured Bayesian Network (BN) is used')
            # Accept both the annotated list of (attribute, parents) pairs and a {attribute: parents} mapping.
            if hasattr(bayesian_network, 'items'):
                pairs = bayesian_network.items()
            else:
                pairs = bayesian_network
            self.bayesian_network = [(var, parents) for var, parents in pairs]
        print(self.bayesian_network)
        self.data_description['bayesian_network'] = self.bayesian_network
        self.data_description['conditional_probabilities'] = construct_noisy_conditional_distributions(
            self.bayesian_network, self.df_encoded, epsilon / 2)
    def describe_dataset_in_correlated_attribute_mode(
            self,
            dataset_file,
            k=0,
            epsilon=0.1,
            attribute_to_datatype=None,
            attribute_to_is_categorical=None,
            attribute_to_is_candidate_key=None,
            seed=0):
        """Generate dataset description using correlated attribute mode.

        Users only need to call this function. It runs the independent
        attribute mode description, encodes the dataset into binning indices,
        learns a Bayesian network with ``greedy_bayes`` and stores the network
        and the noisy conditional distributions in ``self.dataset_description``.

        Parameters
        ----------
            dataset_file : str
                File name (with directory) of the sensitive dataset as input in csv format.
            k : int
                Maximum number of parents in Bayesian network.
            epsilon : float
                A parameter in differential privacy.
            attribute_to_datatype : dict
                Mappings of {attribute: datatype}, e.g., {"age": "Integer", "gender": "String"}.
                ``None`` (the default) is treated as an empty dict.
            attribute_to_is_categorical : dict
                Mappings of {attribute: boolean}, e.g., {"gender":True, "age":False}.
                ``None`` (the default) is treated as an empty dict.
            attribute_to_is_candidate_key: dict
                Mappings of {attribute: boolean}, e.g., {"id":True, "name":False}.
                ``None`` (the default) is treated as an empty dict.
            seed : int or float
                Seed the random number generator.

        Raises
        ------
            Exception
                If the encoded dataset has fewer than 2 columns.
        """
        # Build fresh dicts on every call: a literal `{}` default would be one
        # shared object that any callee mutation leaks into later calls.
        if attribute_to_datatype is None:
            attribute_to_datatype = {}
        if attribute_to_is_categorical is None:
            attribute_to_is_categorical = {}
        if attribute_to_is_candidate_key is None:
            attribute_to_is_candidate_key = {}

        self.describe_dataset_in_independent_attribute_mode(
            dataset_file, epsilon, attribute_to_datatype,
            attribute_to_is_categorical, attribute_to_is_candidate_key, seed)
        self.encoded_dataset = self.encode_dataset_into_binning_indices()
        if self.encoded_dataset.shape[1] < 2:
            raise Exception(
                "Constructing Bayesian Network needs more attributes.")

        # NOTE(review): the full `epsilon` is handed to both steps below --
        # confirm whether the privacy budget is meant to be split between them.
        self.bayesian_network = greedy_bayes(self.encoded_dataset, k, epsilon)
        self.dataset_description['bayesian_network'] = self.bayesian_network
        self.dataset_description[
            'conditional_probabilities'] = construct_noisy_conditional_distributions(
                self.bayesian_network, self.encoded_dataset, epsilon)
Exemple #4
0
    def describe_dataset_in_correlated_attribute_mode(
            self,
            dataset_file,
            k=0,
            epsilon=0.1,
            attribute_to_datatype=None,
            attribute_to_is_categorical=None,
            seed=0):
        """Generate dataset description using correlated attribute mode.

        Users only need to call this function. It runs the independent
        attribute mode description, encodes the dataset into interval indices,
        learns a Bayesian network with ``greedy_bayes`` and stores the network
        and the noisy conditional distributions in ``self.dataset_description``.

        Parameters
        ----------
            dataset_file : str
                File name (with directory) of the sensitive dataset as input in csv format.
            k : int
                Maximum number of parents in Bayesian network.
            epsilon : float
                A parameter in differential privacy.
            attribute_to_datatype : dict
                Mappings of {attribute: datatype}, e.g., {"age": "int"}.
                ``None`` (the default) is treated as an empty dict.
            attribute_to_is_categorical : dict
                Mappings of {attribute: boolean}, e.g., {"age":False}.
                ``None`` (the default) is treated as an empty dict.
            seed : int or float
                Seed the random number generator.

        Raises
        ------
            Exception
                If the encoded dataset has fewer than 2 columns.
        """
        # Build fresh dicts on every call: a literal `{}` default would be one
        # shared object that any callee mutation leaks into later calls.
        if attribute_to_datatype is None:
            attribute_to_datatype = {}
        if attribute_to_is_categorical is None:
            attribute_to_is_categorical = {}

        self.describe_dataset_in_independent_attribute_mode(
            dataset_file, epsilon, attribute_to_datatype,
            attribute_to_is_categorical, seed)
        self.encoded_dataset = self.encode_dataset_into_interval_indices()

        if self.encoded_dataset.columns.size < 2:
            raise Exception(
                "Current dataset doesn't have enough attributes to build Bayesian network in correlated attribute mode."
            )

        # The network is learned from the raw input columns that survived
        # encoding, while the conditional distributions use the encoded data.
        # NOTE(review): the full `epsilon` is handed to both steps below --
        # confirm whether the privacy budget is meant to be split between them.
        self.bayesian_network = greedy_bayes(
            self.input_dataset[self.encoded_dataset.columns], k, epsilon)
        self.dataset_description['bayesian_network'] = self.bayesian_network
        self.dataset_description[
            'conditional_probabilities'] = construct_noisy_conditional_distributions(
                self.bayesian_network, self.encoded_dataset, epsilon)