def __init__(self, domain, jtreepy, _lambda=0.2):
    """
    Using linear programming method to find a less noise variance.

    param:
        domain: the domain of the given data; only ``domain.values()``
            is read here, one entry per column.
            (note) the order of columns in domain should be the same
            as in the original data.
        jtreepy: the structure of the junction tree; it is measured
            with ``len`` and iterated clique by clique.
        _lambda: the balance number, stored as a float.

    TODO:
        1. Move jt_rep to Junction Tree Module
        2. Move Different Operator to a linear algebra package
    """
    self.LOG = Base.get_logger("CliqueMerge")
    self.domain = domain

    # One cardinality per column, in the iteration order of the domain.
    self.node_card = [len(column_values) for column_values in domain.values()]
    self._lambda = float(_lambda)
    self.max_iter = 20  # presumably an iteration cap used elsewhere - TODO confirm
    self.jtree = jtreepy
    self.nodes_num = len(self.node_card)

    clique_count = len(jtreepy)
    self.cliques_num = clique_count

    # With fewer than two cliques the only candidate value is 1.
    if clique_count >= 2:
        self.cnum = range(2, clique_count + 1)
    else:
        self.cnum = [1]

    # Pass every clique through find_subset_index, keeping the tree order.
    clique_indexes = []
    for clique in jtreepy:
        clique_indexes.append(self.find_subset_index(clique))
    self.jtree_in_node_index = clique_indexes
def __init__(self, data=None, edges=None, noise_flag=True, white_list=None,
             eps1_val=c.EPSILON_1, cramer=0.2):
    """
    Initialize the attribute dependency graph.

    Input:
        data: the data object (original note: DataUtils.Data). When it
            is None the supplied ``edges`` are used as given; otherwise
            the edges come from ``self._run()``.
        edges: pre-computed edges, only used when ``data`` is None.
        noise_flag: stored on the instance (default True).
        white_list: stored on the instance; when omitted, every
            instance gets its own new empty list.
        eps1_val: stored on the instance (default ``c.EPSILON_1``).
        cramer: stored on the instance (default 0.2).

    Procedure (per the original notes; ``_run`` is not visible here,
    TODO confirm):
        1. Convert the given data frame to a dataframe in R
        2. Convert the given Domain (a python dict) to ListVector
        3. Instantiate the attributes dependency.
    """
    self.LOG = Base.get_logger("DepGraph")
    self.noise_flag = noise_flag
    self.eps1_val = eps1_val
    self.cramer = cramer
    self.data = data
    if data is None:
        self.edges = edges
    else:
        self.edges = self._run()
    # A literal [] default would be one list object shared by every
    # instance built without a white list; create a fresh one instead.
    # A caller-supplied list is still stored as-is (no copy).
    self.white_list = [] if white_list is None else white_list
def __init__(self, sensitive_data):
    """
    Set up the utility measurement object from the original data.

    Parameters
    ----------
    sensitive_data: string
        The path to the original data.
    """
    self.LOG = Base.get_logger("UserQuery")
    # Load the file through DataUtils and keep what get_pandas_df() returns.
    self.sensitive_df = DataUtils(file_path=sensitive_data).get_pandas_df()
def __init__(self, sensitive_data):
    """
    Read the original data and prepare the utility measurement object.

    Parameters
    ----------
    sensitive_data: string
        The path to the original data.
    """
    self.LOG = Base.get_logger("UserQuery")

    # Wrap the file in a DataUtils object first, then store the
    # result of its get_pandas_df() call on the instance.
    original_data = DataUtils(file_path=sensitive_data)
    original_frame = original_data.get_pandas_df()
    self.sensitive_df = original_frame
def __init__(self, file_path=None, selected_attrs=None, pandas_df=None,
             valbin_maps=None, names=None, specified_c_domain=None,
             chunk_size=-1, date_format=None):
    """
    Loading data to wrapped python object

    Parameters
    ----------
    file_path: string
        The path of original data
    selected_attrs: dict
        { "A":"C", "B":"D", ... }
        When given, it is stored on the instance and the dataframe is
        restricted to its keys; when None, ``self.selected_attrs`` is
        not set here.
    pandas_df: Pandas dataframe
        Initialize with a pandas dataframe (TODO: deprecated)
    valbin_maps: dict
        A mapping of original values with coarse value; an empty dict
        is used when omitted
    names: list, experiment
        A list to specify the attributes' names when the input file
        has no header
    specified_c_domain: dict, experiment
        A mapping of continuous type attributes with the specified
        edges in coarse
    chunk_size: int
        When greater than 0 the data is loaded with ``_loading_chunk``,
        otherwise with ``_loading``
    date_format:
        Stored on the instance; not otherwise used here
    """
    self.LOG = Base.get_logger("DataUtils")

    if valbin_maps is None:
        valbin_maps = dict()
    self.valbin_maps = valbin_maps
    self.chunk_size = chunk_size

    # A positive chunk size selects the chunked loader.
    loader = self._loading_chunk if chunk_size > 0 else self._loading
    self.dataframe = loader(file_path, pandas_df, names)

    if selected_attrs is not None:
        self.selected_attrs = selected_attrs
        # the 'selected_attrs' is ordered
        wanted_columns = selected_attrs.keys()
        self.dataframe = self.dataframe[wanted_columns]

    self.preview_count = 5
    self.specified_c_domain = specified_c_domain
    self.date_format = date_format
def __init__(self, file_path=None, selected_attrs=None, pandas_df=None,
             valbin_maps=None, names=None, specified_c_domain=None,
             chunk_size=-1, date_format=None):
    """
    Load data into a wrapped python object.

    Parameters
    ----------
    file_path: string
        The path of original data
    selected_attrs: dict
        { "A":"C", "B":"D", ... }
        If provided, it is kept on the instance and only its keys are
        retained as dataframe columns; if None, the attribute
        ``selected_attrs`` is left unset by this method.
    pandas_df: Pandas dataframe
        Initialize with a pandas dataframe (TODO: deprecated)
    valbin_maps: dict
        A mapping of original values with coarse value (defaults to a
        new empty dict)
    names: list, experiment
        A list to specify the attributes' names when the input file
        has no header
    specified_c_domain: dict, experiment
        A mapping of continuous type attributes with the specified
        edges in coarse
    chunk_size: int
        Values above 0 route loading through ``_loading_chunk``;
        anything else uses ``_loading``
    date_format:
        Kept on the instance as-is
    """
    self.LOG = Base.get_logger("DataUtils")
    self.valbin_maps = valbin_maps if valbin_maps is not None else dict()
    self.chunk_size = chunk_size

    # Both loaders receive the same three arguments.
    load_args = (file_path, pandas_df, names)
    use_chunks = chunk_size > 0
    if use_chunks:
        self.dataframe = self._loading_chunk(*load_args)
    else:
        self.dataframe = self._loading(*load_args)

    if selected_attrs is not None:
        self.selected_attrs = selected_attrs
        # the 'selected_attrs' is ordered
        full_frame = self.dataframe
        self.dataframe = full_frame[selected_attrs.keys()]

    self.preview_count = 5
    self.specified_c_domain = specified_c_domain
    self.date_format = date_format
def __init__(self, data, jtree_path, domain, cluster, histogramdds, epsilon=0.0):
    """
    Initialize the inference class.

    TODO:
        1. refactor, the data_path, edges, nodes, domain are temporary
           to be here.

    param data: the data object; ``get_count()`` is called on it to
        obtain the data size (the original note calls it "the pandas
        dataframe" - TODO confirm the actual type)
        TODO: Because the DPTable algorithm constructs lots of
        attributes when reading data, to use a memory cache one should
        refactor the inference step of DPTable.
    param jtree_path: stored on the instance as-is
    param domain: data information with format in dictionary
        {
            "A":[1,2,3,4,5],
            "B":[2,3,4,5,6]
        }
        It is passed through ``convert2rdomain`` before being stored.
    param cluster: the merged cluster structure; every inner
        collection is stored sorted
    param histogramdds: stored on the instance as-is
    param epsilon: the privacy budget
    """
    self.LOG = Base.get_logger("Inference")
    self.data = data
    self.data_size = data.get_count()
    self.epsilon = epsilon
    self.rdomain = self.convert2rdomain(domain)

    # Sort the members of each cluster; the outer order is untouched.
    self.cluster = [sorted(members) for members in cluster]

    self.jtree_path = jtree_path
    self.histogramdds = histogramdds
def __init__(self, domain, jtreepy, _lambda=0.2):
    """
    Using linear programming method to find a less noise variance.

    param:
        domain: the domain of the given data (its ``values()`` supply
            one collection per column)
            (note) the order of columns in domain should be the same
            as in the original data
        jtreepy: the structure of the junction tree, given as a sized
            collection of cliques
        _lambda: the balance number; converted with ``float``

    TODO:
        1. Move jt_rep to Junction Tree Module
        2. Move Different Operator to a linear algebra package
    """
    self.LOG = Base.get_logger("CliqueMerge")
    self.domain = domain

    # Cardinality of each column, following the domain's own ordering.
    cardinalities = [len(values) for values in domain.values()]
    self.node_card = cardinalities

    self._lambda = float(_lambda)
    self.max_iter = 20
    self.jtree = jtreepy
    self.nodes_num = len(cardinalities)

    clique_total = len(jtreepy)
    self.cliques_num = clique_total
    # Below two cliques the candidate list collapses to [1].
    self.cnum = [1] if clique_total < 2 else range(2, clique_total + 1)

    # Each clique goes through find_subset_index, in tree order.
    self.jtree_in_node_index = [
        self.find_subset_index(members) for members in jtreepy
    ]
def __init__(self, edges, nodes, jtree_path=None):
    """
    Build the junction tree and store it on the instance.

    param edges: the edges; passed through ``convert2rlistofvector``
        before the tree is built
    param nodes: the nodes, forwarded to ``_build_jtree`` unchanged
    param jtree_path: optional, forwarded to ``_build_jtree``
        unchanged (default None)
    """
    # The conversion runs before the logger is attached.
    converted_edges = self.convert2rlistofvector(edges)
    self.LOG = Base.get_logger("JunctionTree")
    self.jtree = self._build_jtree(converted_edges, nodes, jtree_path)
def __init__(self, edges, nodes, jtree_path=None):
    """
    Construct the junction tree from the given edges and nodes.

    param edges: the edges; converted with ``convert2rlistofvector``
        (presumably into an R list of vectors - TODO confirm) and
        then handed to ``_build_jtree``
    param nodes: the nodes, handed to ``_build_jtree`` as given
    param jtree_path: optional value handed to ``_build_jtree`` as
        given; defaults to None
    """
    # Convert first; the logger is only set afterwards.
    r_edges = self.convert2rlistofvector(edges)
    self.LOG = Base.get_logger("JunctionTree")

    built_tree = self._build_jtree(r_edges, nodes, jtree_path)
    self.jtree = built_tree