예제 #1
0
class Anonymization(Base):

	def __init__(self, request, is_celery):
		self.privacy_level = request['privacy_level']
		self.epsilon = float(request['epsilon'])
		self.min_freq = float(request['min_freq']) if 'min_freq' in request.keys() else 0.
		self.exp_round = request['exp_round'] if 'exp_round' in request.keys() else None

		self.dp_id = request['dp_id']
		task = get_object_or_404(Task, pk = request['task_id']) if is_celery else request['task_id']
		self.task_id = task.task_id
		self.eps1_level = task.eps1_level
		self.data_path = task.data_path

		self.jtree_strct = ast.literal_eval(str(task.jtree_strct))
		self.opted_cluster = ast.literal_eval(str(task.opted_cluster))
		
		self.edges = ast.literal_eval(str(task.dep_graph))
		self.domain = task.domain if isinstance(task.domain, dict) else collections.OrderedDict(ast.literal_eval(task.domain)) # This is the corsed domain
		self.valbin_map = ast.literal_eval(str(task.valbin_map))
		self.selected_attrs = task.selected_attrs if isinstance(task.selected_attrs, dict) else self.convert_selected_attrs(task.selected_attrs)
		self.white_list = ast.literal_eval(str(task.white_list))
		self.nodes = self.domain.keys()

		self.histogramdds = None
		self.data = None
		self.sim_df = None
		self.statistics_err = None
		self.generalized_dataframe = None
		self.synthetic_path = None



	def get_coarse_data(self):
		# TODO: Read coarse data from memory cach.
		# TODO: To deal with failure.
		folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
		file_path = os.path.join(folder,c.COARSE_DATA_NAME)
		self.data = DataUtils(
			file_path = file_path, 
			valbin_maps = self.valbin_map,
			selected_attrs = self.selected_attrs
		)

	def kaggregate(self):
		if self.min_freq > 0:
			# cluster_num = len(self.jtree_strct)
			# thresh = self.get_freq_thresh(epsilon, cluster_num, min_freq)
			thresh = self.min_freq
			self.data.aggregation(thresh)
			self.domain = self.data.get_domain()
			self.valbin_map = self.data.get_valbin_maps()

	def get_histograms(self):
		combined_queries = self.combine_cliques_for_query(self.jtree_strct, self.opted_cluster)
		stats_func = StatsFunctions()
		self.histogramdds = stats_func.histogramdd_batch(self.data, combined_queries)

	def do_inference(self):
		inference = Inference(
			self.data, 
			self.get_jtree_file_path(self.task_id, self.eps1_level), 
			self.domain, 
			self.opted_cluster,
			self.histogramdds,
			self.epsilon)

		self.model = inference.execute()

	def simulate(self):
		simulator = Simulate(self.model, self.data.get_nrows())
		self.sim_df = simulator.run()

	def get_statistical_error(self):
		"""
		Compute the mean and standard varience error rates(Both coarse data).
		Parameters
			task_id:
				The task id to retrieve coarsed data.
			sim_coarsed_df:
				The noised sythetic data.
		Returns
			{
				"A":0.05,
				"B":0.12,
				...
			}
		"""
		eps1 = self.eps1_level
		eps2 = self.epsilon
		white_list = self.white_list
		k = self.min_freq
		nodes = self.nodes
		# read the original coarse data first.
		coarsed_df = self.data.get_pandas_df()

		# make sure the order
		sim_coarsed_df = self.sim_df[self.nodes]
		
		coarsed_df_mean = np.array(coarsed_df.mean(), dtype = float)
		coarsed_df_std = np.array(coarsed_df.std(), dtype = float)

		sim_df_mean = np.array(sim_coarsed_df.mean(), dtype = float)
		sim_df_std = np.array(sim_coarsed_df.std(), dtype = float)

		mean_error = np.abs((sim_df_mean - coarsed_df_mean)*100 / coarsed_df_mean)
		std_error = np.abs((sim_df_std - coarsed_df_std)*100 / coarsed_df_std)

		mean_error = [str(rate)+'%' for rate in np.round(mean_error, decimals = 2)]
		std_error = [str(rate)+'%' for rate in np.round(std_error, decimals = 2)]

		self.print_pretty_summary(nodes, mean_error, std_error, eps1, eps2, white_list, k)
		self.statistics_err = {
			'attrs':nodes,
			'measures':['mean', 'std'],
			'values':{
				'mean':mean_error,
				'std':std_error
			}
		}

	def data_generalize(self):
		data = DataUtils(pandas_df=self.sim_df, valbin_maps = self.valbin_map, selected_attrs = self.selected_attrs)
		data.data_generalize()
		self.generalized_dataframe = data.get_pandas_df()

	def save_data(self):
		if self.exp_round:
			self.synthetic_path = self.save_sim_data_exp()
		else:
			self.synthetic_path = self.save_sim_data()

	def save_sim_data(self, spec_file_name = None):
		# TODO: to deal with failure
		folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
		if not os.path.exists(folder):
			os.makedirs(folder)

		file_name = c.SIM_DATA_NAME_PATTERN % {'privacy_level': self.privacy_level}
		if spec_file_name is not None:
			file_name = spec_file_name
			# TODO: a parameter to specify no header output
			file_path = os.path.join(folder,file_name)
			self.generalized_dataframe.to_csv(file_path, index = False, header = False)
		else:
			file_path = os.path.join(folder,file_name)
			self.generalized_dataframe.to_csv(file_path, index = False)

		return c.SIM_DATA_URI_PATTERN % {'task_id':self.task_id, 'file_name':file_name}

	def save_sim_data_exp(self):
		spec_file_name = "sim_eps1lv_%(eps_lv)s_eps2lv_%(privacy_level)s_k_%(min_freq)s_round_%(exp_round)s.csv" % {
				'exp_round': self.exp_round,
				'privacy_level': self.privacy_level,
				'eps_lv': self.eps1_level,
				'min_freq': int(self.min_freq)
		}
		return self.save_sim_data(spec_file_name = spec_file_name)

	def print_pretty_summary(self, nodes, mean_error, std_error, eps1, eps2, white_list, k):
		LOG = Base.get_logger("Statical Accuracy Summary")
		import pandas as pd
		frame = pd.DataFrame({
			'Attribures': nodes,
			'Mean': mean_error,
			'STD': std_error
			})
		LOG.info("eps1: %.2f, eps2: %.2f" % (eps1, eps2))
		LOG.info("White List: %s" % str(white_list))
		LOG.info("k-aggregate value: %d" % k)
		LOG.info('\n'+str(frame))

	def create_instance(self):
		Job.objects.create(
			task_id = self.task_id,
			privacy_level = self.privacy_level,
			epsilon = self.epsilon,
			synthetic_path = self.synthetic_path,
			statistics_err = self.statistics_err
		)

	def update_instance(self, status, is_celery):
		if not is_celery:
			return

		instance = get_object_or_404(Job, pk = self.dp_id)
		instance.synthetic_path = self.synthetic_path
		instance.statistics_err = self.statistics_err
		instance.status = ProcessStatus.get_code(status)
		instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
		instance.save()
예제 #2
0
class Anonymization(Base):
    def __init__(self, request, is_celery):
        self.privacy_level = request['privacy_level']
        self.epsilon = float(request['epsilon'])
        self.min_freq = float(
            request['min_freq']) if 'min_freq' in request.keys() else 0.
        self.exp_round = request['exp_round'] if 'exp_round' in request.keys(
        ) else None

        self.dp_id = request['dp_id']
        task = get_object_or_404(
            Task, pk=request['task_id']) if is_celery else request['task_id']
        self.task_id = task.task_id
        self.eps1_level = task.eps1_level
        self.data_path = task.data_path

        self.jtree_strct = ast.literal_eval(str(task.jtree_strct))
        self.opted_cluster = ast.literal_eval(str(task.opted_cluster))

        self.edges = ast.literal_eval(str(task.dep_graph))
        self.domain = task.domain if isinstance(
            task.domain, dict) else collections.OrderedDict(
                ast.literal_eval(task.domain))  # This is the corsed domain
        self.valbin_map = ast.literal_eval(str(task.valbin_map))
        self.selected_attrs = task.selected_attrs if isinstance(
            task.selected_attrs, dict) else self.convert_selected_attrs(
                task.selected_attrs)
        self.white_list = ast.literal_eval(str(task.white_list))
        self.nodes = self.domain.keys()

        self.histogramdds = None
        self.data = None
        self.sim_df = None
        self.statistics_err = None
        self.generalized_dataframe = None
        self.synthetic_path = None

    def get_coarse_data(self):
        # TODO: Read coarse data from memory cach.
        # TODO: To deal with failure.
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        file_path = os.path.join(folder, c.COARSE_DATA_NAME)
        self.data = DataUtils(file_path=file_path,
                              valbin_maps=self.valbin_map,
                              selected_attrs=self.selected_attrs)

    def kaggregate(self):
        if self.min_freq > 0:
            # cluster_num = len(self.jtree_strct)
            # thresh = self.get_freq_thresh(epsilon, cluster_num, min_freq)
            thresh = self.min_freq
            self.data.aggregation(thresh)
            self.domain = self.data.get_domain()
            self.valbin_map = self.data.get_valbin_maps()

    def get_histograms(self):
        combined_queries = self.combine_cliques_for_query(
            self.jtree_strct, self.opted_cluster)
        stats_func = StatsFunctions()
        self.histogramdds = stats_func.histogramdd_batch(
            self.data, combined_queries)

    def do_inference(self):
        inference = Inference(
            self.data, self.get_jtree_file_path(self.task_id, self.eps1_level),
            self.domain, self.opted_cluster, self.histogramdds, self.epsilon)

        self.model = inference.execute()

    def simulate(self):
        simulator = Simulate(self.model, self.data.get_nrows())
        self.sim_df = simulator.run()

    def get_statistical_error(self):
        """
		Compute the mean and standard varience error rates(Both coarse data).
		Parameters
			task_id:
				The task id to retrieve coarsed data.
			sim_coarsed_df:
				The noised sythetic data.
		Returns
			{
				"A":0.05,
				"B":0.12,
				...
			}
		"""
        eps1 = self.eps1_level
        eps2 = self.epsilon
        white_list = self.white_list
        k = self.min_freq
        nodes = self.nodes
        # read the original coarse data first.
        coarsed_df = self.data.get_pandas_df()

        # make sure the order
        sim_coarsed_df = self.sim_df[self.nodes]

        coarsed_df_mean = np.array(coarsed_df.mean(), dtype=float)
        coarsed_df_std = np.array(coarsed_df.std(), dtype=float)

        sim_df_mean = np.array(sim_coarsed_df.mean(), dtype=float)
        sim_df_std = np.array(sim_coarsed_df.std(), dtype=float)

        mean_error = np.abs(
            (sim_df_mean - coarsed_df_mean) * 100 / coarsed_df_mean)
        std_error = np.abs(
            (sim_df_std - coarsed_df_std) * 100 / coarsed_df_std)

        mean_error = [
            str(rate) + '%' for rate in np.round(mean_error, decimals=2)
        ]
        std_error = [
            str(rate) + '%' for rate in np.round(std_error, decimals=2)
        ]

        self.print_pretty_summary(nodes, mean_error, std_error, eps1, eps2,
                                  white_list, k)
        self.statistics_err = {
            'attrs': nodes,
            'measures': ['mean', 'std'],
            'values': {
                'mean': mean_error,
                'std': std_error
            }
        }

    def data_generalize(self):
        data = DataUtils(pandas_df=self.sim_df,
                         valbin_maps=self.valbin_map,
                         selected_attrs=self.selected_attrs)
        data.data_generalize()
        self.generalized_dataframe = data.get_pandas_df()

    def save_data(self):
        if self.exp_round:
            self.synthetic_path = self.save_sim_data_exp()
        else:
            self.synthetic_path = self.save_sim_data()

    def save_sim_data(self, spec_file_name=None):
        # TODO: to deal with failure
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        if not os.path.exists(folder):
            os.makedirs(folder)

        file_name = c.SIM_DATA_NAME_PATTERN % {
            'privacy_level': self.privacy_level
        }
        if spec_file_name is not None:
            file_name = spec_file_name
            # TODO: a parameter to specify no header output
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path,
                                              index=False,
                                              header=False)
        else:
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path, index=False)

        return c.SIM_DATA_URI_PATTERN % {
            'task_id': self.task_id,
            'file_name': file_name
        }

    def save_sim_data_exp(self):
        spec_file_name = "sim_eps1lv_%(eps_lv)s_eps2lv_%(privacy_level)s_k_%(min_freq)s_round_%(exp_round)s.csv" % {
            'exp_round': self.exp_round,
            'privacy_level': self.privacy_level,
            'eps_lv': self.eps1_level,
            'min_freq': int(self.min_freq)
        }
        return self.save_sim_data(spec_file_name=spec_file_name)

    def print_pretty_summary(self, nodes, mean_error, std_error, eps1, eps2,
                             white_list, k):
        LOG = Base.get_logger("Statical Accuracy Summary")
        import pandas as pd
        frame = pd.DataFrame({
            'Attribures': nodes,
            'Mean': mean_error,
            'STD': std_error
        })
        LOG.info("eps1: %.2f, eps2: %.2f" % (eps1, eps2))
        LOG.info("White List: %s" % str(white_list))
        LOG.info("k-aggregate value: %d" % k)
        LOG.info('\n' + str(frame))

    def create_instance(self):
        Job.objects.create(task_id=self.task_id,
                           privacy_level=self.privacy_level,
                           epsilon=self.epsilon,
                           synthetic_path=self.synthetic_path,
                           statistics_err=self.statistics_err)

    def update_instance(self, status, is_celery):
        if not is_celery:
            return

        instance = get_object_or_404(Job, pk=self.dp_id)
        instance.synthetic_path = self.synthetic_path
        instance.statistics_err = self.statistics_err
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()
예제 #3
0
class Preprocess(Base):
	def __init__(self, request):

		self.chunk_size = request['chunk_size'] if 'chunk_size' in request.keys() else -1
		self.coarse_data_path = None
		self.data = None
		self.data_path = request['data_path']
		self.date_format = request['selected_attrs']['date_format'] if 'date_format' in request['selected_attrs'].keys() else None
		self.dep_graph = None # original edges

		self.domain = None
		self.edges = None
		self.eps1_val = request['eps1_val'] if 'eps1_val' in request.keys() else c.EPSILON_1
		self.eps1_level = request['eps1_level'] if 'eps1_level' in request.keys() else 1
		self.jtree_strct = None
		self.jtree_file_path = None
		
		self.names = request['names'] if 'names' in request.keys() else None
		self.nodes = None
		self.opted_cluster = None
		self.selected_attrs = self.convert_selected_attrs(request['selected_attrs'])
		self.specified_c_domain = request['selected_attrs']['specified_c_domain'] if 'specified_c_domain' in request['selected_attrs'].keys() else None
		self.task_id = request['task_id']
		self.task_folder = self.create_task_folder(self.task_id)
		self.valbin_map = None

		self.white_list = self.get_white_list(request)

	def read_data(self):
		self.data = DataUtils(
			file_path = self.data_path, 
			selected_attrs = self.selected_attrs,
			names = self.names,
			specified_c_domain = self.specified_c_domain,
			chunk_size = self.chunk_size,
			date_format = self.date_format
		)

	def coarse(self):
		self.data.data_coarsilize()
		self.domain = self.data.get_domain()
		self.nodes = self.data.get_nodes_name()
		self.valbin_map = str(self.data.get_valbin_maps())

	def build_dep_graph(self):
		# dependency graph
		dep_graph_obj = DependencyGraph(self.data, eps1_val = self.eps1_val)
		self.edges = dep_graph_obj.get_dep_edges(display = True)
		self.cust_edges = dep_graph_obj.set_white_list(self.white_list) \
							.get_dep_edges(display = True)
		self.dep_graph = str(self.edges)

	def get_white_list(self, request):
		white_list = request['white_list'] if 'white_list' in request.keys() and len(request['white_list']) > 0 else "[]"
		if not isinstance(white_list, list):
			white_list = ast.literal_eval(white_list)
		return white_list

	def build_jtree(self):
		# junction tree
		jtree = JunctionTree(
			self.cust_edges, 
			self.nodes, 
			self.get_jtree_file_path(self.task_id, self.eps1_level), # the path to save junction tree file
		)

		# optimize marginal
		var_reduce = VarianceReduce(self.domain, jtree.get_jtree()['cliques'], 0.2)
		self.opted_cluster = var_reduce.main()
		self.jtree_strct = jtree.get_jtree()['cliques']
		self.jtree_file_path = self.save_merged_jtree(self.task_id, self.eps1_level, self.jtree_strct)

	def save_coarse_data(self):
		# TODO: to deal with failure
		file_path = os.path.join(self.task_folder,c.COARSE_DATA_NAME)
		if self.data is not None:
			self.data.save(file_path)
			self.coarse_data_path = file_path

	def update_instance(self, status, is_celery):
		if not is_celery:
			return

		instance = get_object_or_404(Task, pk = self.task_id)
		instance.eps1_val = self.eps1_val
		instance.eps1_level = self.eps1_level
		instance.dep_graph = str(self.edges)
		instance.valbin_map = str(self.valbin_map)
		instance.domain = str(self.domain.items()) if self.domain is not None else None
		instance.white_list = self.white_list
		instance.jtree_strct = str(self.jtree_strct)
		instance.opted_cluster = str(self.opted_cluster)
		instance.status = ProcessStatus.get_code(status)
		instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
		instance.save()
예제 #4
0
class Preprocess(Base):
    def __init__(self, request):

        self.chunk_size = request[
            'chunk_size'] if 'chunk_size' in request.keys() else -1
        self.coarse_data_path = None
        self.data = None
        self.data_path = request['data_path']
        self.date_format = request['selected_attrs'][
            'date_format'] if 'date_format' in request['selected_attrs'].keys(
            ) else None
        self.dep_graph = None  # original edges

        self.domain = None
        self.edges = None
        self.eps1_val = request['eps1_val'] if 'eps1_val' in request.keys(
        ) else c.EPSILON_1
        self.eps1_level = request[
            'eps1_level'] if 'eps1_level' in request.keys() else 1
        self.jtree_strct = None
        self.jtree_file_path = None

        self.names = request['names'] if 'names' in request.keys() else None
        self.nodes = None
        self.opted_cluster = None
        self.selected_attrs = self.convert_selected_attrs(
            request['selected_attrs'])
        self.specified_c_domain = request['selected_attrs'][
            'specified_c_domain'] if 'specified_c_domain' in request[
                'selected_attrs'].keys() else None
        self.task_id = request['task_id']
        self.task_folder = self.create_task_folder(self.task_id)
        self.valbin_map = None

        self.white_list = self.get_white_list(request)

    def read_data(self):
        self.data = DataUtils(file_path=self.data_path,
                              selected_attrs=self.selected_attrs,
                              names=self.names,
                              specified_c_domain=self.specified_c_domain,
                              chunk_size=self.chunk_size,
                              date_format=self.date_format)

    def coarse(self):
        self.data.data_coarsilize()
        self.domain = self.data.get_domain()
        self.nodes = self.data.get_nodes_name()
        self.valbin_map = str(self.data.get_valbin_maps())

    def build_dep_graph(self):
        # dependency graph
        dep_graph_obj = DependencyGraph(self.data, eps1_val=self.eps1_val)
        self.edges = dep_graph_obj.get_dep_edges(display=True)
        self.cust_edges = dep_graph_obj.set_white_list(self.white_list) \
             .get_dep_edges(display = True)
        self.dep_graph = str(self.edges)

    def get_white_list(self, request):
        white_list = request['white_list'] if 'white_list' in request.keys(
        ) and len(request['white_list']) > 0 else "[]"
        if not isinstance(white_list, list):
            white_list = ast.literal_eval(white_list)
        return white_list

    def build_jtree(self):
        # junction tree
        jtree = JunctionTree(
            self.cust_edges,
            self.nodes,
            self.get_jtree_file_path(
                self.task_id,
                self.eps1_level),  # the path to save junction tree file
        )

        # optimize marginal
        var_reduce = VarianceReduce(self.domain,
                                    jtree.get_jtree()['cliques'], 0.2)
        self.opted_cluster = var_reduce.main()
        self.jtree_strct = jtree.get_jtree()['cliques']
        self.jtree_file_path = self.save_merged_jtree(self.task_id,
                                                      self.eps1_level,
                                                      self.jtree_strct)

    def save_coarse_data(self):
        # TODO: to deal with failure
        file_path = os.path.join(self.task_folder, c.COARSE_DATA_NAME)
        if self.data is not None:
            self.data.save(file_path)
            self.coarse_data_path = file_path

    def update_instance(self, status, is_celery):
        if not is_celery:
            return

        instance = get_object_or_404(Task, pk=self.task_id)
        instance.eps1_val = self.eps1_val
        instance.eps1_level = self.eps1_level
        instance.dep_graph = str(self.edges)
        instance.valbin_map = str(self.valbin_map)
        instance.domain = str(
            self.domain.items()) if self.domain is not None else None
        instance.white_list = self.white_list
        instance.jtree_strct = str(self.jtree_strct)
        instance.opted_cluster = str(self.opted_cluster)
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()