class JunctionTreeTests(TestCase):
    def setUp(self):
        self.data = DataUtils(TESTING_FILE)
        self.dep_graph = DependencyGraph(self.data)
        self.edges = self.dep_graph.get_dep_edges()
        self.nodes = self.data.get_nodes_name()
        self.jtree_path = c.TEST_JTREE_FILE_PATH

    def test_jtree_without_noise(self):
        dep_graph = DependencyGraph(self.data, noise_flag=False)
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        self.assertEqual(
            cliques == [['HTN', 'Height'], ['HTN', 'Weight'],
                        ['Income', 'TRV'], ['Age'], ['DGF']], True)

    def test_jtree_with_white_list(self):
        dep_graph = DependencyGraph(self.data,
                                    white_list=[['Age', 'Income', 'TRV'],
                                                ['DGF', 'HTN']])
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        self.assertEqual(
            cliques == [['HTN', 'Height'], ['HTN', 'Weight'], ['HTN', 'DGF'],
                        ['Income', 'TRV', 'Age']], True)

    def test_build_jtree_then_check_jtree_file(self):
        self.TestA()
        self.TestB()

    def TestA(self):
        """
		The dependency graph is a complete graph, 
		so there is only one clique in the junction tree
		"""
        jtree = JunctionTree(self.edges, self.nodes, self.jtree_path)
        jtreepy = jtree.get_jtree()
        #print jtreepy
        self.assertEqual(len(jtreepy) == 3, True)

    def TestB(self):
        import os, time
        from stat import *
        st = os.stat(self.jtree_path)
        now = time.time()
        # TODO: Need to know this file is new modified
        #self.assertEqual((st.st_mtime - now) < 100000, True)
class JunctionTreeTests(TestCase):

	def setUp(self):
		self.data = DataUtils(TESTING_FILE)
		self.dep_graph = DependencyGraph(self.data)
		self.edges = self.dep_graph.get_dep_edges()
		self.nodes = self.data.get_nodes_name()
		self.jtree_path = c.TEST_JTREE_FILE_PATH

	def test_jtree_without_noise(self):
		dep_graph = DependencyGraph(self.data, noise_flag = False)
		edges = dep_graph.get_dep_edges()
		jtree = JunctionTree(edges, self.nodes, self.jtree_path)
		cliques = jtree.get_jtree()['cliques']
		self.assertEqual(cliques == [['HTN', 'Height'], ['HTN', 'Weight'], ['Income', 'TRV'], ['Age'], ['DGF']], True)

	def test_jtree_with_white_list(self):
		dep_graph = DependencyGraph(self.data, white_list = [['Age', 'Income', 'TRV'], ['DGF', 'HTN']])
		edges = dep_graph.get_dep_edges()
		jtree = JunctionTree(edges, self.nodes, self.jtree_path)
		cliques = jtree.get_jtree()['cliques']
		self.assertEqual(cliques == [['HTN', 'Height'], ['HTN', 'Weight'], ['HTN', 'DGF'], ['Income', 'TRV', 'Age']], True)

	def test_build_jtree_then_check_jtree_file(self):
		self.TestA()
		self.TestB()

	def TestA(self):
		"""
		The dependency graph is a complete graph, 
		so there is only one clique in the junction tree
		"""
		jtree = JunctionTree(self.edges, self.nodes, self.jtree_path)
		jtreepy = jtree.get_jtree()
		#print jtreepy
		self.assertEqual(len(jtreepy) == 3, True)

	def TestB(self):
		import os, time
		from stat import *
		st = os.stat(self.jtree_path)
		now = time.time()
		# TODO: Need to know this file is new modified
		#self.assertEqual((st.st_mtime - now) < 100000, True)
class Preprocess(Base):
	def __init__(self, request):

		self.chunk_size = request['chunk_size'] if 'chunk_size' in request.keys() else -1
		self.coarse_data_path = None
		self.data = None
		self.data_path = request['data_path']
		self.date_format = request['selected_attrs']['date_format'] if 'date_format' in request['selected_attrs'].keys() else None
		self.dep_graph = None # original edges

		self.domain = None
		self.edges = None
		self.eps1_val = request['eps1_val'] if 'eps1_val' in request.keys() else c.EPSILON_1
		self.eps1_level = request['eps1_level'] if 'eps1_level' in request.keys() else 1
		self.jtree_strct = None
		self.jtree_file_path = None
		
		self.names = request['names'] if 'names' in request.keys() else None
		self.nodes = None
		self.opted_cluster = None
		self.selected_attrs = self.convert_selected_attrs(request['selected_attrs'])
		self.specified_c_domain = request['selected_attrs']['specified_c_domain'] if 'specified_c_domain' in request['selected_attrs'].keys() else None
		self.task_id = request['task_id']
		self.task_folder = self.create_task_folder(self.task_id)
		self.valbin_map = None

		self.white_list = self.get_white_list(request)

	def read_data(self):
		self.data = DataUtils(
			file_path = self.data_path, 
			selected_attrs = self.selected_attrs,
			names = self.names,
			specified_c_domain = self.specified_c_domain,
			chunk_size = self.chunk_size,
			date_format = self.date_format
		)

	def coarse(self):
		self.data.data_coarsilize()
		self.domain = self.data.get_domain()
		self.nodes = self.data.get_nodes_name()
		self.valbin_map = str(self.data.get_valbin_maps())

	def build_dep_graph(self):
		# dependency graph
		dep_graph_obj = DependencyGraph(self.data, eps1_val = self.eps1_val)
		self.edges = dep_graph_obj.get_dep_edges(display = True)
		self.cust_edges = dep_graph_obj.set_white_list(self.white_list) \
							.get_dep_edges(display = True)
		self.dep_graph = str(self.edges)

	def get_white_list(self, request):
		white_list = request['white_list'] if 'white_list' in request.keys() and len(request['white_list']) > 0 else "[]"
		if not isinstance(white_list, list):
			white_list = ast.literal_eval(white_list)
		return white_list

	def build_jtree(self):
		# junction tree
		jtree = JunctionTree(
			self.cust_edges, 
			self.nodes, 
			self.get_jtree_file_path(self.task_id, self.eps1_level), # the path to save junction tree file
		)

		# optimize marginal
		var_reduce = VarianceReduce(self.domain, jtree.get_jtree()['cliques'], 0.2)
		self.opted_cluster = var_reduce.main()
		self.jtree_strct = jtree.get_jtree()['cliques']
		self.jtree_file_path = self.save_merged_jtree(self.task_id, self.eps1_level, self.jtree_strct)

	def save_coarse_data(self):
		# TODO: to deal with failure
		file_path = os.path.join(self.task_folder,c.COARSE_DATA_NAME)
		if self.data is not None:
			self.data.save(file_path)
			self.coarse_data_path = file_path

	def update_instance(self, status, is_celery):
		if not is_celery:
			return

		instance = get_object_or_404(Task, pk = self.task_id)
		instance.eps1_val = self.eps1_val
		instance.eps1_level = self.eps1_level
		instance.dep_graph = str(self.edges)
		instance.valbin_map = str(self.valbin_map)
		instance.domain = str(self.domain.items()) if self.domain is not None else None
		instance.white_list = self.white_list
		instance.jtree_strct = str(self.jtree_strct)
		instance.opted_cluster = str(self.opted_cluster)
		instance.status = ProcessStatus.get_code(status)
		instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
		instance.save()
class DataUtilitiesTests(TestCase):
# TODO: The Data Coarse and Generalize step should seperate, to simulate a more real case.

	def setUp(self):
		self.selected_attrs = dict({
			'Age':'C', 
			'workclass':'D',
			'fnlwgt':'C',
			'education':'D',
			'education_num':'D',
			'marital_status':'D',
			'occupation':'D',
			'relationship':'D',
			'race':'D',
			'sex':'D',
			'capital_gain':'C',
			'capital_loss':'C',
			'hours_per_week':'C',
			'native_country':'D',
			'salary_class':'D'
		})		
		self.data = DataUtils(file_path = TESTING_FILE, selected_attrs = self.selected_attrs)
		self.data.data_coarsilize()

		
		self.base = Base()

	def test_data_preview(self):
		data = DataUtils(file_path = TESTING_FILE)
		preview = data.data_preview()
		self.assertEqual(len(preview.values[0]) > 0, True)

	def test_read_data_by_three_selected_column(self):
		"""
		Test the read data by user specified columns
		"""
		self.assertEqual(len(self.data.get_nodes_name()) == len(self.selected_attrs), True)

	def test_data_domain_keep_original_order(self):
		"""
		Test the order in domain object is in same order with 
		original raw data.
		"""
		df = self.data.get_pandas_df()
		domain = self.data.get_domain()
		cols = domain.keys()
		self.assertEqual(cols == list(df.columns.values), True)

	def test_data_coarsilization(self):
		print self.data.get_pandas_df()[:5]

	def test_data_generalization(self):
		self.data.data_generalize()
		print self.data.get_pandas_df()[:5]

	def test_is_skip_pre_processing_with_create(self):
		create_flag = True
		request = {
			'data_path':'/path/to/dummy/file.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['C', 'C', 'C', 'D']
			}
		}
		instance = {
			'data_path':'/path/to/dummy/file.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['C', 'C', 'C', 'D']
			}
		}
		skip_pre_process = self.base.is_pre_process_skip(request, request, create_flag)
		self.assertEqual(skip_pre_process == False, True)

	def test_is_skip_pre_processing_with_data_path_change(self):

		create_flag = False
		request = {
			'data_path':'/path/to/dummy/file.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['C', 'C', 'C', 'D']
			}
		}
		instance = {
			'data_path':'/path/to/dummy/file22222222.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['C', 'C', 'C', 'D']
			}
		}
		
		skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
		print skip_pre_process
		self.assertEqual(skip_pre_process == False, True)

	def test_is_skip_pre_processing_with_selected_attr_change(self):

		create_flag = False
		request = {
			'data_path':'/path/to/dummy/file.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['C', 'C', 'C', 'D']
			}
		}
		instance = {
			'data_path':'/path/to/dummy/file.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['D', 'D', 'D', 'D']
			}
		}
		skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
		print skip_pre_process
		self.assertEqual(skip_pre_process == False, True)

	def test_is_skip_pre_processing_without_change(self):
		
		create_flag = False
		request = {
			'data_path':'/path/to/dummy/file.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['C', 'C', 'C', 'D']
			}
		}
		instance = {
			'data_path':'/path/to/dummy/file.csv',
			'selected_attrs':{
				'names':['A', 'D', 'C', 'B'],
				'types':['C', 'C', 'C', 'D']
			}
		}
		skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
		self.assertEqual(skip_pre_process == True, True)
class DataUtilitiesTests(TestCase):
    # TODO: The Data Coarse and Generalize step should seperate, to simulate a more real case.

    def setUp(self):
        self.selected_attrs = dict({
            'Age': 'C',
            'workclass': 'D',
            'fnlwgt': 'C',
            'education': 'D',
            'education_num': 'D',
            'marital_status': 'D',
            'occupation': 'D',
            'relationship': 'D',
            'race': 'D',
            'sex': 'D',
            'capital_gain': 'C',
            'capital_loss': 'C',
            'hours_per_week': 'C',
            'native_country': 'D',
            'salary_class': 'D'
        })
        self.data = DataUtils(file_path=TESTING_FILE,
                              selected_attrs=self.selected_attrs)
        self.data.data_coarsilize()

        self.base = Base()

    def test_data_preview(self):
        data = DataUtils(file_path=TESTING_FILE)
        preview = data.data_preview()
        self.assertEqual(len(preview.values[0]) > 0, True)

    def test_read_data_by_three_selected_column(self):
        """
		Test the read data by user specified columns
		"""
        self.assertEqual(
            len(self.data.get_nodes_name()) == len(self.selected_attrs), True)

    def test_data_domain_keep_original_order(self):
        """
		Test the order in domain object is in same order with 
		original raw data.
		"""
        df = self.data.get_pandas_df()
        domain = self.data.get_domain()
        cols = domain.keys()
        self.assertEqual(cols == list(df.columns.values), True)

    def test_data_coarsilization(self):
        print self.data.get_pandas_df()[:5]

    def test_data_generalization(self):
        self.data.data_generalize()
        print self.data.get_pandas_df()[:5]

    def test_is_skip_pre_processing_with_create(self):
        create_flag = True
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, request, create_flag)
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_data_path_change(self):

        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file22222222.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }

        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_selected_attr_change(self):

        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['D', 'D', 'D', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_without_change(self):

        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        self.assertEqual(skip_pre_process == True, True)
Exemple #6
0
class Preprocess(Base):
    def __init__(self, request):

        self.chunk_size = request[
            'chunk_size'] if 'chunk_size' in request.keys() else -1
        self.coarse_data_path = None
        self.data = None
        self.data_path = request['data_path']
        self.date_format = request['selected_attrs'][
            'date_format'] if 'date_format' in request['selected_attrs'].keys(
            ) else None
        self.dep_graph = None  # original edges

        self.domain = None
        self.edges = None
        self.eps1_val = request['eps1_val'] if 'eps1_val' in request.keys(
        ) else c.EPSILON_1
        self.eps1_level = request[
            'eps1_level'] if 'eps1_level' in request.keys() else 1
        self.jtree_strct = None
        self.jtree_file_path = None

        self.names = request['names'] if 'names' in request.keys() else None
        self.nodes = None
        self.opted_cluster = None
        self.selected_attrs = self.convert_selected_attrs(
            request['selected_attrs'])
        self.specified_c_domain = request['selected_attrs'][
            'specified_c_domain'] if 'specified_c_domain' in request[
                'selected_attrs'].keys() else None
        self.task_id = request['task_id']
        self.task_folder = self.create_task_folder(self.task_id)
        self.valbin_map = None

        self.white_list = self.get_white_list(request)

    def read_data(self):
        self.data = DataUtils(file_path=self.data_path,
                              selected_attrs=self.selected_attrs,
                              names=self.names,
                              specified_c_domain=self.specified_c_domain,
                              chunk_size=self.chunk_size,
                              date_format=self.date_format)

    def coarse(self):
        self.data.data_coarsilize()
        self.domain = self.data.get_domain()
        self.nodes = self.data.get_nodes_name()
        self.valbin_map = str(self.data.get_valbin_maps())

    def build_dep_graph(self):
        # dependency graph
        dep_graph_obj = DependencyGraph(self.data, eps1_val=self.eps1_val)
        self.edges = dep_graph_obj.get_dep_edges(display=True)
        self.cust_edges = dep_graph_obj.set_white_list(self.white_list) \
             .get_dep_edges(display = True)
        self.dep_graph = str(self.edges)

    def get_white_list(self, request):
        white_list = request['white_list'] if 'white_list' in request.keys(
        ) and len(request['white_list']) > 0 else "[]"
        if not isinstance(white_list, list):
            white_list = ast.literal_eval(white_list)
        return white_list

    def build_jtree(self):
        # junction tree
        jtree = JunctionTree(
            self.cust_edges,
            self.nodes,
            self.get_jtree_file_path(
                self.task_id,
                self.eps1_level),  # the path to save junction tree file
        )

        # optimize marginal
        var_reduce = VarianceReduce(self.domain,
                                    jtree.get_jtree()['cliques'], 0.2)
        self.opted_cluster = var_reduce.main()
        self.jtree_strct = jtree.get_jtree()['cliques']
        self.jtree_file_path = self.save_merged_jtree(self.task_id,
                                                      self.eps1_level,
                                                      self.jtree_strct)

    def save_coarse_data(self):
        # TODO: to deal with failure
        file_path = os.path.join(self.task_folder, c.COARSE_DATA_NAME)
        if self.data is not None:
            self.data.save(file_path)
            self.coarse_data_path = file_path

    def update_instance(self, status, is_celery):
        if not is_celery:
            return

        instance = get_object_or_404(Task, pk=self.task_id)
        instance.eps1_val = self.eps1_val
        instance.eps1_level = self.eps1_level
        instance.dep_graph = str(self.edges)
        instance.valbin_map = str(self.valbin_map)
        instance.domain = str(
            self.domain.items()) if self.domain is not None else None
        instance.white_list = self.white_list
        instance.jtree_strct = str(self.jtree_strct)
        instance.opted_cluster = str(self.opted_cluster)
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()