class JunctionTreeTests(TestCase):
    """Tests for building a junction tree from dependency-graph edges."""

    def setUp(self):
        self.data = DataUtils(TESTING_FILE)
        self.dep_graph = DependencyGraph(self.data)
        self.edges = self.dep_graph.get_dep_edges()
        self.nodes = self.data.get_nodes_name()
        self.jtree_path = c.TEST_JTREE_FILE_PATH

    def test_jtree_without_noise(self):
        # With noise disabled the dependency edges are deterministic, so the
        # resulting cliques can be compared against a fixed expectation.
        dep_graph = DependencyGraph(self.data, noise_flag=False)
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        # Compare the values directly: assertEqual(a, b) reports a useful
        # diff on failure, unlike assertEqual(a == b, True).
        self.assertEqual(
            cliques,
            [['HTN', 'Height'], ['HTN', 'Weight'], ['Income', 'TRV'],
             ['Age'], ['DGF']])

    def test_jtree_with_white_list(self):
        # White-listed attribute groups must be forced into shared cliques.
        dep_graph = DependencyGraph(
            self.data, white_list=[['Age', 'Income', 'TRV'], ['DGF', 'HTN']])
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        self.assertEqual(
            cliques,
            [['HTN', 'Height'], ['HTN', 'Weight'], ['HTN', 'DGF'],
             ['Income', 'TRV', 'Age']])

    def test_build_jtree_then_check_jtree_file(self):
        # TestA builds the junction-tree file that TestB then inspects, so
        # the two helpers are driven in order from a single test method.
        self.TestA()
        self.TestB()

    def TestA(self):
        """The dependency graph is a complete graph, so there is only one
        clique in the junction tree."""
        jtree = JunctionTree(self.edges, self.nodes, self.jtree_path)
        jtreepy = jtree.get_jtree()
        self.assertEqual(len(jtreepy), 3)

    def TestB(self):
        import os
        # os.stat raises if the junction-tree file was never written, which
        # is the only check currently performed.
        st = os.stat(self.jtree_path)
        # TODO: assert the file is freshly modified, e.g. by comparing
        # st.st_mtime against a timestamp captured before the build.
class JunctionTreeTests(TestCase):
    """Tests for building a junction tree from dependency-graph edges."""

    def setUp(self):
        self.data = DataUtils(TESTING_FILE)
        self.dep_graph = DependencyGraph(self.data)
        self.edges = self.dep_graph.get_dep_edges()
        self.nodes = self.data.get_nodes_name()
        self.jtree_path = c.TEST_JTREE_FILE_PATH

    def test_jtree_without_noise(self):
        # With noise disabled the dependency edges are deterministic, so the
        # resulting cliques can be compared against a fixed expectation.
        dep_graph = DependencyGraph(self.data, noise_flag=False)
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        # Compare the values directly: assertEqual(a, b) reports a useful
        # diff on failure, unlike assertEqual(a == b, True).
        self.assertEqual(
            cliques,
            [['HTN', 'Height'], ['HTN', 'Weight'], ['Income', 'TRV'],
             ['Age'], ['DGF']])

    def test_jtree_with_white_list(self):
        # White-listed attribute groups must be forced into shared cliques.
        dep_graph = DependencyGraph(
            self.data, white_list=[['Age', 'Income', 'TRV'], ['DGF', 'HTN']])
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        self.assertEqual(
            cliques,
            [['HTN', 'Height'], ['HTN', 'Weight'], ['HTN', 'DGF'],
             ['Income', 'TRV', 'Age']])

    def test_build_jtree_then_check_jtree_file(self):
        # TestA builds the junction-tree file that TestB then inspects, so
        # the two helpers are driven in order from a single test method.
        self.TestA()
        self.TestB()

    def TestA(self):
        """The dependency graph is a complete graph, so there is only one
        clique in the junction tree."""
        jtree = JunctionTree(self.edges, self.nodes, self.jtree_path)
        jtreepy = jtree.get_jtree()
        self.assertEqual(len(jtreepy), 3)

    def TestB(self):
        import os
        # os.stat raises if the junction-tree file was never written, which
        # is the only check currently performed.
        st = os.stat(self.jtree_path)
        # TODO: assert the file is freshly modified, e.g. by comparing
        # st.st_mtime against a timestamp captured before the build.
class Preprocess(Base):
    """Pipeline driver for the pre-processing stage: read the raw data,
    coarsen it, build the dependency graph and junction tree, and persist
    the computed artifacts."""

    def __init__(self, request):
        # `request` is a dict-like task description; optional keys fall
        # back to module-level defaults via dict.get.
        selected = request['selected_attrs']
        self.chunk_size = request.get('chunk_size', -1)
        self.coarse_data_path = None
        self.data = None
        self.data_path = request['data_path']
        self.date_format = selected.get('date_format')
        self.dep_graph = None  # string form of the original edges
        self.domain = None
        self.edges = None
        # Initialized here (it was only ever set in build_dep_graph) so
        # that build_jtree fails with a clear value rather than an
        # AttributeError if called out of order.
        self.cust_edges = None
        self.eps1_val = request.get('eps1_val', c.EPSILON_1)
        self.eps1_level = request.get('eps1_level', 1)
        self.jtree_strct = None
        self.jtree_file_path = None
        self.names = request.get('names')
        self.nodes = None
        self.opted_cluster = None
        self.selected_attrs = self.convert_selected_attrs(selected)
        self.specified_c_domain = selected.get('specified_c_domain')
        self.task_id = request['task_id']
        self.task_folder = self.create_task_folder(self.task_id)
        self.valbin_map = None
        self.white_list = self.get_white_list(request)

    def read_data(self):
        """Load the raw data according to the user's column selection."""
        self.data = DataUtils(
            file_path=self.data_path,
            selected_attrs=self.selected_attrs,
            names=self.names,
            specified_c_domain=self.specified_c_domain,
            chunk_size=self.chunk_size,
            date_format=self.date_format)

    def coarse(self):
        """Coarsen the data and cache its domain / node / bin metadata."""
        self.data.data_coarsilize()
        self.domain = self.data.get_domain()
        self.nodes = self.data.get_nodes_name()
        self.valbin_map = str(self.data.get_valbin_maps())

    def build_dep_graph(self):
        """Build the dependency graph, with and without the white list."""
        dep_graph_obj = DependencyGraph(self.data, eps1_val=self.eps1_val)
        self.edges = dep_graph_obj.get_dep_edges(display=True)
        self.cust_edges = dep_graph_obj.set_white_list(self.white_list) \
            .get_dep_edges(display=True)
        self.dep_graph = str(self.edges)

    def get_white_list(self, request):
        """Return the requested white list as a Python list.

        Accepts either a real list or its string representation (parsed
        with ast.literal_eval, never eval); a missing/empty value yields [].
        """
        white_list = request.get('white_list') or "[]"
        if not isinstance(white_list, list):
            white_list = ast.literal_eval(white_list)
        return white_list

    def build_jtree(self):
        """Build the junction tree and choose the marginals to measure."""
        jtree = JunctionTree(
            self.cust_edges,
            self.nodes,
            # the path to save the junction tree file
            self.get_jtree_file_path(self.task_id, self.eps1_level),
        )
        # Optimize which marginal clusters to measure.
        var_reduce = VarianceReduce(self.domain,
                                    jtree.get_jtree()['cliques'], 0.2)
        self.opted_cluster = var_reduce.main()
        self.jtree_strct = jtree.get_jtree()['cliques']
        self.jtree_file_path = self.save_merged_jtree(
            self.task_id, self.eps1_level, self.jtree_strct)

    def save_coarse_data(self):
        # TODO: to deal with failure
        file_path = os.path.join(self.task_folder, c.COARSE_DATA_NAME)
        if self.data is not None:
            self.data.save(file_path)
        self.coarse_data_path = file_path

    def update_instance(self, status, is_celery):
        """Persist the computed artifacts onto the Task model.

        No-op unless running under celery (is_celery is truthy).
        """
        if not is_celery:
            return
        instance = get_object_or_404(Task, pk=self.task_id)
        instance.eps1_val = self.eps1_val
        instance.eps1_level = self.eps1_level
        instance.dep_graph = str(self.edges)
        instance.valbin_map = str(self.valbin_map)
        instance.domain = str(
            self.domain.items()) if self.domain is not None else None
        instance.white_list = self.white_list
        instance.jtree_strct = str(self.jtree_strct)
        instance.opted_cluster = str(self.opted_cluster)
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()
class DataUtilitiesTests(TestCase):
    """Tests for DataUtils coarsening/generalization and Base skip logic."""

    # TODO: The Data Coarse and Generalize steps should be separated, to
    # simulate a more realistic case.
    def setUp(self):
        # dict() wrapper around a literal was redundant.
        self.selected_attrs = {
            'Age': 'C',
            'workclass': 'D',
            'fnlwgt': 'C',
            'education': 'D',
            'education_num': 'D',
            'marital_status': 'D',
            'occupation': 'D',
            'relationship': 'D',
            'race': 'D',
            'sex': 'D',
            'capital_gain': 'C',
            'capital_loss': 'C',
            'hours_per_week': 'C',
            'native_country': 'D',
            'salary_class': 'D'
        }
        self.data = DataUtils(file_path=TESTING_FILE,
                              selected_attrs=self.selected_attrs)
        self.data.data_coarsilize()
        self.base = Base()

    def test_data_preview(self):
        data = DataUtils(file_path=TESTING_FILE)
        preview = data.data_preview()
        self.assertTrue(len(preview.values[0]) > 0)

    def test_read_data_by_three_selected_column(self):
        """Test the read data by user specified columns."""
        self.assertEqual(len(self.data.get_nodes_name()),
                         len(self.selected_attrs))

    def test_data_domain_keep_original_order(self):
        """Test that the domain keys keep the column order of the raw data."""
        df = self.data.get_pandas_df()
        domain = self.data.get_domain()
        # list() makes the comparison explicit (and Py3-safe, where keys()
        # is a view); identical behavior under Py2.
        cols = list(domain.keys())
        self.assertEqual(cols, list(df.columns.values))

    def test_data_coarsilization(self):
        # Parenthesized print is identical in Py2 and valid in Py3.
        print(self.data.get_pandas_df()[:5])

    def test_data_generalization(self):
        self.data.data_generalize()
        print(self.data.get_pandas_df()[:5])

    def test_is_skip_pre_processing_with_create(self):
        create_flag = True
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        # Bug fix: the original passed `request` twice and never used
        # `instance` (harmless only because the two dicts are equal).
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        self.assertFalse(skip_pre_process)

    def test_is_skip_pre_processing_with_data_path_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file22222222.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print(skip_pre_process)
        self.assertFalse(skip_pre_process)

    def test_is_skip_pre_processing_with_selected_attr_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['D', 'D', 'D', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print(skip_pre_process)
        self.assertFalse(skip_pre_process)

    def test_is_skip_pre_processing_without_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        self.assertTrue(skip_pre_process)
class DataUtilitiesTests(TestCase):
    """Tests for DataUtils coarsening/generalization and Base skip logic."""

    # TODO: The Data Coarse and Generalize steps should be separated, to
    # simulate a more realistic case.
    def setUp(self):
        # dict() wrapper around a literal was redundant.
        self.selected_attrs = {
            'Age': 'C',
            'workclass': 'D',
            'fnlwgt': 'C',
            'education': 'D',
            'education_num': 'D',
            'marital_status': 'D',
            'occupation': 'D',
            'relationship': 'D',
            'race': 'D',
            'sex': 'D',
            'capital_gain': 'C',
            'capital_loss': 'C',
            'hours_per_week': 'C',
            'native_country': 'D',
            'salary_class': 'D'
        }
        self.data = DataUtils(file_path=TESTING_FILE,
                              selected_attrs=self.selected_attrs)
        self.data.data_coarsilize()
        self.base = Base()

    def test_data_preview(self):
        data = DataUtils(file_path=TESTING_FILE)
        preview = data.data_preview()
        self.assertTrue(len(preview.values[0]) > 0)

    def test_read_data_by_three_selected_column(self):
        """Test the read data by user specified columns."""
        self.assertEqual(len(self.data.get_nodes_name()),
                         len(self.selected_attrs))

    def test_data_domain_keep_original_order(self):
        """Test that the domain keys keep the column order of the raw data."""
        df = self.data.get_pandas_df()
        domain = self.data.get_domain()
        # list() makes the comparison explicit (and Py3-safe, where keys()
        # is a view); identical behavior under Py2.
        cols = list(domain.keys())
        self.assertEqual(cols, list(df.columns.values))

    def test_data_coarsilization(self):
        # Parenthesized print is identical in Py2 and valid in Py3.
        print(self.data.get_pandas_df()[:5])

    def test_data_generalization(self):
        self.data.data_generalize()
        print(self.data.get_pandas_df()[:5])

    def test_is_skip_pre_processing_with_create(self):
        create_flag = True
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        # Bug fix: the original passed `request` twice and never used
        # `instance` (harmless only because the two dicts are equal).
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        self.assertFalse(skip_pre_process)

    def test_is_skip_pre_processing_with_data_path_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file22222222.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print(skip_pre_process)
        self.assertFalse(skip_pre_process)

    def test_is_skip_pre_processing_with_selected_attr_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['D', 'D', 'D', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print(skip_pre_process)
        self.assertFalse(skip_pre_process)

    def test_is_skip_pre_processing_without_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        self.assertTrue(skip_pre_process)
class Preprocess(Base):
    """Pipeline driver for the pre-processing stage: read the raw data,
    coarsen it, build the dependency graph and junction tree, and persist
    the computed artifacts."""

    def __init__(self, request):
        # `request` is a dict-like task description; optional keys fall
        # back to module-level defaults via dict.get.
        selected = request['selected_attrs']
        self.chunk_size = request.get('chunk_size', -1)
        self.coarse_data_path = None
        self.data = None
        self.data_path = request['data_path']
        self.date_format = selected.get('date_format')
        self.dep_graph = None  # string form of the original edges
        self.domain = None
        self.edges = None
        # Initialized here (it was only ever set in build_dep_graph) so
        # that build_jtree fails with a clear value rather than an
        # AttributeError if called out of order.
        self.cust_edges = None
        self.eps1_val = request.get('eps1_val', c.EPSILON_1)
        self.eps1_level = request.get('eps1_level', 1)
        self.jtree_strct = None
        self.jtree_file_path = None
        self.names = request.get('names')
        self.nodes = None
        self.opted_cluster = None
        self.selected_attrs = self.convert_selected_attrs(selected)
        self.specified_c_domain = selected.get('specified_c_domain')
        self.task_id = request['task_id']
        self.task_folder = self.create_task_folder(self.task_id)
        self.valbin_map = None
        self.white_list = self.get_white_list(request)

    def read_data(self):
        """Load the raw data according to the user's column selection."""
        self.data = DataUtils(
            file_path=self.data_path,
            selected_attrs=self.selected_attrs,
            names=self.names,
            specified_c_domain=self.specified_c_domain,
            chunk_size=self.chunk_size,
            date_format=self.date_format)

    def coarse(self):
        """Coarsen the data and cache its domain / node / bin metadata."""
        self.data.data_coarsilize()
        self.domain = self.data.get_domain()
        self.nodes = self.data.get_nodes_name()
        self.valbin_map = str(self.data.get_valbin_maps())

    def build_dep_graph(self):
        """Build the dependency graph, with and without the white list."""
        dep_graph_obj = DependencyGraph(self.data, eps1_val=self.eps1_val)
        self.edges = dep_graph_obj.get_dep_edges(display=True)
        self.cust_edges = dep_graph_obj.set_white_list(self.white_list) \
            .get_dep_edges(display=True)
        self.dep_graph = str(self.edges)

    def get_white_list(self, request):
        """Return the requested white list as a Python list.

        Accepts either a real list or its string representation (parsed
        with ast.literal_eval, never eval); a missing/empty value yields [].
        """
        white_list = request.get('white_list') or "[]"
        if not isinstance(white_list, list):
            white_list = ast.literal_eval(white_list)
        return white_list

    def build_jtree(self):
        """Build the junction tree and choose the marginals to measure."""
        jtree = JunctionTree(
            self.cust_edges,
            self.nodes,
            # the path to save the junction tree file
            self.get_jtree_file_path(self.task_id, self.eps1_level),
        )
        # Optimize which marginal clusters to measure.
        var_reduce = VarianceReduce(self.domain,
                                    jtree.get_jtree()['cliques'], 0.2)
        self.opted_cluster = var_reduce.main()
        self.jtree_strct = jtree.get_jtree()['cliques']
        self.jtree_file_path = self.save_merged_jtree(
            self.task_id, self.eps1_level, self.jtree_strct)

    def save_coarse_data(self):
        # TODO: to deal with failure
        file_path = os.path.join(self.task_folder, c.COARSE_DATA_NAME)
        if self.data is not None:
            self.data.save(file_path)
        self.coarse_data_path = file_path

    def update_instance(self, status, is_celery):
        """Persist the computed artifacts onto the Task model.

        No-op unless running under celery (is_celery is truthy).
        """
        if not is_celery:
            return
        instance = get_object_or_404(Task, pk=self.task_id)
        instance.eps1_val = self.eps1_val
        instance.eps1_level = self.eps1_level
        instance.dep_graph = str(self.edges)
        instance.valbin_map = str(self.valbin_map)
        instance.domain = str(
            self.domain.items()) if self.domain is not None else None
        instance.white_list = self.white_list
        instance.jtree_strct = str(self.jtree_strct)
        instance.opted_cluster = str(self.opted_cluster)
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()