def process_node(node_i, level, loss, predictions, cur_lvl, top_k, alpha, loss_type, w, debug, enumerator):
    """Pair ``node_i`` with the nodes of ``level`` and return the candidates that survive pruning.

    For every ``node_j`` in ``level`` the pair is first vetted by
    ``approved_join_slice`` (when ``enumerator == "join"``) or by
    ``approved_union_slice`` (any other value). An approved pair is combined
    only when the integer parsed from the first ``"&&"``-separated token of
    ``node_i.name`` is smaller than the one parsed from ``node_j.name``
    (presumably so each unordered pair is produced once -- confirm).

    Args:
        node_i: node that is combined with each element of ``level``.
        level: iterable of nodes to pair ``node_i`` with.
        loss: passed to ``SparkNode`` and used as the reference loss in ``opt_fun``.
        predictions: passed to ``SparkNode``; its length is used as the data size.
        cur_lvl: level index forwarded to the approval and bound helpers.
        top_k: top-k holder forwarded to the bound/constraint checks.
        alpha: forwarded to the bound/constraint checks.
        loss_type: forwarded to ``process_slice``.
        w: weight forwarded to ``calc_bounds`` and ``opt_fun``.
        debug: when truthy, ``print_debug`` is called for every created node.
        enumerator: ``"join"`` selects the join approval check, anything else
            the union approval check.

    Returns:
        List of newly created nodes that passed both ``check_bounds`` and
        ``check_constraint``.
    """
    survivors = []
    for node_j in level:
        if enumerator == "join":
            approved = approved_join_slice(node_i, node_j, cur_lvl)
        else:
            approved = approved_union_slice(node_i, node_j)
        # The name comparison is only evaluated for approved pairs (short-circuit).
        if not (approved and int(node_i.name.split("&&")[0]) < int(node_j.name.split("&&")[0])):
            continue

        child = SparkNode(loss, predictions)
        lineage = set(child.parents)
        lineage.add(node_i)
        lineage.add(node_j)
        child.parents = list(lineage)
        child.attributes = union(node_i.attributes, node_j.attributes)
        child.name = child.make_name()
        child.key = child.make_key()
        child.calc_bounds(cur_lvl, w)

        total = len(predictions)
        # Only materialize the slice when the bounds say it is worth it.
        if child.check_bounds(top_k, total, alpha):
            child.process_slice(loss_type)
            child.score = opt_fun(child.loss, child.size, loss, total, w)
            if child.check_constraint(top_k, total, alpha):
                survivors.append(child)
        if debug:
            child.print_debug(top_k, cur_lvl)
    return survivors
def join_enum_fun(node_a, list_b, predictions, f_l2, debug, alpha, w, loss_type, cur_lvl, top_k):
    """Combine ``node_a`` with the nodes of ``list_b`` and return the nodes kept for this level.

    Each element of ``list_b`` is checked against ``node_a`` with
    ``spark_utils.approved_join_slice``; for the pairs that pass the guard a
    new ``SparkNode`` is built from the union of both attribute lists, its
    bounds are computed, and -- if the bounds allow it -- the slice is
    evaluated and scored. Nodes that satisfy ``check_constraint`` and whose
    key is not yet in ``top_k.keys`` are added to ``top_k`` and returned.

    The nodes themselves (not their positional indices) are handed to the
    approval check and recorded as parents.

    Review notes (behavior intentionally left as found):
        * The node is created when ``approved_join_slice`` returns a falsy
          value. NOTE(review): the helper's name suggests the opposite
          polarity -- confirm against its definition.
        * ``SparkNode(predictions, f_l2)`` passes the predictions first.
          NOTE(review): verify the argument order against the constructor.
        * ``new_node.key`` is read but never assigned here.
          NOTE(review): presumably ``SparkNode`` provides a default -- confirm.

    Args:
        node_a: node combined with every element of ``list_b``.
        list_b: sequence of nodes exposing ``attributes``.
        predictions: sized collection; its length is used as the data size.
        f_l2: reference loss forwarded to ``SparkNode`` and ``opt_fun``.
        debug: when truthy, ``print_debug`` is called for every created node.
        alpha: forwarded to the bound/constraint checks.
        w: weight forwarded to ``calc_bounds`` and ``opt_fun``.
        loss_type: forwarded to ``process_slice``.
        cur_lvl: level index forwarded to the approval and bound helpers.
        top_k: top-k holder; updated in place with accepted nodes.

    Returns:
        List of accepted nodes (empty when ``list_b`` is empty).
    """
    x_size = len(predictions)
    nodes = []
    for node_b in list_b:
        if spark_utils.approved_join_slice(node_b, node_a, cur_lvl):
            continue

        new_node = SparkNode(predictions, f_l2)
        parents_set = set(new_node.parents)
        parents_set.add(node_b)
        parents_set.add(node_a)
        new_node.parents = list(parents_set)
        new_node.attributes = union(node_a.attributes, node_b.attributes)
        new_node.name = new_node.make_name()
        new_node.calc_bounds(cur_lvl, w)

        # Extract the concrete data only when the upper score bound and the
        # subset size are large enough.
        if new_node.check_bounds(top_k, x_size, alpha):
            new_node.process_slice(loss_type)
            new_node.score = opt_fun(new_node.loss, new_node.size, f_l2, x_size, w)
            # The score decides whether the node is kept for building
            # combinations on the next level.
            if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                top_k.add_new_top_slice(new_node)
                nodes.append(new_node)
        if debug:
            new_node.print_debug(top_k, cur_lvl)
    return nodes
def test_parents_second(self):
    """Check the attribute union of two nodes produced by joining on ``(2, 'x0_3')``."""
    level_nodes = {}
    known_nodes = {}
    enumerated = slicer.join_enum(
        (2, 'x0_3'), self.first_level_nodes, self.complete_x, self.loss,
        len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha,
        self.w, self.loss_type, True, 1, known_nodes, self.top_k, level_nodes)
    first_parent = enumerated[0]['x0_3 && x1_3']
    second_parent = enumerated[0]['x0_3 && x2_2']

    # The child node is built as in the enumeration code; only the merged
    # attribute list is asserted on.
    child = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
    child.parents = [first_parent, second_parent]
    merged = slicer.union(first_parent.attributes, second_parent.attributes)

    self.assertEqual(merged, [('x0_3', 2), ('x1_3', 5), ('x2_2', 7)])
    print("check2")
def test_nonsense(self):
    """Check that ``slice_name_nonsense`` returns ``True`` for two joined parents at level 2."""
    level_nodes = {}
    known_nodes = {}
    enumerated = slicer.join_enum(
        (2, 'x0_3'), self.first_level_nodes, self.complete_x, self.loss,
        len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha,
        self.w, self.loss_type, True, 1, known_nodes, self.top_k, level_nodes)
    first_parent = enumerated[0]['x0_3 && x1_3']
    second_parent = enumerated[0]['x0_3 && x2_2']

    # Build the combined node the same way the enumeration code does.
    child = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
    child.parents = [first_parent, second_parent]
    child.attributes = slicer.union(first_parent.attributes, second_parent.attributes)
    child.name = child.make_name()

    is_nonsense = slicer.slice_name_nonsense(first_parent, second_parent, 2)
    self.assertEqual(True, is_nonsense)
    print("check3")
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    """Enumerate slices level by level and print the resulting top-k slices.

    The first level comes from ``make_first_level``. Every further level is
    built by pairing nodes of two earlier levels (``left`` and ``right``),
    skipping pairs rejected by ``check_attributes``. New nodes get bounds,
    are optionally evaluated and scored, and are kept for the next level when
    they satisfy ``check_constraint`` and are not already in ``top_k``.
    Enumeration stops once a level keeps no nodes.

    The loop guard looks at the first-level node list (``first_level[0]``)
    rather than at the ``(nodes, all_nodes)`` pair returned by
    ``make_first_level``, so an empty first level ends enumeration immediately.

    Args:
        all_features: sized collection of features; its length is reported as
            the number of level-1 candidates.
        complete_x, loss, x_size, y_test, errors: forwarded to
            ``make_first_level`` and ``Node``; ``loss`` and ``x_size`` are also
            used for scoring and the bound/constraint checks.
        debug: when truthy, ``print_debug`` is called for each newly
            registered node.
        alpha: forwarded to the bound/constraint checks.
        k: passed to ``Topk``.
        w: weight forwarded to the bound and scoring helpers.
        loss_type: forwarded to ``make_first_level`` and ``process_slice``.
        b_update: when truthy, bounds are recomputed for a node whose key is
            already registered.

    Returns:
        None. Progress and the selected slices are printed.
    """
    top_k = Topk(k)
    # First-level slices are enumerated the "classic" way (materializing the
    # data, without analyzing bounds).
    first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha,
                                   top_k)
    # levels[i] holds (nodes kept at level i + 1, number of candidates at that level).
    levels = [(first_level[0], len(all_features))]
    all_nodes = first_level[1]
    # cur_lvl - index of the current level, correlates with the number of
    # slice-forming features.
    cur_lvl = 1
    # Nodes of the most recently completed level.
    cur_lvl_nodes = first_level[0]
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # DPSize-style approach: combine nodes of previous levels and update
    # bounds for those that already exist.
    while len(cur_lvl_nodes) > 0:
        cur_lvl_nodes = []
        count = 0
        for left in range(cur_lvl // 2 + 1):
            right = cur_lvl - 1 - left
            for left_node in levels[left][0]:
                for right_node in levels[right][0]:
                    if check_attributes(left_node, right_node):
                        continue
                    new_node = Node(complete_x, loss, x_size, y_test, errors)
                    parents_set = set(new_node.parents)
                    parents_set.add(left_node)
                    parents_set.add(right_node)
                    new_node.parents = list(parents_set)
                    new_node.attributes = union(left_node.attributes, right_node.attributes)
                    new_node.name = new_node.make_name()
                    new_node.key = new_node.make_key(len(all_nodes))
                    if new_node.key[1] in all_nodes:
                        # NOTE(review): this only re-wraps the existing parents
                        # in a set (the new pair is not merged in), and the
                        # refreshed bounds land on new_node, which is not
                        # stored. Confirm whether existing_item was meant to
                        # be updated instead.
                        existing_item = all_nodes[new_node.key[1]]
                        existing_item.parents = set(existing_item.parents)
                        if b_update:
                            s_upper = new_node.calc_s_upper(cur_lvl)
                            s_lower = new_node.calc_s_lower(cur_lvl)
                            e_upper = new_node.calc_e_upper()
                            e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                            new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
                    else:
                        new_node.calc_bounds(cur_lvl, w)
                        all_nodes[new_node.key[1]] = new_node
                        # Extract the concrete data only when the upper score
                        # bound and the subset size are large enough.
                        if new_node.check_bounds(top_k, x_size, alpha):
                            new_node.process_slice(loss_type)
                            new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
                            # The score decides whether the node is kept for
                            # building combinations on the next level.
                            if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                                top_k.add_new_top_slice(new_node)
                                cur_lvl_nodes.append(new_node)
                        if debug:
                            new_node.print_debug(top_k, cur_lvl)
            count = count + levels[left][1] * levels[right][1]
        print("Level " + str(cur_lvl) + " had " + str(count) + " candidates but after pruning only " +
              str(len(cur_lvl_nodes)) + " go to the next level")
        cur_lvl = cur_lvl + 1
        levels.append((cur_lvl_nodes, count))
        top_k.print_topk()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()