def process_node(node_i, level, loss, predictions, cur_lvl, top_k, alpha, loss_type, w, debug, enumerator):
    cur_enum_nodes = []
    for node_j in level:
        if enumerator == "join":
            flag = approved_join_slice(node_i, node_j, cur_lvl)
        else:
            flag = approved_union_slice(node_i, node_j)
        # only one ordering of the pair is processed, based on the leading id in the slice name
        if flag and int(node_i.name.split("&&")[0]) < int(node_j.name.split("&&")[0]):
            new_node = SparkNode(loss, predictions)
            parents_set = set(new_node.parents)
            parents_set.add(node_i)
            parents_set.add(node_j)
            new_node.parents = list(parents_set)
            parent1_attr = node_i.attributes
            parent2_attr = node_j.attributes
            new_node_attr = union(parent1_attr, parent2_attr)
            new_node.attributes = new_node_attr
            new_node.name = new_node.make_name()
            new_node.key = new_node.make_key()
            new_node.calc_bounds(cur_lvl, w)
            to_slice = new_node.check_bounds(top_k, len(predictions), alpha)
            if to_slice:
                new_node.process_slice(loss_type)
                new_node.score = opt_fun(new_node.loss, new_node.size, loss, len(predictions), w)
                if new_node.check_constraint(top_k, len(predictions), alpha):
                    cur_enum_nodes.append(new_node)
                if debug:
                    new_node.print_debug(top_k, cur_lvl)
    return cur_enum_nodes
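# A minimal, self-contained sketch of the ordering guard above: comparing the leading id of the
# "&&"-joined slice names keeps only one of the two symmetric pairings (i, j) / (j, i), so each
# candidate combination is enumerated once. The names and helper below are illustrative only.
def _first_id(name):
    return int(name.split("&&")[0])

# pair (i, j) passes the guard, the mirrored pair (j, i) does not
assert _first_id("3&&7") < _first_id("5&&7")
assert not (_first_id("5&&7") < _first_id("3&&7"))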
def make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha, top_k):
    all_nodes = {}
    counter = 0
    first_level = []
    for feature in all_features:
        new_node = Node(complete_x, loss, x_size, y_test, errors)
        new_node.parents = [(feature, counter)]
        new_node.attributes.append((feature, counter))
        new_node.name = new_node.make_name()
        new_id = len(all_nodes)
        new_node.key = new_node.make_key(new_id)
        all_nodes[new_node.key] = new_node
        new_node.process_slice(loss_type)
        # for first-level nodes all bounds are exact, since the concrete metrics are already computed
        new_node.s_upper = new_node.size
        new_node.s_lower = 0
        new_node.e_upper = new_node.loss
        new_node.e_max_upper = new_node.e_max
        new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
        new_node.c_upper = new_node.score
        first_level.append(new_node)
        new_node.print_debug(top_k, 0)
        # constraint for a 1st-level node to qualify as a problematic candidate
        if new_node.score > 1 and new_node.size >= x_size / alpha:
            # this method updates the top-k slices if needed
            top_k.add_new_top_slice(new_node)
        counter = counter + 1
    return first_level, all_nodes
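# Hedged, self-contained illustration (made-up numbers, hypothetical helper name) of the
# first-level candidate constraint applied above: a slice qualifies when its score exceeds 1
# and its size is at least x_size / alpha.
def _first_level_constraint_demo(x_size=1000, alpha=5, score=1.3, size=250):
    return score > 1 and size >= x_size / alpha  # True for these example values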
def join_enum_fun(node_a, list_b, predictions, f_l2, debug, alpha, w, loss_type, cur_lvl, top_k):
    x_size = len(predictions)
    nodes = []
    for node_i in range(len(list_b)):
        flag = spark_utils.approved_join_slice(list_b[node_i], node_a, cur_lvl)
        if flag:
            new_node = SparkNode(f_l2, predictions)
            parents_set = set(new_node.parents)
            parents_set.add(list_b[node_i])
            parents_set.add(node_a)
            new_node.parents = list(parents_set)
            parent1_attr = node_a.attributes
            parent2_attr = list_b[node_i].attributes
            new_node_attr = union(parent1_attr, parent2_attr)
            new_node.attributes = new_node_attr
            new_node.name = new_node.make_name()
            new_node.key = new_node.make_key()
            new_node.calc_bounds(cur_lvl, w)
            # check whether concrete data should be extracted (only for nodes whose score upper
            # bound and subset size are big enough)
            to_slice = new_node.check_bounds(top_k, x_size, alpha)
            if to_slice:
                new_node.process_slice(loss_type)
                new_node.score = opt_fun(new_node.loss, new_node.size, f_l2, x_size, w)
                # decide whether to add the node to the current level's nodes (so that new
                # combinations can be formed on the next level) based on its score value
                if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                    top_k.add_new_top_slice(new_node)
                    nodes.append(new_node)
                if debug:
                    new_node.print_debug(top_k, cur_lvl)
    return nodes
def calc_bucket_metrics(bucket, loss, w, x_size, cur_lvl):
    bucket.calc_error()
    bucket.score = opt_fun(bucket.error, bucket.size, loss, x_size, w)
    if cur_lvl == 0:
        bucket.s_upper = bucket.size
        bucket.c_upper = bucket.score
        bucket.s_lower = 1
    return bucket
def make_first_level(features, predictions, loss, top_k, w, loss_type):
    first_level = []
    # First-level slices are enumerated in the "classic" way (extracting the data, not analyzing bounds)
    for feature in features:
        new_node = SparkNode(loss, predictions)
        new_node.parents = [feature]
        new_node.attributes.append(feature)
        new_node.name = new_node.make_name()
        new_node.key = new_node.make_key()
        new_node.process_slice(loss_type)
        new_node.score = opt_fun(new_node.loss, new_node.size, loss, len(predictions), w)
        new_node.c_upper = new_node.score
        first_level.append(new_node)
        new_node.print_debug(top_k, 0)
    return first_level
def make_first_level(features, predictions, f_l2, top_k, alpha, k, w, loss_type):
    first_level = []
    # First-level slices are enumerated in the "classic" way (extracting the data, not analyzing bounds)
    for feature in features:
        new_node = SparkedNode(f_l2, predictions)
        new_node.parents = [feature]
        new_node.attributes.append(feature)
        new_node.name = new_node.make_name()
        new_node.key = new_node.make_key()
        new_node.process_slice(loss_type)
        new_node.score = opt_fun(new_node.loss, new_node.size, f_l2, len(predictions), w)
        new_node.c_upper = new_node.score
        first_level.append(new_node)
        new_node.print_debug(top_k, 0)
        # constraint for a 1st-level node to qualify as a problematic candidate
        if new_node.check_constraint(top_k, len(predictions), alpha):
            # this method updates the top-k slices if needed
            top_k.add_new_top_slice(new_node)
    return first_level
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    top_k = Topk(k)
    # First-level slices are enumerated in the "classic" way (extracting the data, not analyzing bounds)
    levels = []
    first_level, all_nodes = make_first_level(all_features, complete_x, loss, x_size, y_test, errors,
                                              loss_type, w, alpha, top_k)
    # first-level nodes are appended to `levels` so that the second level is enumerated the same way
    # as the later ones
    levels.append((first_level, len(all_features)))
    # cur_lvl - index of the current level, correlates with the number of slice-forming features
    cur_lvl = 1  # level that is planned to be filled next
    cur_lvl_nodes = first_level
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # DPSize-style combination of previous levels' nodes; bounds are updated for nodes that already exist
    while len(cur_lvl_nodes) > 0:
        cur_lvl_nodes = []
        count = 0
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - 1 - left
            for node_i in range(len(levels[left][0])):
                for node_j in range(len(levels[right][0])):
                    flag = check_attributes(levels[left][0][node_i], levels[right][0][node_j])
                    if not flag:
                        new_node = Node(complete_x, loss, x_size, y_test, errors)
                        parents_set = set(new_node.parents)
                        parents_set.add(levels[left][0][node_i])
                        parents_set.add(levels[right][0][node_j])
                        new_node.parents = list(parents_set)
                        parent1_attr = levels[left][0][node_i].attributes
                        parent2_attr = levels[right][0][node_j].attributes
                        new_node_attr = union(parent1_attr, parent2_attr)
                        new_node.attributes = new_node_attr
                        new_node.name = new_node.make_name()
                        new_id = len(all_nodes)
                        new_node.key = new_node.make_key(new_id)
                        if new_node.key[1] in all_nodes:
                            # the same attribute combination was already built: merge parents and,
                            # if requested, update the existing bounds
                            existing_item = all_nodes[new_node.key[1]]
                            parents_set = set(existing_item.parents)
                            existing_item.parents = parents_set
                            if b_update:
                                s_upper = new_node.calc_s_upper(cur_lvl)
                                s_lower = new_node.calc_s_lower(cur_lvl)
                                e_upper = new_node.calc_e_upper()
                                e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                                new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
                        else:
                            new_node.calc_bounds(cur_lvl, w)
                            all_nodes[new_node.key[1]] = new_node
                        # check whether concrete data should be extracted (only for nodes whose score
                        # upper bound and subset size are big enough)
                        to_slice = new_node.check_bounds(top_k, x_size, alpha)
                        if to_slice:
                            new_node.process_slice(loss_type)
                            new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
                            # decide whether to add the node to the current level's nodes (so that new
                            # combinations can be formed on the next level) based on its score value
                            if new_node.check_constraint(top_k, x_size, alpha) \
                                    and new_node.key not in top_k.keys:
                                top_k.add_new_top_slice(new_node)
                                cur_lvl_nodes.append(new_node)
                            if debug:
                                new_node.print_debug(top_k, cur_lvl)
            count = count + levels[left][1] * levels[right][1]
        print("Level " + str(cur_lvl) + " had " + str(count) + " candidates but after pruning only " +
              str(len(cur_lvl_nodes)) + " go to the next level")
        cur_lvl = cur_lvl + 1
        levels.append((cur_lvl_nodes, count))
        top_k.print_topk()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
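# Self-contained sketch of the DPSize-style pairing loop in process(): it lists which previously
# built level indices (0-based; level i holds (i + 1)-attribute slices) get combined when building
# the next level. The helper name `level_pairs` is illustrative only.
def level_pairs(cur_lvl):
    pairs = []
    for left in range(int(cur_lvl / 2) + 1):
        right = cur_lvl - 1 - left
        pairs.append((left, right))
    return pairs

# level_pairs(1) -> [(0, 0)]          : 1-attribute slices joined with 1-attribute slices
# level_pairs(3) -> [(0, 2), (1, 1)]  : 1+3 and 2+2 attribute combinations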
def test_opt_fun(self):
    self.slice_member.score = slicer.opt_fun(self.slice_member.loss, self.slice_member.size,
                                             self.loss, len(self.x_test), self.w)
    print("check 8")