def union_enum(left_level, right_level, x_size, alpha, top_k, w, loss, cur_lvl):
    """Enumerate candidate slices as unions of nodes from two different levels.

    Every (left, right) pair that passes ``approved_union_slice`` yields a
    new Bucket over the union of the two parents' attributes.  The bucket is
    kept only if its computed bounds survive the pruning check.

    Returns a dict mapping bucket name -> Bucket.
    """
    buckets = {}
    for left in left_level:
        for right in right_level:
            # Skip pairs that the union pre-check rejects.
            if not approved_union_slice(left, right):
                continue
            combined = SparkNode(None, None)
            combined.attributes = list(set(left.attributes) | set(right.attributes))
            bucket = Bucket(combined, cur_lvl, w, x_size, loss)
            bucket.parents.extend((left, right))
            bucket.calc_bounds(w, x_size, loss)
            # Only buckets whose bounds pass the pruning test are retained.
            if bucket.check_bounds(x_size, alpha, top_k):
                buckets[bucket.name] = bucket
    return buckets
def join_enum(cur_lvl_nodes, cur_lvl, x_size, alpha, top_k, w, loss):
    """Enumerate candidate slices by joining pairs of nodes from one level.

    Each unordered pair (i < j) that passes ``approved_join_slice`` yields a
    Bucket over the union of both parents' attributes; buckets surviving the
    bound check are collected.

    Returns a dict mapping bucket name -> Bucket.
    """
    buckets = {}
    for idx, first in enumerate(cur_lvl_nodes):
        # Only consider each unordered pair once (j > i).
        for second in cur_lvl_nodes[idx + 1:]:
            if not approved_join_slice(first, second, cur_lvl):
                continue
            combined = SparkNode(None, None)
            combined.attributes = list(set(first.attributes) | set(second.attributes))
            bucket = Bucket(combined, cur_lvl, w, x_size, loss)
            bucket.parents.extend((first, second))
            bucket.calc_bounds(w, x_size, loss)
            # Prune buckets whose bounds cannot beat the current top-k.
            if bucket.check_bounds(x_size, alpha, top_k):
                buckets[bucket.name] = bucket
    return buckets
def make_first_level(features, predictions, loss, top_k, w, loss_type):
    """Build the level-1 slices, one per single feature.

    First-level slices are enumerated the "classic" way: the concrete data
    for every feature is processed directly, without any bound-based pruning.

    Returns the list of level-1 SparkNode instances.
    """
    first_level = []
    for feature in features:
        node = SparkNode(loss, predictions)
        node.parents = [feature]
        node.attributes.append(feature)
        node.name = node.make_name()
        node.key = node.make_key()
        node.process_slice(loss_type)
        node.score = opt_fun(node.loss, node.size, loss, len(predictions), w)
        # At level 1 the score itself is the tightest known upper bound.
        node.c_upper = node.score
        first_level.append(node)
        node.print_debug(top_k, 0)
    return first_level
def process_node(node_i, level, loss, predictions, cur_lvl, top_k, alpha, loss_type, w, debug, enumerator):
    """Combine ``node_i`` with every compatible node of ``level``.

    Pair approval is delegated to the join or union check depending on
    ``enumerator``; the name-prefix ordering test ensures each unordered pair
    is produced only once.  Approved candidates have their bounds computed,
    and only those worth slicing are fully materialized and score-checked.

    Returns the list of surviving candidate nodes for the next level.
    """
    x_size = len(predictions)
    enum_nodes = []
    for node_j in level:
        if enumerator == "join":
            approved = approved_join_slice(node_i, node_j, cur_lvl)
        else:
            approved = approved_union_slice(node_i, node_j)
        if not approved:
            continue
        # Canonical ordering on the leading name component avoids emitting
        # the same pair twice.
        if int(node_i.name.split("&&")[0]) >= int(node_j.name.split("&&")[0]):
            continue
        candidate = SparkNode(loss, predictions)
        candidate.parents = list(set(candidate.parents) | {node_i, node_j})
        candidate.attributes = union(node_i.attributes, node_j.attributes)
        candidate.name = candidate.make_name()
        candidate.key = candidate.make_key()
        candidate.calc_bounds(cur_lvl, w)
        # Extract concrete data only when the bounds justify the cost.
        if candidate.check_bounds(top_k, x_size, alpha):
            candidate.process_slice(loss_type)
            candidate.score = opt_fun(candidate.loss, candidate.size, loss, x_size, w)
            if candidate.check_constraint(top_k, x_size, alpha):
                enum_nodes.append(candidate)
        if debug:
            candidate.print_debug(top_k, cur_lvl)
    return enum_nodes
def join_enum_fun(node_a, list_b, predictions, f_l2, debug, alpha, w, loss_type, cur_lvl, top_k):
    """Join ``node_a`` with every approved node of ``list_b``.

    For each approved pair a new SparkNode is created over the union of the
    parents' attributes; its bounds decide whether the concrete slice data
    is extracted and scored.  Nodes passing the score constraint are added
    to ``top_k`` and kept for the next enumeration level.

    Returns the list of surviving nodes.
    """
    x_size = len(predictions)
    nodes = []
    for node_b in list_b:
        # Bug fixes vs. the original: the loop passed the *index* (an int)
        # instead of the node object to approved_join_slice / parents, and
        # proceeded when the join was NOT approved — both inverted relative
        # to the sibling enumerators (process_node, join_enum).
        flag = spark_utils.approved_join_slice(node_b, node_a, cur_lvl)
        if not flag:
            continue
        new_node = SparkNode(predictions, f_l2)
        parents_set = set(new_node.parents)
        parents_set.add(node_b)
        parents_set.add(node_a)
        new_node.parents = list(parents_set)
        new_node.attributes = union(node_a.attributes, node_b.attributes)
        new_node.name = new_node.make_name()
        # Bug fix: key was never computed, yet it is read below in the
        # "not in top_k.keys" membership test (cf. make_first_level).
        new_node.key = new_node.make_key()
        new_node.calc_bounds(cur_lvl, w)
        # Check if concrete data should be extracted or not (only for those
        # that have a promising score upper bound and a big enough subset).
        to_slice = new_node.check_bounds(top_k, x_size, alpha)
        if to_slice:
            new_node.process_slice(loss_type)
            new_node.score = opt_fun(new_node.loss, new_node.size, f_l2, x_size, w)
            # Decide whether to keep the node for the next level (to form
            # new combinations) based on its score and top-k membership.
            if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                top_k.add_new_top_slice(new_node)
                nodes.append(new_node)
            if debug:
                new_node.print_debug(top_k, cur_lvl)
    return nodes