Esempio n. 1
0
 def test_non_nonsense(self):
     # Builds a hand-made node over ('x1_2', 4) and ('x2_2', 7), joins the
     # first-level slice (2, 'x0_3') through slicer.join_enum, and expects
     # slicer.slice_name_nonsense to return True for the joined slice
     # 'x0_3 && x2_3' paired with the hand-made node.
     x_size = len(self.complete_x)
     manual_node = Node(self.complete_x, self.loss, x_size, self.y_test, self.errors)
     manual_node.parents = [
         self.first_level_nodes[(4, 'x1_2')],
         self.first_level_nodes[(7, 'x2_2')],
     ]
     manual_node.attributes = [('x1_2', 4), ('x2_2', 7)]
     # Fresh, empty containers for the join: all_nodes and cur_lvl_nodes.
     join_result = slicer.join_enum(
         (2, 'x0_3'), self.first_level_nodes, self.complete_x, self.loss,
         x_size, self.y_test, self.errors, self.debug, self.alpha, self.w,
         self.loss_type, True, 1, {}, self.top_k, {})
     joined_node = join_result[0]['x0_3 && x2_3']
     manual_node.key = (8, 'x1_2 && x2_2')
     outcome = slicer.slice_name_nonsense(joined_node, manual_node, 2)
     self.assertEqual(True, outcome)
     print("check4")
Esempio n. 2
0
 def test_parents_second(self):
     # Joins the first-level slice (2, 'x0_3') through slicer.join_enum and
     # checks that slicer.union of the attributes of 'x0_3 && x1_3' and
     # 'x0_3 && x2_2' yields the three expected (name, index) pairs.
     x_size = len(self.complete_x)
     join_result = slicer.join_enum(
         (2, 'x0_3'), self.first_level_nodes, self.complete_x, self.loss,
         x_size, self.y_test, self.errors, self.debug, self.alpha, self.w,
         self.loss_type, True, 1, {}, self.top_k, {})
     first_parent = join_result[0]['x0_3 && x1_3']
     second_parent = join_result[0]['x0_3 && x2_2']
     # The child node is built and linked to its parents, but the assertion
     # below only looks at the merged attribute list.
     child = Node(self.complete_x, self.loss, x_size, self.y_test, self.errors)
     child.parents = [first_parent, second_parent]
     merged_attributes = slicer.union(first_parent.attributes,
                                      second_parent.attributes)
     self.assertEqual(merged_attributes, [('x0_3', 2), ('x1_3', 5), ('x2_2', 7)])
     print("check2")
Esempio n. 3
0
 def test_uppers(self):
     # Joins the first-level slice (2, 'x0_3') through slicer.join_enum,
     # builds a child of 'x0_3 && x1_3' and 'x0_3 && x2_3', and checks the
     # upper bounds produced by calc_bounds(2, self.w).
     x_size = len(self.complete_x)
     # This hand-made node is constructed as in the sibling tests; the
     # assertions below do not read it.
     manual_node = Node(self.complete_x, self.loss, x_size, self.y_test, self.errors)
     manual_node.parents = [
         self.first_level_nodes[(4, 'x1_2')],
         self.first_level_nodes[(7, 'x2_2')],
     ]
     manual_node.attributes = [('x1_2', 4), ('x2_2', 7)]
     join_result = slicer.join_enum(
         (2, 'x0_3'), self.first_level_nodes, self.complete_x, self.loss,
         x_size, self.y_test, self.errors, self.debug, self.alpha, self.w,
         self.loss_type, True, 1, {}, self.top_k, {})
     first_parent = join_result[0]['x0_3 && x1_3']
     second_parent = join_result[0]['x0_3 && x2_3']
     child = Node(self.complete_x, self.loss, x_size, self.y_test, self.errors)
     child.parents = [first_parent, second_parent]
     child.calc_bounds(2, self.w)
     self.assertEqual(25, child.s_upper)
     print("check5")
     # c_upper is truncated to an int before the comparison.
     self.assertEqual(398, int(child.c_upper))
     print("check6")
Esempio n. 4
0
 def test_nonsense(self):
     # Joins the first-level slice (2, 'x0_3') through slicer.join_enum and
     # expects slicer.slice_name_nonsense to return True for the joined
     # slices 'x0_3 && x1_3' and 'x0_3 && x2_2' at level 2.
     x_size = len(self.complete_x)
     join_result = slicer.join_enum(
         (2, 'x0_3'), self.first_level_nodes, self.complete_x, self.loss,
         x_size, self.y_test, self.errors, self.debug, self.alpha, self.w,
         self.loss_type, True, 1, {}, self.top_k, {})
     first_parent = join_result[0]['x0_3 && x1_3']
     second_parent = join_result[0]['x0_3 && x2_2']
     # The child node is fully assembled (parents, attributes, name) even
     # though the assertion only involves the two parents.
     child = Node(self.complete_x, self.loss, x_size, self.y_test, self.errors)
     child.parents = [first_parent, second_parent]
     child.attributes = slicer.union(first_parent.attributes,
                                     second_parent.attributes)
     child.name = child.make_name()
     outcome = slicer.slice_name_nonsense(first_parent, second_parent, 2)
     self.assertEqual(True, outcome)
     print("check3")
Esempio n. 5
0
def join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors,
              debug, alpha, w, loss_type, b_update, cur_lvl, all_nodes, top_k,
              cur_lvl_nodes):
    """Join ``prev_lvl[node_i]`` with the other nodes of ``prev_lvl``.

    For every index ``node_j`` of ``prev_lvl`` a candidate is built when
    ``slice_name_nonsense`` returns a truthy value for the pair and the
    partner's ``key[0]`` is greater than that of ``prev_lvl[node_i]``.

    A candidate whose name (``key[1]``) is not yet in ``all_nodes`` gets its
    bounds computed and is registered there. If ``check_bounds`` passes, its
    slice is processed and scored, it may be pushed into ``top_k``, and it is
    appended to ``cur_lvl_nodes``.

    Args:
        node_i: Index into ``prev_lvl`` of the node being joined.
        prev_lvl: Indexable collection of previous-level nodes.
        complete_x, loss, x_size, y_test, errors: Forwarded to ``Node``;
            ``loss`` and ``x_size`` are also passed to ``opt_fun``.
        debug: When truthy, ``print_debug`` is called for each newly
            registered candidate.
        alpha, w: Forwarded to the bound, constraint and scoring calls.
        loss_type: Forwarded to ``Node.process_slice``.
        b_update: When truthy, bounds are recomputed for a candidate whose
            name is already registered.
        cur_lvl: Level value forwarded to the bound calculations.
        all_nodes: Mapping from slice name to node; mutated in place.
        top_k: Top-slice container; may be mutated in place.
        cur_lvl_nodes: Collection receiving surviving candidates via
            ``append``; mutated in place.

    Returns:
        The tuple ``(cur_lvl_nodes, all_nodes)``, the same objects passed in.
    """
    for node_j in range(len(prev_lvl)):
        base = prev_lvl[node_i]
        partner = prev_lvl[node_j]
        if not slice_name_nonsense(base, partner, cur_lvl):
            continue
        # Only pair with partners that come after the base node, so each
        # unordered pair is visited once.
        if not partner.key[0] > base.key[0]:
            continue

        candidate = Node(complete_x, loss, x_size, y_test, errors)
        parent_pool = set(candidate.parents)
        parent_pool.update((base, partner))
        candidate.parents = list(parent_pool)
        candidate.attributes = union(base.attributes, partner.attributes)
        candidate.name = candidate.make_name()
        candidate.key = candidate.make_key(len(all_nodes))

        if candidate.key[1] in all_nodes:
            known = all_nodes[candidate.key[1]]
            # NOTE(review): this stores the parents as a set (a list is used
            # elsewhere) and does not add `base`/`partner` -- confirm intent.
            known.parents = set(known.parents)
            if b_update:
                # NOTE(review): the refreshed bounds land on `candidate`,
                # which is not registered anywhere in this function --
                # confirm whether `known` was meant.
                candidate.update_bounds(candidate.calc_s_upper(cur_lvl),
                                        candidate.calc_s_lower(cur_lvl),
                                        candidate.calc_e_upper(),
                                        candidate.calc_e_max_upper(cur_lvl),
                                        w)
            continue

        candidate.calc_bounds(cur_lvl, w)
        all_nodes[candidate.key[1]] = candidate
        # Concrete data is extracted only when the bounds check passes.
        if candidate.check_bounds(top_k, x_size, alpha):
            candidate.process_slice(loss_type)
            candidate.score = opt_fun(candidate.loss, candidate.size, loss,
                                      x_size, w)
            qualifies = (candidate.check_constraint(top_k, x_size, alpha)
                         and candidate.key not in top_k.keys)
            if qualifies:
                top_k.add_new_top_slice(candidate)
            # The candidate moves on to the next level whether or not it
            # entered the top-k.
            cur_lvl_nodes.append(candidate)
        if debug:
            candidate.print_debug(top_k, cur_lvl)
    return cur_lvl_nodes, all_nodes
Esempio n. 6
0
def make_first_level(all_features, complete_x, loss, x_size, y_test, errors,
                     loss_type, top_k, alpha, w):
    """Create one node per feature and score each on concrete data.

    First-level slices are enumerated directly: every node has its slice
    processed and scored, without any bound analysis.

    Args:
        all_features: Iterable of features; each one yields a node.
        complete_x, loss, x_size, y_test, errors: Forwarded to ``Node``;
            ``loss`` and ``x_size`` are also passed to ``opt_fun``.
        loss_type: Forwarded to ``Node.process_slice``.
        top_k: Top-slice container; updated for nodes that pass
            ``check_constraint``.
        alpha: Forwarded to ``Node.check_constraint``.
        w: Forwarded to ``opt_fun``.

    Returns:
        A tuple ``(first_level, all_nodes)``: the list of created nodes and
        a dict mapping each node's ``key`` to the node.
    """
    level_nodes = []
    registry = {}
    for position, feature in enumerate(all_features):
        node = Node(complete_x, loss, x_size, y_test, errors)
        node.parents = [(feature, position)]
        node.attributes.append((feature, position))
        node.name = node.make_name()
        # The id is the number of nodes registered so far.
        node.key = node.make_key(len(registry))
        registry[node.key] = node
        node.process_slice(loss_type)
        node.score = opt_fun(node.loss, node.size, loss, x_size, w)
        node.c_upper = node.score
        level_nodes.append(node)
        node.print_debug(top_k, 0)
        # Nodes passing the constraint are offered to the top-k container.
        if node.check_constraint(top_k, x_size, alpha):
            top_k.add_new_top_slice(node)
    return level_nodes, registry
Esempio n. 7
0
def process(all_features, complete_x, loss, x_size, y_test, errors, debug,
            alpha, k, w, loss_type, b_update):
    """Enumerate slices level by level and print the resulting top-k.

    The first level comes from ``make_first_level``. Each following level is
    built by pairing nodes of two earlier levels (the original comments call
    this a DPSize approach). The loop stops once a level produces no node
    that passes ``check_bounds``. Progress and the final selection are
    printed; nothing is returned.

    Args:
        all_features: Features used to build the first level.
        complete_x, loss, x_size, y_test, errors: Forwarded to ``Node`` and
            ``make_first_level``; ``loss`` and ``x_size`` are also passed to
            ``opt_fun``.
        debug: When truthy, ``print_debug`` is called for each newly
            registered candidate.
        alpha, w: Forwarded to the bound, constraint and scoring calls.
        k: Passed to ``Topk``.
        loss_type: Forwarded to ``Node.process_slice``.
        b_update: When truthy, bounds are recomputed for a candidate whose
            name is already registered.
    """
    top_k = Topk(k)
    # First level slices are enumerated on concrete data, without bounds.
    first_level = make_first_level(all_features, complete_x, loss, x_size,
                                   y_test, errors, loss_type, w, alpha, top_k)
    all_nodes = first_level[1]
    # Each entry is (nodes of that level, candidate count of that level);
    # storing the first level here lets level 1 pair it with itself.
    levels = [(first_level[0], len(all_features))]
    # cur_lvl - index of the level that is about to be filled
    cur_lvl = 1
    # Seeded with the (nodes, all_nodes) tuple so the loop is entered once.
    survivors = first_level
    # currently for debug
    print(f"Level 1 had {len(all_features)} candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    while len(survivors) > 0:
        survivors = []
        count = 0
        for left in range(cur_lvl // 2 + 1):
            right = cur_lvl - 1 - left
            left_nodes, left_total = levels[left]
            right_nodes, right_total = levels[right]
            for left_node in left_nodes:
                for right_node in right_nodes:
                    # Skip pairs for which check_attributes is truthy.
                    if check_attributes(left_node, right_node):
                        continue
                    candidate = Node(complete_x, loss, x_size, y_test, errors)
                    parent_pool = set(candidate.parents)
                    parent_pool.update((left_node, right_node))
                    candidate.parents = list(parent_pool)
                    candidate.attributes = union(left_node.attributes,
                                                 right_node.attributes)
                    candidate.name = candidate.make_name()
                    candidate.key = candidate.make_key(len(all_nodes))

                    if candidate.key[1] in all_nodes:
                        known = all_nodes[candidate.key[1]]
                        # NOTE(review): parents become a set here and the
                        # new pair is not added -- confirm intent.
                        known.parents = set(known.parents)
                        if b_update:
                            # NOTE(review): bounds are refreshed on
                            # `candidate`, which is not registered anywhere
                            # in this function -- confirm whether `known`
                            # was meant.
                            candidate.update_bounds(
                                candidate.calc_s_upper(cur_lvl),
                                candidate.calc_s_lower(cur_lvl),
                                candidate.calc_e_upper(),
                                candidate.calc_e_max_upper(cur_lvl),
                                w)
                        continue

                    candidate.calc_bounds(cur_lvl, w)
                    all_nodes[candidate.key[1]] = candidate
                    # Concrete data is extracted only when the bounds check
                    # passes.
                    if candidate.check_bounds(top_k, x_size, alpha):
                        candidate.process_slice(loss_type)
                        candidate.score = opt_fun(candidate.loss,
                                                  candidate.size, loss,
                                                  x_size, w)
                        qualifies = (
                            candidate.check_constraint(top_k, x_size, alpha)
                            and candidate.key not in top_k.keys)
                        if qualifies:
                            top_k.add_new_top_slice(candidate)
                        # The candidate feeds the next level whether or not
                        # it entered the top-k.
                        survivors.append(candidate)
                    if debug:
                        candidate.print_debug(top_k, cur_lvl)
            count += left_total * right_total
        print(f"Level {cur_lvl} had {count} candidates but after pruning only "
              f"{len(survivors)} go to the next level")
        cur_lvl = cur_lvl + 1
        levels.append((survivors, count))
        top_k.print_topk()
    print(f"Program stopped at level {cur_lvl}")
    print()
    print("Selected slices are: ")
    top_k.print_topk()
Esempio n. 8
0
def make_first_level(all_features, complete_x, loss, x_size, y_test, errors,
                     loss_type, w, alpha, top_k):
    """Create one node per feature with bounds taken from concrete metrics.

    Every node has its slice processed, its bounds set from its own measured
    values, and its score computed with ``opt_fun``. Nodes with a score above
    1 and a size of at least ``x_size / alpha`` are offered to ``top_k``.

    Args:
        all_features: Iterable of features; each one yields a node.
        complete_x, loss, x_size, y_test, errors: Forwarded to ``Node``;
            ``loss`` and ``x_size`` are also passed to ``opt_fun``.
        loss_type: Forwarded to ``Node.process_slice``.
        w: Forwarded to ``opt_fun``.
        alpha: Divisor of ``x_size`` in the minimum-size condition.
        top_k: Top-slice container; updated for qualifying nodes.

    Returns:
        A tuple ``(first_level, all_nodes)``: the list of created nodes and
        a dict mapping each node's ``key`` to the node.
    """
    registry = {}
    level_nodes = []
    for position, feature in enumerate(all_features):
        node = Node(complete_x, loss, x_size, y_test, errors)
        node.parents = [(feature, position)]
        node.attributes.append((feature, position))
        node.name = node.make_name()
        # The id is the number of nodes registered so far.
        node.key = node.make_key(len(registry))
        registry[node.key] = node
        node.process_slice(loss_type)
        # for first level nodes all bounds are strict as concrete metrics
        node.s_upper = node.size
        node.s_lower = 0
        node.e_upper = node.loss
        node.e_max_upper = node.e_max
        node.score = opt_fun(node.loss, node.size, loss, x_size, w)
        node.c_upper = node.score
        level_nodes.append(node)
        node.print_debug(top_k, 0)
        # constraints for 1st level nodes to be problematic candidates
        is_candidate = node.score > 1 and node.size >= x_size / alpha
        if is_candidate:
            # this method updates top k slices if needed
            top_k.add_new_top_slice(node)
    return level_nodes, registry