Example #1
0
    def expand(self,
               data,
               data_positive,
               target_class,
               quality_measure=conf.QUALITY_MEASURE):
        """Expand this node with one randomly drawn candidate and return the child.

        A candidate is drawn from ``self.candidate_sequences_expand`` and
        removed from it, so it cannot be drawn again from this node. The
        child's sequence is the candidate itself when this node has no intent
        (the root), otherwise the result of ``find_LCS`` between the candidate
        and this node's intent. If a node for that sequence is already
        registered in ``self.node_hashmap`` it is reused and linked to this
        node; otherwise a new ``Node`` is created and registered.

        :param data: forwarded to the ``Node`` constructor
        :param data_positive: forwarded to the ``Node`` constructor
        :param target_class: forwarded to the ``Node`` constructor
        :param quality_measure: forwarded to the ``Node`` constructor
        :return: the child node, either reused or newly created
        :raises ValueError: if no candidate is left to draw
        """
        # tuple(): random.sample() rejects sets since Python 3.11, and the
        # candidate container may be one. An empty container still raises
        # ValueError, as before.
        candidate = random.sample(tuple(self.candidate_sequences_expand), 1)[0]
        self.candidate_sequences_expand.remove(candidate)

        if self.intent is None:
            # for the root node, the child is directly a database sequence
            sequence_children = sequence_mutable_to_immutable(candidate)
        else:
            sequence_children = sequence_mutable_to_immutable(
                find_LCS(candidate, self.intent))

        if sequence_children in self.node_hashmap:
            # The sequence was already reached through another path: reuse
            # the existing node and record the additional parent/child link.
            child = self.node_hashmap[sequence_children]
            child.parents.append(self)
            self.children.append(child)
        else:
            # NOTE(review): no explicit children/parents update here;
            # presumably the Node constructor links itself to the parent it
            # receives - confirm.
            child = Node(sequence_children,
                         self,
                         data,
                         data_positive,
                         target_class,
                         self.node_hashmap,
                         quality_measure=quality_measure)
            self.node_hashmap[sequence_children] = child

        return child
Example #2
0
def test_is_subsequence():
    """Check ``is_subsequence`` on tuple, list, immutable and string-item inputs."""
    # Sequences written as tuples of sets.
    short_seq = ({1, 2}, {2, 3})
    long_seq = ({1, 2, 3}, {2, 4, 3})
    three_itemsets = ({1}, {2}, {2})

    assert not is_subsequence(three_itemsets, short_seq)
    assert is_subsequence(short_seq, long_seq)

    # Sequences written as lists of sets: (candidate, container, expected).
    list_cases = [
        ([{1, 2}, {2, 3}], [{1, 2, 3}, {1}, {2, 4, 3}], True),
        ([{1, 5, 2}, {2, 3}], [{1, 2, 3}, {1}, {2, 4, 3}], False),
        ([{1, 5, 2}, {2, 3}, {5}], [{1, 5, 2}, {2, 4, 3}], False),
    ]
    for candidate, container, expected in list_cases:
        if expected:
            assert is_subsequence(candidate, container)
        else:
            assert not is_subsequence(candidate, container)

    # The relation must hold one way only.
    pair = [{1}, {2}]
    wider = [{1, 2, 3}, {1}, {2, 4, 3}]
    assert is_subsequence(pair, wider)
    assert not is_subsequence(wider, pair)

    # Same pair after conversion to the immutable representation.
    frozen_pair = sequence_mutable_to_immutable(pair)
    frozen_wider = sequence_mutable_to_immutable(wider)
    assert is_subsequence(frozen_pair, frozen_wider)

    # Items given as strings instead of integers.
    text_pair = [{'1'}, {'2'}]
    text_wider = [{'1', '2', '3'}, {'1'}, {'2', '4', '3'}]
    assert is_subsequence(text_pair, text_wider)
Example #3
0
def misere(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K, iterations_limit=conf.ITERATIONS_NUMBER,
           theta=conf.THETA, quality_measure=conf.QUALITY_MEASURE):
    """Sample random subsequences of the data and return the best-scoring ones.

    Until the time budget or the iteration limit is exhausted, a random row of
    ``data`` is picked, its first element is dropped, and several random
    generalisations of the remaining sequence are produced by deleting a
    random number of items. Each one is scored with
    ``compute_quality_vertical`` and stored in a ``PrioritySet``.

    :param data: list of rows; the first element of each row is skipped
        (presumably the class label - confirm against the data loader), the
        rest are itemsets (sets of items)
    :param target_class: class forwarded to the quality computation
    :param time_budget: maximum running time, in seconds
    :param top_k: number of patterns to return
    :param iterations_limit: maximum number of scored subsequences
    :param theta: forwarded to ``PrioritySet``
    :param quality_measure: forwarded to ``compute_quality_vertical``
    :return: result of ``PrioritySet.get_top_k_non_redundant(data, top_k)``
    """
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    sorted_patterns = PrioritySet(theta=theta)

    # Longest row, minus the first element that is not part of the sequence.
    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    iterations_count = 0

    while datetime.datetime.utcnow() - begin < time_budget and iterations_count < iterations_limit:
        sequence = copy.deepcopy(random.choice(data))
        sequence = sequence[1:]

        # The item count is the same for every draw from this sequence, so
        # it is computed once here instead of once per draw.
        seq_items_nb = sum(len(itemset) for itemset in sequence)

        if seq_items_nb < 2:
            # At least one item must be removed and at least one must stay;
            # random.randint(1, seq_items_nb - 1) would raise otherwise.
            continue

        ads = count_subsequences_number(sequence)

        for _draw in range(int(math.log(ads))):
            if iterations_count >= iterations_limit:
                break

            subsequence = copy.deepcopy(sequence)

            # we remove z items randomly
            z = random.randint(1, seq_items_nb - 1)

            for _ in range(z):
                chosen_itemset_i = random.randint(0, len(subsequence) - 1)
                chosen_itemset = subsequence[chosen_itemset_i]

                # tuple(): random.sample() rejects sets since Python 3.11.
                chosen_itemset.remove(
                    random.sample(tuple(chosen_itemset), 1)[0])

                # An emptied itemset no longer belongs to the sequence.
                if not chosen_itemset:
                    subsequence.pop(chosen_itemset_i)

            quality, _ = compute_quality_vertical(data, subsequence, target_class,
                                                bitset_slot_size,
                                                itemsets_bitsets, class_data_count,
                                                first_zero_mask, last_ones_mask, quality_measure=quality_measure)

            iterations_count += 1
            sorted_patterns.add(sequence_mutable_to_immutable(subsequence),
                                quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
Example #4
0
def launch_mcts(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K, theta=conf.THETA,
                iterations_limit=conf.ITERATIONS_NUMBER, quality_measure=conf.QUALITY_MEASURE):
    """Run the select / expand / roll-out / update loop and return the best patterns.

    The loop stops when the time budget is exceeded, when the iteration limit
    is reached, or when ``select`` reports ``'finished'``. The intent of every
    expanded node and the sequence returned by every roll-out are stored in a
    ``PrioritySet``.

    :param data: dataset; sequences are filtered with ``filter_empty_sequences``
        after the positive subset has been extracted
    :param target_class: class used to build the positive subset and to score
    :param time_budget: maximum running time, in seconds
    :param top_k: number of patterns to keep and return
    :param theta: forwarded to ``PrioritySet``
    :param iterations_limit: maximum number of iterations
    :param quality_measure: forwarded to the nodes and to ``roll_out``
    :return: result of ``PrioritySet.get_top_k_non_redundant(data, top_k)``
    """
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    data_positive = filter_positive(data, target_class)
    data = filter_empty_sequences(data)

    node_hashmap = {}
    # The root gets the caller's quality measure, like every node created by
    # Node.expand; previously it silently fell back to the default one.
    root_node = Node(None, None, data, data_positive, target_class, node_hashmap,
                     quality_measure=quality_measure)
    # The root is registered under the plain string '.' (the former ('.')
    # was the same string, not a tuple).
    node_hashmap['.'] = root_node

    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    iteration_count = 0

    while datetime.datetime.utcnow() - begin <= time_budget and iteration_count < iterations_limit:
        node_sel = select(root_node)

        # select() returns this sentinel when there is nothing left to pick.
        if node_sel == 'finished':
            print('Finished')
            break

        node_expand = node_sel.expand(data, data_positive, target_class, quality_measure=quality_measure)
        sorted_patterns.add(sequence_mutable_to_immutable(node_expand.intent), node_expand.quality)

        sequence_reward, reward = roll_out(node_expand, data, target_class, quality_measure=quality_measure)

        sorted_patterns.add(sequence_mutable_to_immutable(sequence_reward), reward)

        update(node_expand, reward)
        iteration_count += 1

    print('Number iteration mcts: {}'.format(iteration_count))
    return sorted_patterns.get_top_k_non_redundant(data, top_k)
Example #5
0
def test_all_lcs():
    """Check ``find_LCS`` called with ``all=True`` on two pairs of sequences."""
    # First pair: a known common subsequence must be among the results.
    left = [{'a', 'b'}, {'e'}, {'c'}]
    right = [{'a'}, {'d'}, {'a', 'b'}, {'f'}, {'e'}]

    found = find_LCS(left, right, all=True)
    expected_member = sequence_mutable_to_immutable([{'a', 'b'}, {'e'}])
    assert expected_member in found

    # Second pair: exactly three results are expected.
    left = [{'a'}, {'a', 'b'}, {'e'}, {'c'}, {'b', 'd'}]
    right = [{'a'}, {'b', 'c', 'd'}, {'a', 'd'}]

    found = find_LCS(left, right, all=True)
    assert len(found) == 3
Example #6
0
def jaccard_measure_misere(sequence1, sequence2, data):
    """Return the Jaccard ratio of the rows matched by two sequences.

    For every row of ``data`` the first element is skipped and the remainder
    is converted with ``sequence_mutable_to_immutable``. The result is the
    number of rows in which both sequences are subsequences, divided by the
    number of rows in which at least one of them is.

    :param sequence1: first sequence to look for
    :param sequence2: second sequence to look for
    :param data: iterable of rows whose first element is ignored
    :return: the ratio, or ``0`` when no row contains either sequence
    """
    matched_both = 0
    matched_any = 0

    for row in data:
        candidate = sequence_mutable_to_immutable(row[1:])
        has_first = bool(is_subsequence(sequence1, candidate))
        has_second = bool(is_subsequence(sequence2, candidate))

        if has_first or has_second:
            matched_any += 1
        if has_first and has_second:
            matched_both += 1

    # No row matched either sequence: the ratio is defined as 0.
    if matched_any == 0:
        return 0
    return matched_both / matched_any
Example #7
0
def optimize_pattern(patterns,
                     items,
                     data,
                     itemsets_memory,
                     target_class,
                     top_k,
                     sorted_patterns,
                     enable_i=True,
                     quality_measure=conf.QUALITY_MEASURE):
    """Run ``exploit_arm`` on every pattern and return the best results.

    :param patterns: iterable of indexable entries; element 1 is the
        immutable sequence and element 0 is handed to ``exploit_arm`` as its
        second argument (presumably the pattern's current quality - confirm)
    :param items: forwarded to ``exploit_arm``
    :param data: forwarded to ``exploit_arm`` and to the final top-k selection
    :param itemsets_memory: forwarded to ``exploit_arm``
    :param target_class: forwarded to ``exploit_arm``
    :param top_k: number of patterns requested from ``sorted_patterns``
    :param sorted_patterns: collection receiving every optimised pattern
    :param enable_i: forwarded to ``exploit_arm``
    :param quality_measure: forwarded to ``exploit_arm``
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    for entry in patterns:
        # exploit_arm works on the mutable representation.
        start_sequence = sequence_immutable_to_mutable(entry[1])
        improved, improved_quality = exploit_arm(start_sequence,
                                                 entry[0],
                                                 items,
                                                 data,
                                                 itemsets_memory,
                                                 target_class,
                                                 enable_i=enable_i,
                                                 quality_measure=quality_measure)
        # Stored back in the immutable representation.
        sorted_patterns.add(sequence_mutable_to_immutable(improved),
                            improved_quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
Example #8
0
def seq_scout(data,
              target_class,
              time_budget=conf.TIME_BUDGET,
              top_k=conf.TOP_K,
              enable_i=True,
              vertical=True,
              iterations_limit=conf.ITERATIONS_NUMBER,
              theta=conf.THETA,
              quality_measure=conf.QUALITY_MEASURE):
    """Explore the sequences of the target class as bandit arms ranked by UCB.

    Every sequence returned by ``filter_target_class`` becomes an arm. While
    the time budget and the iteration limit allow, the arm popped from the
    UCB priority set is played with ``play_arm``, the resulting pattern is
    stored, and the arm is re-inserted with its updated mean quality and
    score. The module globals ``VERTICAL_RPZ`` and ``VERTICAL_TOOLS`` are
    overwritten on every call.

    :param data: list of rows; the first element of each row is dropped
        before the row is used as an arm
    :param target_class: class whose sequences are used as arms
    :param time_budget: maximum running time, in seconds
    :param top_k: number of patterns to keep and return
    :param enable_i: not read in the visible part of this function
    :param vertical: value stored in the global ``VERTICAL_RPZ``
    :param iterations_limit: upper bound on the play counter
    :param theta: forwarded to ``PrioritySet``
    :param quality_measure: forwarded to ``play_arm``
    :return: ``PrioritySet.get_top_k_non_redundant(data, top_k)``
    """
    global VERTICAL_RPZ, VERTICAL_TOOLS

    # NOTE(review): `items` (and `itemsets_memory` below) are not read again
    # in the visible part of this function.
    items = extract_items(data)
    begin = datetime.datetime.utcnow()
    budget = datetime.timedelta(seconds=time_budget)

    data_target_class = filter_target_class(data, target_class)
    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    arms = PrioritySetUCB()
    itemsets_memory = get_itemset_memory(data)

    # Longest row without its first element (the class).
    longest_row = max(data, key=len)
    bitset_slot_size = len(longest_row) - 1

    VERTICAL_RPZ = vertical

    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    VERTICAL_TOOLS = {
        "bitset_slot_size": bitset_slot_size,
        "first_zero_mask": first_zero_mask,
        "last_ones_mask": last_ones_mask,
        "class_data_count": class_data_count,
        "itemsets_bitsets": {}
    }

    play_counter = 1

    # Initialisation: every arm enters with the score of an infinite quality
    # so that each one is played once by the main loop. Its play count and
    # mean quality start at 0, which keeps the running mean correct.
    for row in data_target_class:
        arm = sequence_mutable_to_immutable(row[1:])
        arms.add(arm, (UCB(float("inf"), 1, play_counter), 0, 0))

    while (datetime.datetime.utcnow() - begin < budget
           and play_counter < iterations_limit):
        # Arm with the best UCB score.
        _, plays, mean_quality, arm = arms.pop()

        pattern, quality = play_arm(arm,
                                    data,
                                    target_class,
                                    quality_measure=quality_measure)
        sorted_patterns.add(sequence_mutable_to_immutable(pattern), quality)

        # Incremental mean over the plays of this arm, then a fresh score.
        plays_after = plays + 1
        new_mean = (plays * mean_quality + quality) / plays_after
        new_score = UCB(new_mean, plays_after, play_counter)
        arms.add(arm, (new_score, plays_after, new_mean))

        play_counter += 1

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
    '''