Beispiel #1
0
def exhaustive(data, target_class, top_k=5, enable_i=True):
    """Enumerate patterns breadth-first and return the best non-redundant ones.

    Starting from the empty seed, every child produced by
    ``compute_children`` is scored with ``compute_quality_vertical`` and
    recorded in a ``PrioritySet``. A child is queued for further expansion
    only if its returned bitset is non-zero and it was not queued before.
    Progress is reported through ``display_info`` each time the length of the
    dequeued seed (as measured by ``k_length``) increases, and the total
    running time is printed before returning.

    Args:
        data: Collection of sequences to mine.
            NOTE(review): the ``- 1`` applied to the longest sequence length
            suggests the first element of each sequence is its class label
            -- confirm against the data loader.
        target_class: Class forwarded to the counting and quality helpers.
        top_k: Number of patterns requested from
            ``get_top_k_non_redundant``; also forwarded to ``display_info``.
        enable_i: Forwarded unchanged to ``compute_children``.

    Returns:
        The result of ``sorted_patterns.get_top_k_non_redundant(data, top_k)``.
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the left where ``list.pop(0)`` is O(n).
    from collections import deque

    begin = datetime.datetime.utcnow()

    # NOTE(review): 500 is the first positional argument of PrioritySet,
    # presumably its capacity, chosen large so good patterns are not evicted
    # early -- confirm against the PrioritySet definition.
    sorted_patterns = PrioritySet(500)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    items = extract_items(data)

    # The search starts from the empty pattern.
    fifo = deque([[]])

    # Remembers every child ever queued so that none is expanded twice.
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while fifo:
        seed = fifo.popleft()
        children = compute_children(seed, items, enable_i)

        seed_length = k_length(seed)
        if seed_length > stage:
            stage = seed_length
            display_info(stage, compute_count, sorted_patterns, begin, data,
                         top_k)

        for child in children:
            quality, bitset = compute_quality_vertical(data, child,
                                                       target_class,
                                                       bitset_slot_size,
                                                       itemsets_bitsets,
                                                       class_data_count,
                                                       first_zero_mask,
                                                       last_ones_mask)

            sorted_patterns.add_preserve_memory(child, quality, data)

            # One quality evaluation per child. The previous code added
            # len(children) on every child, i.e. len(children) ** 2 per seed.
            compute_count += 1

            # we do not explore elements with a null support
            if child not in fifo_elements and bitset != 0:
                fifo.append(child)
                fifo_elements.add(child)

    print("The algorithm took:{}".format(datetime.datetime.utcnow() - begin))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
Beispiel #2
0
def misere(data,
           target_class,
           time_budget=conf.TIME_BUDGET,
           top_k=conf.TOP_K,
           iterations_limit=conf.ITERATIONS_NUMBER,
           theta=conf.THETA,
           quality_measure=conf.QUALITY_MEASURE):
    """Sample random sub-patterns of random sequences and keep the best ones.

    Until the time budget or the iteration limit is reached, a sequence is
    drawn at random from ``data`` and its first element is dropped. For that
    sequence, ``int(log(count_subsequences_number(sequence)))`` candidates
    are generated by deleting a random number of items, and each candidate is
    scored with ``compute_quality_vertical`` and stored in a ``PrioritySet``.

    Args:
        data: Collection of sequences.
            NOTE(review): the first element of each sequence is skipped
            (``sequence[1:]``), presumably because it holds the class label
            -- confirm.
        target_class: Class forwarded to the counting and quality helpers.
        time_budget: Maximum running time, in seconds.
        top_k: Number of patterns requested from
            ``get_top_k_non_redundant``.
        iterations_limit: Maximum number of quality evaluations.
        theta: Forwarded to ``PrioritySet``.
        quality_measure: Forwarded to ``compute_quality_vertical``.

    Returns:
        The result of ``sorted_patterns.get_top_k_non_redundant(data, top_k)``.
    """
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    sorted_patterns = PrioritySet(theta=theta)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    iterations_count = 0

    while (datetime.datetime.utcnow() - begin < time_budget
           and iterations_count < iterations_limit):
        sequence = copy.deepcopy(random.choice(data))
        sequence = sequence[1:]

        ads = count_subsequences_number(sequence)

        for _ in range(int(math.log(ads))):
            if iterations_count >= iterations_limit:
                break

            # Work on a private copy: items are removed in place below.
            subsequence = copy.deepcopy(sequence)

            # we remove z items randomly
            # NOTE(review): random.randint raises ValueError when the
            # sequence holds a single item (upper bound 0 < lower bound 1).
            seq_items_nb = sum(len(itemset) for itemset in subsequence)
            z = random.randint(1, seq_items_nb - 1)

            for _ in range(z):
                chosen_itemset_i = random.randint(0, len(subsequence) - 1)
                chosen_itemset = subsequence[chosen_itemset_i]

                # random.sample() rejects sets on Python >= 3.11, so draw
                # from a tuple snapshot; this works for sets and lists.
                chosen_itemset.remove(random.choice(tuple(chosen_itemset)))

                # An emptied itemset no longer belongs in the pattern.
                if len(chosen_itemset) == 0:
                    subsequence.pop(chosen_itemset_i)

            quality, _bitset = compute_quality_vertical(
                data,
                subsequence,
                target_class,
                bitset_slot_size,
                itemsets_bitsets,
                class_data_count,
                first_zero_mask,
                last_ones_mask,
                quality_measure=quality_measure)

            iterations_count += 1
            sorted_patterns.add(sequence_mutable_to_immutable(subsequence),
                                quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
Beispiel #3
0
def test_count_target_class_date():
    """Check that ``count_target_class_data`` counts one '+' entry in ``data``.

    ``data`` is the fixture object defined outside this function.
    """
    positives = count_target_class_data(data, '+')
    assert positives == 1
Beispiel #4
0
def seq_scout(data,
              target_class,
              time_budget=conf.TIME_BUDGET,
              top_k=conf.TOP_K,
              enable_i=True,
              vertical=True,
              iterations_limit=conf.ITERATIONS_NUMBER,
              theta=conf.THETA,
              quality_measure=conf.QUALITY_MEASURE):
    """Bandit-style pattern search followed by a local optimisation pass.

    Each sequence returned by ``filter_target_class`` is treated as an arm
    whose UCB score is kept in a ``PrioritySetUCB``. While the time budget
    and the iteration limit allow, the arm popped from that structure is
    played with ``play_arm``, the resulting pattern is stored, and the arm is
    re-inserted with its updated mean quality and play count. The best
    patterns are then refined with ``exploit_arm`` and added to the same
    store before the final top-k selection.

    Side effects: rebinds the module globals ``VERTICAL_RPZ`` and
    ``VERTICAL_TOOLS`` and prints the iteration counter.

    Args:
        data: Collection of sequences.
            NOTE(review): the first element of each sequence is skipped
            (``sequence[1:]``), presumably the class label -- confirm.
        target_class: Class whose sequences serve as arms.
        time_budget: Maximum running time of the bandit loop, in seconds.
        top_k: Number of patterns to keep and to return.
        enable_i: Forwarded to ``exploit_arm``.
        vertical: Value stored in the global ``VERTICAL_RPZ``.
        iterations_limit: Upper bound for the iteration counter ``N``
            (which starts at 1).
        theta: Forwarded to ``PrioritySet``.
        quality_measure: Forwarded to ``play_arm`` and ``exploit_arm``.

    Returns:
        The result of ``sorted_patterns.get_top_k_non_redundant(data, top_k)``.
    """
    global VERTICAL_RPZ
    global VERTICAL_TOOLS

    items = extract_items(data)
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    arms = filter_target_class(data, target_class)
    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    UCB_scores = PrioritySetUCB()
    itemsets_memory = get_itemset_memory(data)

    # The longest sequence length, minus one for the leading element.
    longest = max(data, key=lambda x: len(x))
    bitset_slot_size = len(longest) - 1

    VERTICAL_RPZ = vertical

    VERTICAL_TOOLS = {
        "bitset_slot_size": bitset_slot_size,
        "first_zero_mask": compute_first_zero_mask(len(data),
                                                   bitset_slot_size),
        "last_ones_mask": compute_last_ones_mask(len(data), bitset_slot_size),
        "class_data_count": count_target_class_data(data, target_class),
        "itemsets_bitsets": {}
    }

    N = 1

    # Initialisation: every arm is scored from an infinite quality so that
    # each one gets played once by the main loop. Its play count and mean
    # quality start at 0, which keeps the running mean correct.
    for raw_sequence in arms:
        arm = sequence_mutable_to_immutable(raw_sequence[1:])
        initial_score = UCB(float("inf"), 1, N)
        UCB_scores.add(arm, (initial_score, 0, 0))

    # Main loop, bounded by the time budget and the iteration limit.
    while True:
        if not datetime.datetime.utcnow() - begin < time_budget:
            break
        if not N < iterations_limit:
            break

        # Take the arm with the best UCB.
        _, plays, mean_quality, arm = UCB_scores.pop()

        pattern, quality = play_arm(arm,
                                    data,
                                    target_class,
                                    quality_measure=quality_measure)
        pattern = sequence_mutable_to_immutable(pattern)
        sorted_patterns.add(pattern, quality)

        # Fold the new reward into the arm's running mean and re-insert it.
        new_plays = plays + 1
        new_mean = (plays * mean_quality + quality) / new_plays
        new_score = UCB(new_mean, new_plays, N)
        UCB_scores.add(arm, (new_score, new_plays, new_mean))

        N += 1

    print("seqscout optimized iterations: {}".format(N))

    best_patterns = sorted_patterns.get_top_k_non_redundant(data, top_k)

    # Refine each of the best patterns; entries are indexed as
    # (quality, pattern).
    for entry in best_patterns:
        mutable_pattern = sequence_immutable_to_mutable(entry[1])
        refined_pattern, refined_quality = exploit_arm(
            mutable_pattern,
            entry[0],
            items,
            data,
            itemsets_memory,
            target_class,
            enable_i=enable_i,
            quality_measure=quality_measure)
        refined_pattern = sequence_mutable_to_immutable(refined_pattern)
        sorted_patterns.add(refined_pattern, refined_quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
Beispiel #5
0
def beam_search(data,
                target_class,
                time_budget=conf.TIME_BUDGET,
                enable_i=True,
                top_k=conf.TOP_K,
                beam_width=conf.BEAM_WIDTH,
                iterations_limit=conf.ITERATIONS_NUMBER,
                theta=conf.THETA,
                quality_measure=conf.QUALITY_MEASURE):
    """Level-wise beam search over patterns.

    Starting from the empty pattern, every candidate of the current level is
    expanded with ``compute_children``. Each child is scored with
    ``compute_quality_vertical`` and stored both in the global result set and
    in the beam of the current level. The ``beam_width`` best non-redundant
    patterns of the beam become the candidates of the next level.

    The search stops when the time budget is spent, when
    ``iterations_limit`` quality evaluations have been made, or when a level
    yields no candidate for the next one.

    Args:
        data: Collection of sequences.
            NOTE(review): the ``- 1`` applied to the longest sequence length
            suggests the first element of each sequence is its class label
            -- confirm.
        target_class: Class forwarded to the counting and quality helpers.
        time_budget: Maximum running time, in seconds.
        enable_i: Forwarded to ``compute_children``.
        top_k: Number of patterns to keep and to return.
        beam_width: Number of patterns carried over to the next level.
        iterations_limit: Maximum number of quality evaluations.
        theta: Forwarded to the result ``PrioritySet``.
        quality_measure: Forwarded to ``compute_quality_vertical``.

    Returns:
        The result of ``sorted_patterns.get_top_k_non_redundant(data, top_k)``.
    """
    items = extract_items(data)

    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1

    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    # The first level only contains the empty pattern.
    candidate_queue = [[]]

    sorted_patterns = PrioritySet(top_k, theta=theta)

    nb_iteration = 0
    # An empty candidate queue means the previous level produced nothing to
    # expand. Without this check the loop would keep building empty beams
    # until the time budget ran out.
    while (candidate_queue
           and datetime.datetime.utcnow() - begin < time_budget
           and nb_iteration < iterations_limit):
        beam = PrioritySet()

        for seed in candidate_queue:
            if nb_iteration >= iterations_limit:
                break

            children = compute_children(seed, items, enable_i)

            for child in children:
                if nb_iteration >= iterations_limit:
                    break

                quality, _bitset = compute_quality_vertical(
                    data,
                    child,
                    target_class,
                    bitset_slot_size,
                    itemsets_bitsets,
                    class_data_count,
                    first_zero_mask,
                    last_ones_mask,
                    quality_measure=quality_measure)

                sorted_patterns.add(child, quality)
                beam.add(child, quality)
                nb_iteration += 1

        # Entries are (quality, pattern) pairs; only the patterns seed the
        # next level.
        candidate_queue = [
            pattern
            for _quality, pattern in beam.get_top_k_non_redundant(
                data, beam_width)
        ]

    return sorted_patterns.get_top_k_non_redundant(data, top_k)