Example #1
def compute_dataset_size_raissy(DATA):
    ITEMS = extract_items(DATA)
    m = len(ITEMS)

    l_max = extract_l_max(DATA)

    # approximate the size with the first 50 terms of the series
    w_k = 0

    for i in range(50):
        local = combination(l_max, m * i) / (2**(i + 1))
        w_k += local

    return w_k
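
The helpers extract_items, extract_l_max, and combination are not shown in these snippets. A minimal sketch of one plausible reading, assuming sequences are lists of itemsets and combination(k, n) means "n choose k" (the argument order is an assumption):

import math

def extract_items(data):
    # the set of distinct items appearing in any itemset of any sequence
    return {item for sequence in data for itemset in sequence for item in itemset}

def extract_l_max(data):
    # the number of itemsets in the longest sequence
    return max(len(sequence) for sequence in data)

def combination(k, n):
    # binomial coefficient "n choose k"; argument order assumed from the calls above
    return math.comb(n, k)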
Example #2
def exhaustive(data, target_class, top_k=5, enable_i=True):
    begin = datetime.datetime.utcnow()

    # by storing a large number of elements, we avoid discarding patterns we may still need
    sorted_patterns = PrioritySet(500)

    # slots per sequence: longest sequence length, minus one for the class label
    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    items = extract_items(data)

    fifo = [[]]

    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)
            display_info(stage, compute_count, sorted_patterns, begin, data, top_k)

        for child in children:
            quality, bitset = compute_quality_vertical(data, child, target_class,
                                                       bitset_slot_size,
                                                       itemsets_bitsets,
                                                       class_data_count,
                                                       first_zero_mask,
                                                       last_ones_mask)


            sorted_patterns.add_preserve_memory(child, quality, data)

            # we do not explore elements with zero support
            if child not in fifo_elements and bitset != 0:
                fifo.append(child)
                fifo_elements.add(child)

            compute_count += 1

    print("The algorithm took:{}".format(datetime.datetime.utcnow() - begin))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
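
PrioritySet is project-specific and not shown. A simplified, heap-backed sketch of the interface these examples rely on; the real class also filters out redundant patterns (controlled by theta), which this sketch skips:

import heapq
import itertools

class PrioritySet:
    def __init__(self, k=None, theta=None):
        self.k = k            # optional capacity bound
        self.theta = theta    # similarity threshold used by the real class
        self.heap = []        # min-heap of (quality, tiebreak, pattern)
        self.seen = set()
        self.counter = itertools.count()  # tie-break so patterns are never compared

    def add(self, pattern, quality):
        if pattern in self.seen:
            return
        self.seen.add(pattern)
        heapq.heappush(self.heap, (quality, next(self.counter), pattern))
        if self.k is not None and len(self.heap) > self.k:
            _, _, evicted = heapq.heappop(self.heap)  # evict the worst pattern
            self.seen.discard(evicted)

    def add_preserve_memory(self, pattern, quality, data):
        # the real method also bounds memory usage; this sketch just delegates
        self.add(pattern, quality)

    def get_top_k_non_redundant(self, data, k):
        # the real method filters near-duplicates; this sketch simply returns
        # the k best patterns as (quality, pattern) pairs
        best = heapq.nlargest(k, self.heap)
        return [(quality, pattern) for quality, _, pattern in best]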
Example #3
def compute_dataset_size_raissy2(DATA):
    ITEMS = extract_items(DATA)
    m = len(ITEMS)

    l_max = extract_l_max(DATA)

    memo = {0: 1}  # base case: the root pattern

    somme = w_k(memo, l_max, m)

    # add every memoised term to the total
    somme += sum(memo.values())

    return somme
Example #4
def compute_dataset_size(DATA):
    ITEMS = extract_items(DATA)
    m = len(ITEMS)
    stages = {}

    l_max = extract_l_max(DATA)

    pattern_number = 1  # we count the root

    for l in range(l_max + 1):
        stage_count = 0
        for k in range(l):
            decompositions = decompose(k)

            for decomposition in decompositions:
                # skip the impossible case with more sets of balls to share than bags
                if len(decomposition) <= l - k:
                    first_element = m**(l - k - len(decomposition))

                    histo = decomposition_histogram(decomposition)
                    histo_factorial_product = 1
                    for _, unique_factor in histo.items():
                        histo_factorial_product *= factorial(unique_factor)

                    # number of ways to place the parts among the l - k positions
                    second_element = factorial(l - k) / (
                        factorial(l - k - len(decomposition)) *
                        histo_factorial_product)

                    for elt in decomposition:
                        second_element *= combination(elt + 1, m)
                    stage_pattern = first_element * second_element
                    pattern_number += stage_pattern
                    stage_count += stage_pattern

        stages[l] = stage_count

    return pattern_number, stages
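
decompose and decomposition_histogram are not shown. Reading the formula above as a stars-and-bars count, a plausible interpretation is that decompose(k) enumerates the integer partitions of k and decomposition_histogram counts the multiplicity of each part; a sketch under that assumption:

from collections import Counter

def decompose(k, max_part=None):
    # enumerate the integer partitions of k as lists of parts (assumed
    # semantics), e.g. decompose(3) -> [3], [2, 1], [1, 1, 1]
    if max_part is None:
        max_part = k
    if k == 0:
        return [[]]
    partitions = []
    for part in range(min(k, max_part), 0, -1):
        for rest in decompose(k - part, part):
            partitions.append([part] + rest)
    return partitions

def decomposition_histogram(decomposition):
    # multiplicity of each distinct part, e.g. [2, 1, 1] -> {2: 1, 1: 2}
    return dict(Counter(decomposition))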
Example #5
def exhaustive(DATA, enable_i=True):
    begin = datetime.datetime.utcnow()

    items = extract_items(DATA)
    # we remove the first element of each sequence (the class label), which is useless here
    for i in range(len(DATA)):
        DATA[i] = DATA[i][1:]

    l_max = extract_l_max(DATA)
    fifo = [[]]

    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)

        for child in children:
            # we do not explore elements longer than l_max or already queued
            if k_length(child) <= l_max and child not in fifo_elements:
                fifo.append(child)
                fifo_elements.add(child)

            compute_count += 1

    print("The algorithm took:{}".format(datetime.datetime.utcnow() - begin))
    # we add the root
    print('The size is: {}'.format(len(fifo_elements) + 1))

    return fifo_elements
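
k_length and compute_children are not shown either. One plausible reading, assuming patterns are tuples of frozensets (so they can be stored in the fifo_elements set), with s-extensions appending a new singleton itemset and i-extensions growing the last itemset when enable_i is set:

def k_length(sequence):
    # total number of items across all itemsets (assumed definition)
    return sum(len(itemset) for itemset in sequence)

def compute_children(seed, items, enable_i=True):
    # generate all direct extensions of seed as hashable tuples of frozensets
    seed = tuple(seed)
    children = set()
    for item in items:
        # s-extension: append a new singleton itemset
        children.add(seed + (frozenset([item]),))
        # i-extension: add the item to the last itemset
        if enable_i and seed and item not in seed[-1]:
            children.add(seed[:-1] + (seed[-1] | frozenset([item]),))
    return children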
Example #6
            (read_data_kosarak('../data/blocks.data'), 'blocks'),
            (read_data_kosarak('../data/context.data'), 'context'),
            (read_data_sc2('../data/sequences-TZ-45.txt')[:5000], 'sc2'),
            (read_data_kosarak('../data/skating.data'), 'skating'),
            (read_jmlr('svm', '../data/jmlr/jmlr'), 'jmlr')]

for dataset, name in datasets:
    # drop the leading class label from each sequence
    for i in range(len(dataset)):
        dataset[i] = dataset[i][1:]

    k_max = 0
    n_max = 0

    k_lengths = []

    for line in dataset:
        line_k_length = k_length(line)
        k_lengths.append(line_k_length)
        k_max = max(k_max, line_k_length)
        n_max = max(n_max, len(line))

    print('dataset: {}'.format(name))
    print('k_max: {}'.format(k_max))
    print('n_max: {}'.format(n_max))
    print('m: {}'.format(len(extract_items(dataset))))
    print('Variance on lengths: {}'.format(np.var(k_lengths)))
    print('Number of lines: {}'.format(len(dataset)))
    print(" ")
Example #7
def seq_scout_api(dataset=conf.DATA,
                  time_budget=conf.TIME_BUDGET,
                  top_k=conf.TOP_K):
    '''
    Launch seq_scout.
    This function exists for the user's convenience, so that they do not need
    to specify the number of iterations, which is only there for experiments.
    '''

    if dataset == 'splice':
        data = read_data(
            pathlib.Path(__file__).parent.parent / 'data/splice.data')
        target_class = 'EI'
        enable_i = False
    elif dataset == 'aslbu':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/aslbu.data')
        target_class = '195'
        enable_i = False
    elif dataset == 'blocks':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/blocks.data')
        target_class = '7'
        enable_i = False
    elif dataset == 'context':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/context.data')
        target_class = '4'
        enable_i = False
    elif dataset == 'sc2':
        data = read_data_sc2(
            pathlib.Path(__file__).parent.parent /
            'data/sequences-TZ-45.txt')[:5000]
        target_class = '1'
        enable_i = True
    elif dataset == 'skating':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/skating.data')
        target_class = '1'
        enable_i = False
    elif dataset == 'jmlr':
        data = read_jmlr(
            'svm',
            pathlib.Path(__file__).parent.parent / 'data/jmlr/jmlr')
        target_class = '+'
        enable_i = False
    else:
        data = read_data(
            pathlib.Path(__file__).parent.parent / 'data/promoters.data')
        target_class = '+'
        enable_i = False

    class_present = False
    for sequence in data:
        if target_class == sequence[0]:
            class_present = True
            break

    if not class_present:
        raise ValueError('The target class does not appear in data')

    items = extract_items(data)
    items, items_to_encoding, encoding_to_items = encode_items(items)
    data = encode_data(data, items_to_encoding)

    results = seq_scout(data,
                        target_class,
                        top_k=top_k,
                        vertical=False,
                        time_budget=time_budget,
                        iterations_limit=10_000_000_000_000,  # effectively no limit
                        enable_i=enable_i)

    print_results_decode(results, encoding_to_items)
    return results
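
A hypothetical invocation, using illustrative values; since get_top_k_non_redundant returns (quality, pattern) pairs, the results can be iterated directly:

# run SeqScout on the sc2 dataset with a 60-second budget (illustrative values)
results = seq_scout_api(dataset='sc2', time_budget=60, top_k=10)

for quality, pattern in results:
    print(quality, pattern)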
Example #8
def seq_scout(data,
              target_class,
              time_budget=conf.TIME_BUDGET,
              top_k=conf.TOP_K,
              enable_i=True,
              vertical=True,
              iterations_limit=conf.ITERATIONS_NUMBER,
              theta=conf.THETA,
              quality_measure=conf.QUALITY_MEASURE):
    items = extract_items(data)
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    data_target_class = filter_target_class(data, target_class)
    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    UCB_scores = PrioritySetUCB()
    itemsets_memory = get_itemset_memory(data)

    # slots per sequence: longest sequence length, minus one for the class label
    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1

    global VERTICAL_RPZ
    VERTICAL_RPZ = vertical

    global VERTICAL_TOOLS
    VERTICAL_TOOLS = {
        "bitset_slot_size": bitset_slot_size,
        "first_zero_mask": compute_first_zero_mask(len(data),
                                                   bitset_slot_size),
        "last_ones_mask": compute_last_ones_mask(len(data), bitset_slot_size),
        "class_data_count": count_target_class_data(data, target_class),
        "itemsets_bitsets": {}
    }

    N = 1

    # init: add every sequence with an infinite UCB score so that each one is
    # played once in the main loop. Storing Ni = 0 ensures the running mean of
    # the quality is computed correctly on the first update.
    for sequence in data_target_class:
        sequence_i = sequence_mutable_to_immutable(sequence[1:])
        UCB_score = UCB(float("inf"), 1, N)
        UCB_scores.add(sequence_i, (UCB_score, 0, 0))

    # play with time budget
    while (datetime.datetime.utcnow() - begin < time_budget
           and N < iterations_limit):
        # we take the best UCB
        _, Ni, mean_quality, sequence = UCB_scores.pop()

        pattern, quality = play_arm(sequence,
                                    data,
                                    target_class,
                                    quality_measure=quality_measure)
        pattern = sequence_mutable_to_immutable(pattern)
        sorted_patterns.add(pattern, quality)

        # we update scores
        updated_quality = (Ni * mean_quality + quality) / (Ni + 1)
        UCB_score = UCB(updated_quality, Ni + 1, N)
        UCB_scores.add(sequence, (UCB_score, Ni + 1, updated_quality))

        N += 1

    print("seqscout optimized iterations: {}".format(N))

    best_patterns = sorted_patterns.get_top_k_non_redundant(data, top_k)

    for pattern in best_patterns:
        pattern_mutable = sequence_immutable_to_mutable(pattern[1])
        optimized_pattern, optimized_quality = exploit_arm(
            pattern_mutable,
            pattern[0],
            items,
            data,
            itemsets_memory,
            target_class,
            enable_i=enable_i,
            quality_measure=quality_measure)
        optimized_pattern = sequence_mutable_to_immutable(optimized_pattern)
        sorted_patterns.add(optimized_pattern, optimized_quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
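
The UCB helper is not shown, but the calls above match the standard UCB1 shape (mean quality, plays of the arm, total plays), so a plausible sketch is the classic UCB1 score:

import math

def UCB(mean_quality, Ni, N):
    # classic UCB1 (assumed form): exploitation term plus exploration bonus
    if Ni == 0:
        return float("inf")
    return mean_quality + math.sqrt(2 * math.log(N) / Ni)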
Example #9
def beam_search(data,
                target_class,
                time_budget=conf.TIME_BUDGET,
                enable_i=True,
                top_k=conf.TOP_K,
                beam_width=conf.BEAM_WIDTH,
                iterations_limit=conf.ITERATIONS_NUMBER,
                theta=conf.THETA,
                quality_measure=conf.QUALITY_MEASURE):

    items = extract_items(data)

    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1

    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    # candidate_queue = items_to_sequences(items)
    candidate_queue = [[]]

    sorted_patterns = PrioritySet(top_k, theta=theta)

    nb_iteration = 0
    while (datetime.datetime.utcnow() - begin < time_budget
           and nb_iteration < iterations_limit):
        beam = PrioritySet()

        while (len(candidate_queue) != 0) and nb_iteration < iterations_limit:
            seed = candidate_queue.pop(0)
            children = compute_children(seed, items, enable_i)

            for child in children:
                if nb_iteration >= iterations_limit:
                    break

                quality, _ = compute_quality_vertical(
                    data,
                    child,
                    target_class,
                    bitset_slot_size,
                    itemsets_bitsets,
                    class_data_count,
                    first_zero_mask,
                    last_ones_mask,
                    quality_measure=quality_measure)

                # sorted_patterns.add_preserve_memory(child, quality, data)
                sorted_patterns.add(child, quality)
                beam.add(child, quality)
                nb_iteration += 1

        candidate_queue = [
            j for i, j in beam.get_top_k_non_redundant(data, beam_width)
        ]

    # print("Number iterations beam search: {}".format(nb_iteration))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
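
A hypothetical call, reusing data loaded and encoded as in Example #7; keeping only the beam_width best non-redundant patterns per level is what bounds the branching compared to the exhaustive search of Example #2:

# illustrative values; data and target_class come from the loading step
top_patterns = beam_search(data, target_class='+', top_k=10, beam_width=50)

for quality, pattern in top_patterns:
    print(quality, pattern)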