Example #1
import datetime

# helpers below (PrioritySet, extract_items, compute_children, ...) are
# assumed to come from the surrounding seqscout code base

def exhaustive(data, target_class, top_k=5, enable_i=True):
    begin = datetime.datetime.utcnow()

    # a bounded priority set avoids memory blow-up as candidate patterns accumulate
    sorted_patterns = PrioritySet(500)

    # slot size = length of the longest sequence, minus one for the leading class label
    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    items = extract_items(data)

    fifo = [[]]

    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)
            display_info(stage, compute_count, sorted_patterns, begin, data, top_k)

        for child in children:
            quality, bitset = compute_quality_vertical(data, child, target_class,
                                                       bitset_slot_size,
                                                       itemsets_bitsets,
                                                       class_data_count,
                                                       first_zero_mask,
                                                       last_ones_mask)

            sorted_patterns.add_preserve_memory(child, quality, data)

            # we do not explore elements with a null support
            if child not in fifo_elements and bitset != 0:
                fifo.append(child)
                fifo_elements.add(child)

        compute_count += len(children)

    print("The algorithm took:{}".format(datetime.datetime.utcnow() - begin))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
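
A minimal usage sketch, assuming each data row starts with its class label (as the code above implies) and that the seqscout helpers are importable; the toy data and names below are illustrative, not from the original code:

# hypothetical toy input: first element of each row is the class label
toy_data = [
    ['+', {'a'}, {'a', 'b'}],
    ['-', {'b'}, {'c'}],
    ['+', {'a'}, {'c'}],
]

# the exact shape of the result depends on get_top_k_non_redundant
top = exhaustive(toy_data, target_class='+', top_k=2)
print(top)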
Example #2
def add_lengths(patterns, dataset_name, data_final, algo):
    for pattern in patterns:
        k_length_p = k_length(pattern[1])
        data_add_generic(data_final,
                         Length=k_length_p,
                         dataset=dataset_name,
                         Algorithm=algo)
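
Judging from pattern[1], each entry of patterns is a (quality, pattern) pair. A hypothetical call under that assumption; the shape of data_final is whatever data_add_generic expects, which this snippet does not show:

# illustrative input only; (quality, pattern) pairs are an assumption
patterns = [(0.07, [{'a'}, {'a', 'b'}]), (0.02, [{'c'}])]
data_final = {}  # placeholder accumulator for data_add_generic
add_lengths(patterns, 'promoters', data_final, 'SeqScout')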
Example #3
import datetime

def exhaustive(DATA, enable_i=True):
    begin = datetime.datetime.utcnow()

    items = extract_items(DATA)
    # we remove the first element of each sequence (the class label), which is useless here
    for i in range(len(DATA)):
        DATA[i] = DATA[i][1:]

    l_max = extract_l_max(DATA)
    fifo = [[]]

    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)

        for child in children:
            # we only expand unseen children whose length stays within l_max
            if k_length(child) <= l_max and child not in fifo_elements:
                fifo.append(child)
                fifo_elements.add(child)

        compute_count += len(children)

    print("The algorithm took:{}".format(datetime.datetime.utcnow() - begin))
    # + 1 accounts for the root (the empty pattern), which is never queued
    print('The size is: {}'.format(len(fifo_elements) + 1))

    return fifo_elements
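
For reference, a plausible reconstruction of the k_length helper these examples rely on, inferred from how it is called (the real implementation lives in the seqscout code base and may differ):

def k_length(sequence):
    # total number of items across all itemsets of the pattern;
    # k_length([]) == 0, matching the empty root seed used above
    return sum(len(itemset) for itemset in sequence)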
Example #4
import seqscout.global_var
# conf (the source of QUALITY_MEASURE) is assumed importable from the same project

def compute_quality_vertical(data,
                             subsequence,
                             target_class,
                             bitset_slot_size,
                             itemsets_bitsets,
                             class_data_count,
                             first_zero_mask,
                             last_ones_mask,
                             quality_measure=conf.QUALITY_MEASURE):
    seqscout.global_var.increase_it_number()
    length = k_length(subsequence)
    bitset = 0

    if length == 0:
        # the empty node is present everywhere
        # we just have to create a vector of ones
        bitset = 2**(len(data) * bitset_slot_size) - 1
    elif length == 1:
        singleton = frozenset(subsequence[0])
        bitset = generate_bitset(singleton, data, bitset_slot_size)
        itemsets_bitsets[singleton] = bitset
    else:
        # general case
        bitset = 2**(len(data) * bitset_slot_size) - 1
        first_iteration = True
        for itemset_i in subsequence:
            itemset = frozenset(itemset_i)

            try:
                itemset_bitset = itemsets_bitsets[itemset]
            except KeyError:
                # the bitset is not in the hashmap, we need to generate it
                itemset_bitset = generate_bitset(itemset, data,
                                                 bitset_slot_size)
                itemsets_bitsets[itemset] = itemset_bitset

            if first_iteration:
                first_iteration = False

                # first itemset: its bitset replaces the all-ones initialisation
                bitset = itemset_bitset
            else:
                bitset = following_ones(bitset, bitset_slot_size,
                                        first_zero_mask)

                bitset &= itemset_bitset

    # now we extract the support and count matching sequences of the target class
    class_pattern_count = 0

    support, bitset_simple = get_support_from_vector(bitset, bitset_slot_size,
                                                     first_zero_mask,
                                                     last_ones_mask)

    # scan the matching sequences and count those of the target class:
    i = bitset_simple.bit_length() - 1

    while i >= 0:
        if bitset_simple >> i & 1:
            index_data = len(data) - i - 1

            if data[index_data][0] == target_class:
                class_pattern_count += 1

        i -= 1

    occurrence_ratio = support / len(data)

    if quality_measure == 'WRAcc':
        # ratio of pattern-matching sequences that have the target class
        try:
            class_pattern_ratio = class_pattern_count / support
        except ZeroDivisionError:
            return -0.25, 0

        class_data_ratio = class_data_count / len(data)
        wracc = occurrence_ratio * (class_pattern_ratio - class_data_ratio)
        return wracc, bitset

    elif quality_measure == 'Informedness':
        tn = len(data) - support - (class_data_count - class_pattern_count)

        tpr = class_pattern_count / class_data_count  # TP / (TP + FN)

        tnr = tn / (len(data) - class_data_count)  # TN / (TN + FP)
        return tnr + tpr - 1, bitset

    elif quality_measure == 'F1':
        try:
            class_pattern_ratio = class_pattern_count / support
        except ZeroDivisionError:
            return 0, 0
        precision = class_pattern_ratio
        recall = class_pattern_count / class_data_count
        try:
            f1 = 2 * precision * recall / (precision + recall)
        except ZeroDivisionError:
            f1 = 0
        return f1, bitset
    else:
        raise ValueError('The quality measure name is not valid')
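
To make the WRAcc branch concrete, a worked computation with invented counts (every number below is illustrative):

# 100 sequences, 40 of the target class; the pattern matches 20
# sequences, 15 of which are in the target class
n, class_data_count = 100, 40
support, class_pattern_count = 20, 15

occurrence_ratio = support / n                       # 0.2
class_pattern_ratio = class_pattern_count / support  # 0.75
class_data_ratio = class_data_count / n              # 0.4
print(occurrence_ratio * (class_pattern_ratio - class_data_ratio))  # ~0.07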
Example #5
import numpy as np

# only the tail of the original datasets list survives in this snippet
datasets = [
            (read_data_kosarak('../data/context.data'), 'context'),
            (read_data_sc2('../data/sequences-TZ-45.txt')[:5000], 'sc2'),
            (read_data_kosarak('../data/skating.data'), 'skating'),
            (read_jmlr('svm', '../data/jmlr/jmlr'), 'jmlr'),
            (read_data_kosarak('../data/figures_rc.dat'), 'RocketLeague')]

for dataset, name in datasets:
    # drop the first element of each sequence (the class label)
    for i in range(len(dataset)):
        dataset[i] = dataset[i][1:]

    k_max = 0
    n_max = 0

    k_lengths = []

    for line in dataset:
        kl = k_length(line)
        k_lengths.append(kl)
        if kl > k_max:
            k_max = kl

        if len(line) > n_max:
            n_max = len(line)

    print('dataset: {}'.format(name))
    print('k_max: {}'.format(k_max))
    print('n_max: {}'.format(n_max))
    print('m: {}'.format(len(extract_items(dataset))))
    print('Variance on lengths: {}'.format(np.var(k_lengths)))
    print('Number of lines: {}'.format(len(dataset)))
    print()
Example #6
import copy

def compute_variations_better_quality(sequence,
                                      items,
                                      data,
                                      itemsets_memory,
                                      target_class,
                                      target_quality,
                                      enable_i=True,
                                      quality_measure=conf.QUALITY_MEASURE):
    '''
    Compute variations of the sequence, stopping at the first one whose quality beats target_quality.
    :param sequence: the pattern to mutate
    :param items: the list of all possible items
    :param data: the dataset
    :param itemsets_memory: itemsets occurring in the data, used to prune i-extensions
    :param target_class: the class the quality measure scores against
    :param target_quality: the quality to beat
    :return: the first better element (sequence, quality), or None if we are on a local optimum
    '''
    variations = []

    if VERTICAL_RPZ:
        bitset_slot_size = VERTICAL_TOOLS['bitset_slot_size']
        itemsets_bitsets = VERTICAL_TOOLS['itemsets_bitsets']
        class_data_count = VERTICAL_TOOLS['class_data_count']
        first_zero_mask = VERTICAL_TOOLS['first_zero_mask']
        last_ones_mask = VERTICAL_TOOLS['last_ones_mask']

    for itemset_i, itemset in enumerate(sequence):
        # i_extension
        if enable_i:
            for item_possible in items:
                new_variation_i_extension = copy.deepcopy(sequence)
                new_variation_i_extension[itemset_i].add(item_possible)

                # we check that the created pattern is present in the data before scoring it
                if is_included(new_variation_i_extension, itemsets_memory):
                    if VERTICAL_RPZ:
                        new_variation_i_quality, new_variation_i_bitset = compute_quality_vertical(
                            data,
                            new_variation_i_extension,
                            target_class,
                            bitset_slot_size,
                            itemsets_bitsets,
                            class_data_count,
                            first_zero_mask,
                            last_ones_mask,
                            quality_measure=quality_measure)
                    else:
                        new_variation_i_quality = compute_quality(
                            data, new_variation_i_extension, target_class)

                    variations.append(
                        (new_variation_i_extension, new_variation_i_quality))

                    if new_variation_i_quality > target_quality:
                        return variations[-1]

        # s_extension
        for item_possible in items:
            new_variation_s_extension = copy.deepcopy(sequence)
            new_variation_s_extension.insert(itemset_i, {item_possible})

            if VERTICAL_RPZ:
                new_variation_s_quality, new_variation_s_bitset = compute_quality_vertical(
                    data,
                    new_variation_s_extension,
                    target_class,
                    bitset_slot_size,
                    itemsets_bitsets,
                    class_data_count,
                    first_zero_mask,
                    last_ones_mask,
                    quality_measure=quality_measure)
            else:
                new_variation_s_quality = compute_quality(
                    data, new_variation_s_extension, target_class)

            variations.append(
                (new_variation_s_extension, new_variation_s_quality))

            if new_variation_s_quality > target_quality:
                return variations[-1]

        for item in itemset:
            new_variation_remove = copy.deepcopy(sequence)

            # removal: drop this item, unless it is the only one left in the pattern

            if k_length(sequence) > 1:
                new_variation_remove[itemset_i].remove(item)

                if len(new_variation_remove[itemset_i]) == 0:
                    new_variation_remove.pop(itemset_i)

                if VERTICAL_RPZ:
                    new_variation_remove_quality, new_variation_remove_bitset = compute_quality_vertical(
                        data,
                        new_variation_remove,
                        target_class,
                        bitset_slot_size,
                        itemsets_bitsets,
                        class_data_count,
                        first_zero_mask,
                        last_ones_mask,
                        quality_measure=quality_measure)
                else:
                    new_variation_remove_quality = compute_quality(
                        data, new_variation_remove, target_class)

                variations.append(
                    (new_variation_remove, new_variation_remove_quality))
                if new_variation_remove_quality > target_quality:
                    return variations[-1]

    # s_extension for last element
    for item_possible in items:
        new_variation_s_extension = copy.deepcopy(sequence)
        new_variation_s_extension.append({item_possible})

        if VERTICAL_RPZ:
            new_variation_s_quality, new_variation_s_bitset = compute_quality_vertical(
                data,
                new_variation_s_extension,
                target_class,
                bitset_slot_size,
                itemsets_bitsets,
                class_data_count,
                first_zero_mask,
                last_ones_mask,
                quality_measure=quality_measure)
        else:
            new_variation_s_quality = compute_quality(
                data, new_variation_s_extension, target_class)

        variations.append((new_variation_s_extension, new_variation_s_quality))
        if new_variation_s_quality > target_quality:
            return variations[-1]

    return None
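
A sketch of the hill-climbing loop that a neighbourhood function like this typically serves; the driver below is hypothetical and relies only on the documented contract (a better (sequence, quality) pair, or None at a local optimum):

def hill_climb(sequence, quality, items, data, itemsets_memory, target_class):
    # hypothetical driver: keep moving to a strictly better neighbour
    # until compute_variations_better_quality signals a local optimum
    while True:
        better = compute_variations_better_quality(
            sequence, items, data, itemsets_memory,
            target_class, target_quality=quality)
        if better is None:
            return sequence, quality
        sequence, quality = better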