def exhaustive(data, target_class, top_k=5, enable_i=True):
    begin = datetime.datetime.utcnow()

    # we keep a large capacity (500 patterns) to avoid repeatedly evicting and re-adding elements
    sorted_patterns = PrioritySet(500)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    items = extract_items(data)

    fifo = [[]]
    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)
            display_info(stage, compute_count, sorted_patterns, begin, data, top_k)

        for child in children:
            quality, bitset = compute_quality_vertical(data, child, target_class,
                                                       bitset_slot_size,
                                                       itemsets_bitsets,
                                                       class_data_count,
                                                       first_zero_mask,
                                                       last_ones_mask)

            sorted_patterns.add_preserve_memory(child, quality, data)

            # we do not explore elements with a null support
            if child not in fifo_elements and bitset != 0:
                fifo.append(child)
                fifo_elements.add(child)

        compute_count += len(children)

    print("The algorithm took: {}".format(datetime.datetime.utcnow() - begin))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
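# Minimal usage sketch for exhaustive() above. Assumptions (not guaranteed by this
# file): read_data_kosarak is the loader used elsewhere in this repository, the
# file path and the '+' class label are illustrative, and get_top_k_non_redundant
# returns (quality, sequence)-style pairs.
#
#   data = read_data_kosarak('../data/promoters.data')
#   top_patterns = exhaustive(data, '+', top_k=5, enable_i=True)
#   for entry in top_patterns:
#       print(entry)
#
# Exhaustive enumeration only stays tractable on small datasets: every child
# pattern with non-null support is pushed into the FIFO and explored.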
def add_lengths(patterns, dataset_name, data_final, algo):
    for pattern in patterns:
        k_length_p = k_length(pattern[1])
        data_add_generic(data_final, Length=k_length_p, dataset=dataset_name,
                         Algorithm=algo)
def exhaustive(DATA, enable_i=True):
    begin = datetime.datetime.utcnow()
    items = extract_items(DATA)

    # we remove the first element of each sequence, which is useless here
    for i in range(len(DATA)):
        DATA[i] = DATA[i][1:]

    l_max = extract_l_max(DATA)

    fifo = [[]]
    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)

        for child in children:
            # we only explore elements that are not too long and not already seen
            if k_length(child) <= l_max and child not in fifo_elements:
                fifo.append(child)
                fifo_elements.add(child)

        compute_count += len(children)

    print("The algorithm took: {}".format(datetime.datetime.utcnow() - begin))

    # we add the root
    print('The size is: {}'.format(len(fifo_elements) + 1))

    return fifo_elements
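# Usage sketch for the counting variant of exhaustive() above (the loader and the
# path are illustrative assumptions borrowed from this repository's other scripts):
#
#   DATA = read_data_kosarak('../data/context.data')
#   explored = exhaustive(DATA, enable_i=True)  # set of all enumerated patterns
#   print(len(explored) + 1)                    # + 1 accounts for the empty root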
def compute_quality_vertical(data, subsequence, target_class, bitset_slot_size,
                             itemsets_bitsets, class_data_count, first_zero_mask,
                             last_ones_mask, quality_measure=conf.QUALITY_MEASURE):
    seqscout.global_var.increase_it_number()

    length = k_length(subsequence)
    bitset = 0

    if length == 0:
        # the empty node is present everywhere:
        # we just have to create a vector of ones
        bitset = 2 ** (len(data) * bitset_slot_size) - 1
    elif length == 1:
        singleton = frozenset(subsequence[0])
        bitset = generate_bitset(singleton, data, bitset_slot_size)
        itemsets_bitsets[singleton] = bitset
    else:
        # general case
        bitset = 2 ** (len(data) * bitset_slot_size) - 1
        first_iteration = True

        for itemset_i in subsequence:
            itemset = frozenset(itemset_i)

            try:
                itemset_bitset = itemsets_bitsets[itemset]
            except KeyError:
                # the bitset is not in the hashmap, we need to generate it
                itemset_bitset = generate_bitset(itemset, data, bitset_slot_size)
                itemsets_bitsets[itemset] = itemset_bitset

            if first_iteration:
                first_iteration = False
                bitset = itemset_bitset
            else:
                bitset = following_ones(bitset, bitset_slot_size, first_zero_mask)
                bitset &= itemset_bitset

    # now we just need to extract the support and class_pattern_count
    class_pattern_count = 0

    support, bitset_simple = get_support_from_vector(bitset, bitset_slot_size,
                                                     first_zero_mask, last_ones_mask)

    # count how many supporting sequences belong to the target class
    i = bitset_simple.bit_length() - 1
    while i >= 0:
        if bitset_simple >> i & 1:
            index_data = len(data) - i - 1
            if data[index_data][0] == target_class:
                class_pattern_count += 1
        i -= 1

    occurency_ratio = support / len(data)

    if quality_measure == 'WRAcc':
        # ratio of supporting sequences that have the right target_class
        try:
            class_pattern_ratio = class_pattern_count / support
        except ZeroDivisionError:
            return -0.25, 0
        class_data_ratio = class_data_count / len(data)

        wracc = occurency_ratio * (class_pattern_ratio - class_data_ratio)
        return wracc, bitset
    elif quality_measure == 'Informedness':
        tn = len(data) - support - (class_data_count - class_pattern_count)
        tpr = class_pattern_count / (class_pattern_count + (class_data_count - class_pattern_count))
        tnr = tn / (class_pattern_count + tn)
        return tnr + tpr - 1, bitset
    elif quality_measure == 'F1':
        try:
            class_pattern_ratio = class_pattern_count / support
        except ZeroDivisionError:
            return 0, 0
        precision = class_pattern_ratio
        recall = class_pattern_count / class_data_count

        try:
            f1 = 2 * precision * recall / (precision + recall)
        except ZeroDivisionError:
            f1 = 0

        return f1, bitset
    else:
        raise ValueError('The quality measure name is not valid')
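# Worked WRAcc example for compute_quality_vertical() above (illustrative numbers,
# not taken from any dataset):
#   |D| = 100 sequences, support = 20, class_data_count = 30, class_pattern_count = 10
#   occurency_ratio     = support / |D|                  = 20 / 100 = 0.2
#   class_pattern_ratio = class_pattern_count / support  = 10 / 20  = 0.5
#   class_data_ratio    = class_data_count / |D|          = 30 / 100 = 0.3
#   WRAcc = occurency_ratio * (class_pattern_ratio - class_data_ratio)
#         = 0.2 * (0.5 - 0.3) = 0.04
# WRAcc lies in [-0.25, 0.25]; the code returns -0.25 when the support is zero so
# that unsupported patterns rank last.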
    (read_data_kosarak('../data/context.data'), 'context'),
    (read_data_sc2('../data/sequences-TZ-45.txt')[:5000], 'sc2'),
    (read_data_kosarak('../data/skating.data'), 'skating'),
    (read_jmlr('svm', '../data/jmlr/jmlr'), 'jmlr'),
    (read_data_kosarak('../data/figures_rc.dat'), 'RocketLeague')]

for dataset, name in datasets:
    # remove the first element of each sequence
    for i in range(len(dataset)):
        dataset[i] = dataset[i][1:]

    k_max = 0
    n_max = 0
    k_lengths = []

    for line in dataset:
        k_lengths.append(k_length(line))
        if k_length(line) > k_max:
            k_max = k_length(line)
        if len(line) > n_max:
            n_max = len(line)

    print('dataset: {}'.format(name))
    print('k_max: {}'.format(k_max))
    print('n_max: {}'.format(n_max))
    print('m: {}'.format(len(extract_items(dataset))))
    print('Variance on lengths: {}'.format(np.var(k_lengths)))
    print('Lines number: {}'.format(len(dataset)))
    print(" ")
def compute_variations_better_quality(sequence, items, data, itemsets_memory,
                                      target_class, target_quality, enable_i=True,
                                      quality_measure=conf.QUALITY_MEASURE):
    '''
    Compute variations of the sequence until one improves on the target quality.
    :param sequence: the sequence to vary
    :param items: the list of all possible items
    :return: the first better element found, as a (sequence, quality) pair,
             or None if we are on a local optimum
    '''
    variations = []

    if VERTICAL_RPZ:
        bitset_slot_size = VERTICAL_TOOLS['bitset_slot_size']
        itemsets_bitsets = VERTICAL_TOOLS['itemsets_bitsets']
        class_data_count = VERTICAL_TOOLS['class_data_count']
        first_zero_mask = VERTICAL_TOOLS['first_zero_mask']
        last_ones_mask = VERTICAL_TOOLS['last_ones_mask']

    for itemset_i, itemset in enumerate(sequence):
        # i_extension
        if enable_i:
            for item_possible in items:
                new_variation_i_extension = copy.deepcopy(sequence)
                new_variation_i_extension[itemset_i].add(item_possible)

                # we check if the created pattern is present in the data
                # before computing its quality
                if is_included(new_variation_i_extension, itemsets_memory):
                    if VERTICAL_RPZ:
                        new_variation_i_quality, new_variation_i_bitset = compute_quality_vertical(
                            data, new_variation_i_extension, target_class,
                            bitset_slot_size, itemsets_bitsets, class_data_count,
                            first_zero_mask, last_ones_mask,
                            quality_measure=quality_measure)
                    else:
                        new_variation_i_quality = compute_quality(
                            data, new_variation_i_extension, target_class)

                    variations.append(
                        (new_variation_i_extension, new_variation_i_quality))

                    if new_variation_i_quality > target_quality:
                        return variations[-1]

        # s_extension
        for item_possible in items:
            new_variation_s_extension = copy.deepcopy(sequence)
            new_variation_s_extension.insert(itemset_i, {item_possible})

            if VERTICAL_RPZ:
                new_variation_s_quality, new_variation_s_bitset = compute_quality_vertical(
                    data, new_variation_s_extension, target_class, bitset_slot_size,
                    itemsets_bitsets, class_data_count, first_zero_mask,
                    last_ones_mask, quality_measure=quality_measure)
            else:
                new_variation_s_quality = compute_quality(
                    data, new_variation_s_extension, target_class)

            variations.append(
                (new_variation_s_extension, new_variation_s_quality))

            if new_variation_s_quality > target_quality:
                return variations[-1]

        for item_i, item in enumerate(itemset):
            new_variation_remove = copy.deepcopy(sequence)

            # we can remove this item, dropping the itemset entirely if it becomes empty
            if k_length(sequence) > 1:
                new_variation_remove[itemset_i].remove(item)
                if len(new_variation_remove[itemset_i]) == 0:
                    new_variation_remove.pop(itemset_i)

                if VERTICAL_RPZ:
                    new_variation_remove_quality, new_variation_remove_bitset = compute_quality_vertical(
                        data, new_variation_remove, target_class, bitset_slot_size,
                        itemsets_bitsets, class_data_count, first_zero_mask,
                        last_ones_mask, quality_measure=quality_measure)
                else:
                    new_variation_remove_quality = compute_quality(
                        data, new_variation_remove, target_class)

                variations.append(
                    (new_variation_remove, new_variation_remove_quality))

                if new_variation_remove_quality > target_quality:
                    return variations[-1]

    # s_extension for the last element
    for item_possible in items:
        new_variation_s_extension = copy.deepcopy(sequence)
        new_variation_s_extension.append({item_possible})

        if VERTICAL_RPZ:
            new_variation_s_quality, new_variation_s_bitset = compute_quality_vertical(
                data, new_variation_s_extension, target_class, bitset_slot_size,
                itemsets_bitsets, class_data_count, first_zero_mask, last_ones_mask,
                quality_measure=quality_measure)
        else:
            new_variation_s_quality = compute_quality(
                data, new_variation_s_extension, target_class)

        variations.append((new_variation_s_extension, new_variation_s_quality))

        if new_variation_s_quality > target_quality:
            return variations[-1]

    return None
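# Illustrative call sketch for compute_variations_better_quality() above.
# Assumptions: VERTICAL_RPZ / VERTICAL_TOOLS are module-level globals initialised
# elsewhere in this file, and the sequence literal below is a made-up example,
# not taken from any dataset:
#
#   current = [{'a'}, {'b', 'c'}]
#   better = compute_variations_better_quality(current, items, data,
#                                               itemsets_memory, target_class='+',
#                                               target_quality=0.02)
#   if better is None:
#       pass  # local optimum: no neighbour beats target_quality
#   else:
#       current, current_quality = better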