def compute_dataset_size_raissy(DATA):
    ITEMS = extract_items(DATA)
    m = len(ITEMS)
    l_max = extract_l_max(DATA)

    w_k = 0
    for i in range(50):
        local = combination(l_max, m * i) / (2**(i + 1))
        w_k += local

    return w_k
def exhaustive(data, target_class, top_k=5, enable_i=True):
    begin = datetime.datetime.utcnow()

    # by giving the priority set a large capacity, we avoid evicting
    # promising patterns before the enumeration is finished
    sorted_patterns = PrioritySet(500)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    items = extract_items(data)

    fifo = [[]]
    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)
            display_info(stage, compute_count, sorted_patterns, begin, data,
                         top_k)

        for child in children:
            quality, bitset = compute_quality_vertical(
                data, child, target_class, bitset_slot_size, itemsets_bitsets,
                class_data_count, first_zero_mask, last_ones_mask)

            sorted_patterns.add_preserve_memory(child, quality, data)

            # we do not explore elements with a null support
            if child not in fifo_elements and bitset != 0:
                fifo.append(child)
                fifo_elements.add(child)

        compute_count += len(children)

    print("The algorithm took: {}".format(datetime.datetime.utcnow() - begin))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
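# Hedged usage sketch (not part of the original module): a minimal exhaustive
# run, assuming read_data('../data/promoters.data') returns sequences whose
# first element is the class label, as the other entry points in this project
# expect. The path and the demo function name are illustrative assumptions.
def _demo_exhaustive():
    data = read_data('../data/promoters.data')  # assumed reader and path
    top_patterns = exhaustive(data, target_class='+', top_k=5, enable_i=False)
    # each result is assumed to be a (quality, pattern) pair, as in seq_scout
    for quality, pattern in top_patterns:
        print(quality, pattern)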
def compute_dataset_size_raissy2(DATA):
    ITEMS = extract_items(DATA)
    m = len(ITEMS)
    l_max = extract_l_max(DATA)

    memo = {0: 1}
    somme = w_k(memo, l_max, m)

    for i, value in memo.items():
        somme += value

    return somme
def compute_dataset_size(DATA):
    ITEMS = extract_items(DATA)
    m = len(ITEMS)
    stages = {}
    l_max = extract_l_max(DATA)

    # we count the root
    pattern_number = 1

    for l in range(l_max + 1):
        stage_count = 0
        for k in range(l):
            decompositions = decompose(k)
            for decomposition in decompositions:
                # skip impossible cases where there are more groups of balls
                # to share than bags
                if len(decomposition) <= l - k:
                    first_element = m**(l - k - len(decomposition))

                    histo = decomposition_histogram(decomposition)
                    histo_factorial_product = 1
                    for _, unique_factor in histo.items():
                        histo_factorial_product *= factorial(unique_factor)

                    second_element = factorial(l - k) / (
                        factorial(l - k - len(decomposition)) *
                        histo_factorial_product)

                    for elt in decomposition:
                        second_element *= combination(elt + 1, m)

                    stage_pattern = first_element * second_element

                    pattern_number += stage_pattern
                    stage_count += stage_pattern

        stages[l] = stage_count

    return pattern_number, stages
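# Hedged sketch (illustrative, not in the original module): comparing the
# exact count from compute_dataset_size with the closed-form estimate of
# compute_dataset_size_raissy on the same dataset. The reader, the path and
# the stripping of the class label are assumptions borrowed from the other
# scripts in this section.
def _demo_search_space_size():
    data = read_data('../data/promoters.data')  # assumed reader and path
    data = [sequence[1:] for sequence in data]  # drop the class label
    total, per_stage = compute_dataset_size(data)
    print('Exact pattern count: {}'.format(total))
    print('Patterns per length: {}'.format(per_stage))
    print('Estimate: {}'.format(compute_dataset_size_raissy(data)))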
def exhaustive(DATA, enable_i=True):
    begin = datetime.datetime.utcnow()
    items = extract_items(DATA)

    # we remove the first element of each sequence (the class label),
    # which is useless here
    for i in range(len(DATA)):
        DATA[i] = DATA[i][1:]

    l_max = extract_l_max(DATA)

    fifo = [[]]
    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)

        for child in children:
            # we only enqueue patterns that are not longer than l_max
            # and have not been generated before
            if k_length(child) <= l_max and child not in fifo_elements:
                fifo.append(child)
                fifo_elements.add(child)

        compute_count += len(children)

    print("The algorithm took: {}".format(datetime.datetime.utcnow() - begin))

    # we add the root
    print('The size is: {}'.format(len(fifo_elements) + 1))

    return fifo_elements
    (read_data_kosarak('../data/blocks.data'), 'blocks'),
    (read_data_kosarak('../data/context.data'), 'context'),
    (read_data_sc2('../data/sequences-TZ-45.txt')[:5000], 'sc2'),
    (read_data_kosarak('../data/skating.data'), 'skating'),
    (read_jmlr('svm', '../data/jmlr/jmlr'), 'jmlr')]

for dataset, name in datasets:
    for i in range(len(dataset)):
        dataset[i] = dataset[i][1:]

    k_max = 0
    n_max = 0
    k_lengths = []

    for line in dataset:
        k_lengths.append(k_length(line))
        if k_length(line) > k_max:
            k_max = k_length(line)
        if len(line) > n_max:
            n_max = len(line)

    print('dataset: {}'.format(name))
    print('k_max: {}'.format(k_max))
    print('n_max: {}'.format(n_max))
    print('m: {}'.format(len(extract_items(dataset))))
    print('Variance on lengths: {}'.format(np.var(k_lengths)))
    print('Lines number: {}'.format(len(dataset)))
    print(" ")
def seq_scout_api(dataset=conf.DATA, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K):
    '''
    Launch seq_scout.
    This function is provided for convenience, so that the user does not need
    to specify the iteration number, which exists only for experiments.
    '''
    if dataset == 'splice':
        data = read_data(
            pathlib.Path(__file__).parent.parent / 'data/splice.data')
        target_class = 'EI'
        enable_i = False
    elif dataset == 'alsbu':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/aslbu.data')
        target_class = '195'
        enable_i = False
    elif dataset == 'blocks':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/blocks.data')
        target_class = '7'
        enable_i = False
    elif dataset == 'context':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/context.data')
        target_class = '4'
        enable_i = False
    elif dataset == 'sc2':
        data = read_data_sc2(
            pathlib.Path(__file__).parent.parent /
            'data/sequences-TZ-45.txt')[:5000]
        target_class = '1'
        enable_i = True
    elif dataset == 'skating':
        data = read_data_kosarak(
            pathlib.Path(__file__).parent.parent / 'data/skating.data')
        target_class = '1'
        enable_i = False
    elif dataset == 'jmlr':
        data = read_jmlr(
            'svm', pathlib.Path(__file__).parent.parent / 'data/jmlr/jmlr')
        target_class = '+'
        enable_i = False
    else:
        data = read_data(
            pathlib.Path(__file__).parent.parent / 'data/promoters.data')
        target_class = '+'
        enable_i = False

    class_present = False
    for sequence in data:
        if target_class == sequence[0]:
            class_present = True
            break

    if not class_present:
        raise ValueError('The target class does not appear in data')

    items = extract_items(data)
    items, items_to_encoding, encoding_to_items = encode_items(items)
    data = encode_data(data, items_to_encoding)

    results = seq_scout(data, target_class, top_k=top_k, vertical=False,
                        time_budget=time_budget,
                        iterations_limit=10000000000000,
                        enable_i=enable_i)

    print_results_decode(results, encoding_to_items)

    return results
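# Hedged usage example (illustrative, not in the original module): calling the
# API entry point on the promoters dataset with a 60-second budget.
# seq_scout_api already prints the decoded top patterns; the returned patterns
# are still in encoded form.
def _demo_seq_scout_api():
    results = seq_scout_api(dataset='promoters', time_budget=60, top_k=5)
    return results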
def seq_scout(data, target_class, time_budget=conf.TIME_BUDGET,
              top_k=conf.TOP_K, enable_i=True, vertical=True,
              iterations_limit=conf.ITERATIONS_NUMBER, theta=conf.THETA,
              quality_measure=conf.QUALITY_MEASURE):
    items = extract_items(data)
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    data_target_class = filter_target_class(data, target_class)
    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    UCB_scores = PrioritySetUCB()

    itemsets_memory = get_itemset_memory(data)

    # - 1 because the class label is not counted
    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1

    global VERTICAL_RPZ
    VERTICAL_RPZ = vertical

    global VERTICAL_TOOLS
    VERTICAL_TOOLS = {
        "bitset_slot_size": bitset_slot_size,
        "first_zero_mask": compute_first_zero_mask(len(data),
                                                   bitset_slot_size),
        "last_ones_mask": compute_last_ones_mask(len(data), bitset_slot_size),
        "class_data_count": count_target_class_data(data, target_class),
        "itemsets_bitsets": {}
    }

    N = 1

    # init: we add each sequence with an infinite UCB score so that every arm
    # is played once in the main procedure. Storing a play count of 0 ensures
    # the running mean of the quality stays correct.
    for sequence in data_target_class:
        sequence_i = sequence_mutable_to_immutable(sequence[1:])
        UCB_score = UCB(float("inf"), 1, N)
        UCB_scores.add(sequence_i, (UCB_score, 0, 0))

    # play within the time budget
    while datetime.datetime.utcnow() - begin < time_budget and N < iterations_limit:
        # we take the best UCB
        _, Ni, mean_quality, sequence = UCB_scores.pop()

        pattern, quality = play_arm(sequence, data, target_class,
                                    quality_measure=quality_measure)
        pattern = sequence_mutable_to_immutable(pattern)
        sorted_patterns.add(pattern, quality)

        # we update the scores
        updated_quality = (Ni * mean_quality + quality) / (Ni + 1)
        UCB_score = UCB(updated_quality, Ni + 1, N)
        UCB_scores.add(sequence, (UCB_score, Ni + 1, updated_quality))

        N += 1

    print("seqscout optimized iterations: {}".format(N))

    # local optimization of the best patterns found so far
    best_patterns = sorted_patterns.get_top_k_non_redundant(data, top_k)

    for pattern in best_patterns:
        pattern_mutable = sequence_immutable_to_mutable(pattern[1])
        optimized_pattern, optimized_quality = exploit_arm(
            pattern_mutable, pattern[0], items, data, itemsets_memory,
            target_class, enable_i=enable_i, quality_measure=quality_measure)
        optimized_pattern = sequence_mutable_to_immutable(optimized_pattern)
        sorted_patterns.add(optimized_pattern, optimized_quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
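# Hedged sketch (assumption): the UCB helper used above is not defined in this
# section. A standard UCB1-style upper bound that is consistent with its call
# sites UCB(mean_quality, Ni, N) could look like the following; the actual
# implementation may differ.
def ucb1_sketch(mean_quality, Ni, N):
    import math
    if Ni == 0 or mean_quality == float("inf"):
        # unplayed arms keep an infinite score so they are explored first
        return float("inf")
    # empirical mean plus an exploration bonus that shrinks as the arm is played
    return mean_quality + math.sqrt(2 * math.log(N) / Ni)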
def beam_search(data, target_class, time_budget=conf.TIME_BUDGET,
                enable_i=True, top_k=conf.TOP_K, beam_width=conf.BEAM_WIDTH,
                iterations_limit=conf.ITERATIONS_NUMBER, theta=conf.THETA,
                quality_measure=conf.QUALITY_MEASURE):
    items = extract_items(data)
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    # candidate_queue = items_to_sequences(items)
    candidate_queue = [[]]

    sorted_patterns = PrioritySet(top_k, theta=theta)

    nb_iteration = 0

    while datetime.datetime.utcnow() - begin < time_budget and nb_iteration < iterations_limit:
        beam = PrioritySet()

        while (len(candidate_queue) != 0) and nb_iteration < iterations_limit:
            seed = candidate_queue.pop(0)
            children = compute_children(seed, items, enable_i)

            for child in children:
                if nb_iteration >= iterations_limit:
                    break

                quality, _ = compute_quality_vertical(
                    data, child, target_class, bitset_slot_size,
                    itemsets_bitsets, class_data_count, first_zero_mask,
                    last_ones_mask, quality_measure=quality_measure)

                # sorted_patterns.add_preserve_memory(child, quality, data)
                sorted_patterns.add(child, quality)
                beam.add(child, quality)

                nb_iteration += 1

        candidate_queue = [
            j for i, j in beam.get_top_k_non_redundant(data, beam_width)
        ]

    # print("Number iterations beam search: {}".format(nb_iteration))
    return sorted_patterns.get_top_k_non_redundant(data, top_k)
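# Hedged usage sketch (illustrative, not in the original module): running
# seq_scout and beam_search under the same time budget on the same data. The
# reader, path, target class and parameters are assumptions; both functions
# are assumed to return (quality, pattern) pairs.
def _demo_compare_algorithms():
    data = read_data('../data/promoters.data')  # assumed reader and path
    scout_results = seq_scout(data, '+', time_budget=30, top_k=5,
                              enable_i=False)
    beam_results = beam_search(data, '+', time_budget=30, top_k=5,
                               enable_i=False)
    print('seq_scout qualities: {}'.format([q for q, _ in scout_results]))
    print('beam_search qualities: {}'.format([q for q, _ in beam_results]))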