def findInclusiveItemPairs(self, minsup, exclusive_item_filter):
    # Find pairs of frequent items where one item (almost) always
    # co-occurs with the other, i.e. support(a, b) / support(a) >= 0.999
    # or support(a, b) / support(b) >= 0.999.
    inclusive_items_dict = {}
    apriori_model = Apriori()
    # Frequent 1-itemsets and their supports.
    L1 = apriori_model.generate_L1(self.data_set, minsup)
    freq_one_item_itemset_dict = L1.get_itemset_dictionary()
    freq_one_item_itemset_dict.ntransactions = self.data_set.size()
    # Frequent 2-itemsets and their supports.
    L2 = HashTable()
    apriori_model.generate_Lk(minsup, L1, L2, k=2)
    freq_two_item_itemset_dict = L2.get_itemset_dictionary()
    freq_two_item_itemset_dict.ntransactions = self.data_set.size()
    nitems = len(freq_one_item_itemset_dict.itemsets)
    all_items = list(freq_one_item_itemset_dict.itemsets.keys())
    for i in range(nitems - 1):
        first_item = all_items[i]
        if exclusive_item_filter(first_item):
            continue
        nfirst = freq_one_item_itemset_dict.get_frequency(first_item[0])
        for j in range(i + 1, nitems):
            second_item = all_items[j]
            if exclusive_item_filter(second_item):
                continue
            merge_key = itemset_2_string(
                merge_itemsets(first_item, second_item))
            nboth = freq_two_item_itemset_dict.get_frequency(merge_key)
            if nboth == 0:
                continue
            # Support of the second item, looked up the same way as nfirst.
            nsecond = freq_one_item_itemset_dict.get_frequency(second_item[0])
            if nboth / nfirst >= 0.999 or nboth / nsecond >= 0.999:
                inclusive_items_dict[merge_key] = True
    return inclusive_items_dict
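# A minimal, self-contained sketch of the inclusion test used above,
# assuming only plain Python (no Apriori/HashTable dependencies): a pair
# (a, b) is "inclusive" when nearly every transaction containing one item
# also contains the other, i.e. count(a, b) / count(a) >= threshold or
# count(a, b) / count(b) >= threshold. The toy data, function name and
# 0.999 default below are illustrative, not taken from the source.
from itertools import combinations
from collections import Counter

def find_inclusive_pairs(transactions, threshold=0.999):
    single = Counter()
    pair = Counter()
    for t in transactions:
        items = sorted(set(t))
        single.update(items)
        pair.update(combinations(items, 2))
    inclusive = {}
    for (a, b), nboth in pair.items():
        if nboth / single[a] >= threshold or nboth / single[b] >= threshold:
            inclusive[(a, b)] = True
    return inclusive

# 'bread' always co-occurs with 'butter', so that pair is flagged.
toy = [['bread', 'butter'], ['bread', 'butter', 'milk'], ['milk']]
print(find_inclusive_pairs(toy))   # {('bread', 'butter'): True}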
def generate_freq_itemsets_w(self, data_set, min_sup_src, nthreads,
                             end_index, inclusive_items_dict, output_file):
    # Requires at module level:
    #   from multiprocessing import Process
    '''
    Step 1: Generate frequent item-sets with 1 item and write to file
    '''
    ntransactions = data_set.size()
    with open(output_file, 'w') as text_file:
        text_file.write(str(ntransactions))
        text_file.write('\n')
    L1 = Apriori.generate_L1(data_set, min_sup_src)
    L1.get_itemset_dictionary_w(output_file, 'a')
    '''
    Step 2: Generate frequent item-sets with more than 1 item and append to the file
    '''
    k = 2
    L_k1 = L1
    while not L_k1.isEmpty() and (end_index == -1 or k <= end_index):
        # Divide the (k-1)-itemsets into chunks, one worker process per
        # chunk; each worker writes its result to its own temporary file.
        chunks = L_k1.split(nthreads)
        L_k1 = None
        processes = []
        for index, L_k_1_chunk in enumerate(chunks):
            chunk_output_file = self.freq_itemsets_tmp_file + '.' + str(index)
            process_i = Process(target=Apriori.generate_Lk_w,
                                args=(min_sup_src, L_k_1_chunk,
                                      chunk_output_file, k,
                                      inclusive_items_dict))
            processes.append(process_i)
        # Start every worker first, then wait for all of them; starting
        # and joining inside a single loop would run the workers one at
        # a time and lose the parallelism.
        for process_i in processes:
            process_i.start()
        for process_i in processes:
            process_i.join()
        # Merge the per-chunk results written by the worker processes.
        L_k1 = HashTable()
        for index in range(len(chunks)):
            chunk_input_file = self.freq_itemsets_tmp_file + '.' + str(index)
            L_k1.deserialize(chunk_input_file, False)
        # Append frequent item-sets with k items to the output file.
        L_k1.get_itemset_dictionary_w(output_file, 'a')
        k += 1
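# A self-contained sketch of the chunk / worker / temp-file / merge pattern
# that generate_freq_itemsets_w follows, assuming nothing beyond the
# standard library: each worker serializes its partial result to
# '<prefix>.<index>' and the parent merges the files after joining,
# mirroring the generate_Lk_w / deserialize round-trip above. The file
# names and the toy counting task are illustrative only.
import json
from multiprocessing import Process

def count_chunk(chunk, chunk_output_file):
    # Worker: compute a partial result and persist it for the parent.
    counts = {}
    for item in chunk:
        counts[item] = counts.get(item, 0) + 1
    with open(chunk_output_file, 'w') as f:
        json.dump(counts, f)

def parallel_count(items, nworkers, tmp_prefix='counts.tmp'):
    size = max(1, len(items) // nworkers)
    chunks = [items[i:i + size] for i in range(0, len(items), size)]
    processes = [Process(target=count_chunk,
                         args=(chunk, tmp_prefix + '.' + str(i)))
                 for i, chunk in enumerate(chunks)]
    for p in processes:          # start all workers first ...
        p.start()
    for p in processes:          # ... then wait for every one of them
        p.join()
    merged = {}                  # merge the per-chunk files
    for i in range(len(chunks)):
        with open(tmp_prefix + '.' + str(i)) as f:
            for key, n in json.load(f).items():
                merged[key] = merged.get(key, 0) + n
    return merged

if __name__ == '__main__':
    print(parallel_count(['a', 'b', 'a', 'c', 'a', 'b'], nworkers=2))
    # {'a': 3, 'b': 2, 'c': 1}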
def generate_L1(self, min_sup):
    # Build C_1, the candidate 1-itemsets: under a single empty key, the
    # hash table records, for every item, the transaction ids (tids) it
    # occurs in.
    C_1 = HashTable()
    itemset_key = ''
    C_1.insert_key(itemset_key)
    n = self.data_set.size()
    print('size of data-set: ' + str(n))
    # One pass over the data: record each (item, tid) occurrence.
    for tid in range(n):
        transaction = self.data_set.get_transaction(tid)
        for item in transaction:
            C_1.add_tid(itemset_key, item, tid)
    print('get frequent item sets with 1 item')
    # Keep only the items whose tid-list meets the support threshold.
    self.L1 = C_1.generate_frequent_itemsets(min_sup)
@staticmethod
def generate_C1(data_set):
    # Static variant of the C_1 construction above (no self), so worker
    # code can build candidate 1-itemsets for an arbitrary data set; the
    # @staticmethod decorator is assumed from the signature.
    itemset_key = ''
    C_1 = HashTable()
    C_1.insert_key(itemset_key)
    ntransactions = data_set.size()
    for tid in range(ntransactions):
        transaction = data_set.get_transaction(tid)
        for item in transaction:
            C_1.add_tid(itemset_key, item, tid)
    return C_1
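# A minimal sketch of what the two C_1 builders above compute, assuming a
# plain dict in place of HashTable: one pass over the data builds, for
# every item, the set of transaction ids (tids) it occurs in, and the
# support of an item is then just the size of its tid-list. The toy data
# and lowercase function names are illustrative only.
from collections import defaultdict

def generate_c1(transactions):
    tidlists = defaultdict(set)
    for tid, transaction in enumerate(transactions):
        for item in transaction:
            tidlists[item].add(tid)
    return tidlists

def generate_l1(transactions, min_sup):
    # Keep only items whose tid-list reaches the support threshold.
    return {item: tids
            for item, tids in generate_c1(transactions).items()
            if len(tids) >= min_sup}

toy = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c']]
print(generate_l1(toy, min_sup=2))
# {'a': {0, 1, 2}, 'b': {0, 2}, 'c': {1, 2}}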
def generate_frequent_itemsets(self, min_sup, nthreads, end, output_file,
                               write_support=False):
    # Requires at module level:
    #   from multiprocessing import Process
    #   from multiprocessing.managers import BaseManager
    '''
    Step 1: Generate frequent item-sets with 1 item and write to file
    '''
    nTransactions = self.data_set.size()
    with open(output_file, 'w') as text_file:
        text_file.write(str(nTransactions))
        text_file.write('\n')
    self.generate_L1(min_sup)
    freq_itemsets_dict = self.L1.generate_itemset_dictionary()
    freq_itemsets_dict.ntransactions = nTransactions
    freq_itemsets_dict.save_2_file(output_file, 'a', write_support)
    freq_itemsets_dict.clear()
    '''
    Step 2: Generate frequent item-sets with more than 1 item and append to the file
    '''
    # Register the shared hash-table type and start the manager once,
    # outside the loop; workers write their candidate k-itemsets into
    # manager-hosted tables.
    BaseManager.register('AprioriHash', HashTable)
    manager = BaseManager()
    manager.start()
    k = 2
    L_k1 = self.L1
    while not L_k1.is_empty() and (end == -1 or k <= end):
        print('extracting item-sets with ' + str(k) + ' items ....')
        # Divide the (k-1)-itemsets into chunks, one worker per chunk,
        # with one manager-hosted result table per chunk.
        L_k = HashTable()
        chunks = L_k1.split(nthreads)
        processes = []
        C_ks = []
        for L_k_1_chunk in chunks:
            C_k = manager.AprioriHash()
            C_ks.append(C_k)
            process_i = Process(target=Apriori.generate_Lk,
                                args=(min_sup, L_k_1_chunk, C_k, k))
            processes.append(process_i)
        # Start all workers first, then wait for all of them to finish.
        for process_i in processes:
            process_i.start()
        for process_i in processes:
            process_i.join()
        # Merge the per-chunk results returned by the workers.
        for new_C_k in C_ks:
            L_k.append(new_C_k)
        L_k1.clear()
        L_k1 = L_k
        # Append frequent item-sets with k items to the output file.
        freq_itemsets_dict = L_k1.generate_itemset_dictionary()
        print('Writing frequent itemset to file ' + str(freq_itemsets_dict.size()))
        freq_itemsets_dict.ntransactions = nTransactions
        freq_itemsets_dict.save_2_file(output_file, 'a', write_support)
        freq_itemsets_dict.clear()
        k += 1
    print('stop at k = ' + str(k))
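# A self-contained sketch of the BaseManager pattern used above, assuming
# only the standard library: a plain Python class is registered with the
# manager, each worker receives a proxy to its own manager-hosted instance,
# and the parent reads the results back after joining. The SharedTable
# class and the toy task stand in for the source's HashTable and
# generate_Lk and are hypothetical.
from multiprocessing import Process
from multiprocessing.managers import BaseManager

class SharedTable:
    def __init__(self):
        self._data = {}
    def put(self, key, value):
        self._data[key] = value
    def items(self):
        return list(self._data.items())

def worker(table, chunk):
    # Write this worker's partial result into its shared table.
    for key in chunk:
        table.put(key, len(key))

if __name__ == '__main__':
    BaseManager.register('SharedTable', SharedTable)  # register once
    manager = BaseManager()
    manager.start()
    chunks = [['ab', 'c'], ['defg']]
    tables = [manager.SharedTable() for _ in chunks]  # one table per chunk
    processes = [Process(target=worker, args=(t, c))
                 for t, c in zip(tables, chunks)]
    for p in processes:          # start all, then join all
        p.start()
    for p in processes:
        p.join()
    merged = {}
    for t in tables:             # merge per-chunk results in the parent
        merged.update(dict(t.items()))
    print(merged)                # {'ab': 2, 'c': 1, 'defg': 4}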