Example #1
    def findInclusiveItemPairs(self, minsup, exclusive_item_filter):
        inclusive_items_dict = {}
        apriori_model = Apriori()

        L1 = apriori_model.generate_L1(self.data_set, minsup)
        freq_one_item_itemset_dict = L1.get_itemset_dictionary()
        freq_one_item_itemset_dict.ntransactions = self.data_set.size()

        L2 = HashTable()
        apriori_model.generate_Lk(minsup, L1, L2, k=2)
        freq_two_item_itemset_dict = L2.get_itemset_dictionary()
        freq_two_item_itemset_dict.ntransactions = self.data_set.size()

        nitems = len(freq_one_item_itemset_dict.itemsets)
        all_items = list(freq_one_item_itemset_dict.itemsets.keys())
        # examine every unordered pair of frequent 1-item keys
        for i in range(nitems - 1):
            first_item = all_items[i]
            if exclusive_item_filter(first_item): continue
            nfirst = freq_one_item_itemset_dict.get_frequency(first_item[0])

            for j in range(i + 1, nitems):
                second_item = all_items[j]
                if exclusive_item_filter(second_item): continue

                merge_key = itemset_2_string(
                    merge_itemsets(first_item, second_item))
                nboth = freq_two_item_itemset_dict.get_frequency(merge_key)
                if nboth == 0: continue

                nsecond = freq_one_item_itemset_dict.get_frequency(second_item[0])
                # the pair is "inclusive" when one item (almost) always co-occurs with the other
                if nboth / nfirst >= 0.999 or nboth / nsecond >= 0.999:
                    inclusive_items_dict[merge_key] = True
        return inclusive_items_dict
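
A minimal usage sketch for Example #1, assuming the method lives on a miner object that already holds the loaded data_set; the skip_meta_items filter and the 0.01 support threshold below are illustrative, not part of the original code.

    def skip_meta_items(item_key):
        # illustrative filter: treat item keys starting with 'meta_' as exclusive
        return item_key.startswith('meta_')

    inclusive_pairs = miner.findInclusiveItemPairs(0.01, skip_meta_items)
    for pair_key in inclusive_pairs:
        print(pair_key)
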
Example #2
    def generate_freq_itemsets_w(self, data_set, min_sup_src, nthreads,
                                 end_index, inclusive_items_dict, output_file):
        '''
        Step 1: Generate frequent item-sets with 1 item and write to file
        '''
        ntransactions = data_set.size()
        with open(output_file, 'w') as text_file:
            text_file.write(str(ntransactions))
            text_file.write('\n')

        L1 = Apriori.generate_L1(data_set, min_sup_src)
        L1.get_itemset_dictionary_w(output_file, 'a')
        '''
        Step 2: Generate frequent item-sets with more than 1 item and append to the file
        '''
        k = 2
        L_k1 = L1

        while not L_k1.isEmpty() and (end_index == -1 or k <= end_index):

            #print('extracting item-sets with ' + str(k) + ' items ....')
            '''
            Divide data into many parts and create processes to generate frequent item-sets
            '''
            chunks = L_k1.split(nthreads)
            L_k1 = None
            processes = []

            index = 0
            for L_k_1_chunk in chunks:
                chunk_output_file = self.freq_itemsets_tmp_file + '.' + str(
                    index)
                process_i = Process(target=Apriori.generate_Lk_w,
                                    args=(min_sup_src, L_k_1_chunk,
                                          chunk_output_file, k,
                                          inclusive_items_dict))
                processes.append(process_i)
                index += 1

            # start all worker processes, then wait for all of them to complete
            for process_i in processes:
                process_i.start()
            for process_i in processes:
                process_i.join()
            '''
            Merge the results returned by the worker processes
            '''
            L_k1 = HashTable()
            for index in range(len(chunks)):
                chunk_input_file = self.freq_itemsets_tmp_file + '.' + str(
                    index)
                L_k1.deserialize(chunk_input_file, False)
            '''
            Append frequent item-sets with k items to file
            '''
            #print ('Writing frequent itemset to file....')
            L_k1.get_itemset_dictionary_w(output_file, 'a')
            #x = L_k1.get_itemset_dictionary_w(output_file, 'a')
            #print ('#item-sets: ' + str(x))
            k += 1
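
A hedged usage sketch for Example #2; the miner object, file name and parameter values are assumptions, and end_index = -1 follows the code's own convention of placing no upper bound on the item-set size k.

    miner.generate_freq_itemsets_w(data_set=transactions,    # assumed pre-loaded data-set object
                                   min_sup_src=0.02,          # illustrative minimum support
                                   nthreads=4,                # one worker process per chunk
                                   end_index=-1,              # -1: grow k until L_k is empty
                                   inclusive_items_dict=inclusive_pairs,
                                   output_file='freq_itemsets.txt')
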
Example #3
 def generate_L1(self, min_sup):
     C_1 = HashTable()
     itemset_key = ''
     C_1.insert_key(itemset_key)
 
     n = self.data_set.size()
     print('size of data-set: ' + str(n))
     
     # record, under the empty prefix key, the id of every transaction in which each item occurs
     for tid in range(n):
         transaction = self.data_set.get_transaction(tid)
         for item in transaction:
             C_1.add_tid(itemset_key, item, tid)

     print('get frequent item sets with 1 item')
     self.L1 = C_1.generate_frequent_itemsets(min_sup)
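
For reference, a self-contained sketch of what the L1 step computes, using plain dicts instead of the HashTable / tid-list machinery; the transaction list and the absolute min_sup value are illustrative.

    from collections import defaultdict

    transactions = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], ['b']]
    min_sup = 2  # absolute support threshold, for this sketch only

    # count, for each item, the number of transactions that contain it
    support = defaultdict(int)
    for transaction in transactions:
        for item in set(transaction):
            support[item] += 1

    # keep only the items that meet the minimum support
    L1 = {item: count for item, count in support.items() if count >= min_sup}
    print(sorted(L1.items()))  # [('a', 3), ('b', 3), ('c', 2)]
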
Example #4
 def generate_C1(data_set):
     itemset_key = ''
     C_1 = HashTable()
     C_1.insert_key(itemset_key)
 
     ntransactions = data_set.size()
     
     for tid in range(ntransactions):
         transaction = data_set.get_transaction(tid)
         for item in transaction:
             C_1.add_tid(itemset_key, item, tid)
     return C_1
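
A short usage sketch for Example #4, combining it with the generate_frequent_itemsets call shown in Example #3; the data_set object and min_sup value are assumed to exist in the caller.

    C_1 = generate_C1(data_set)                      # candidate 1-item table with tid lists
    L_1 = C_1.generate_frequent_itemsets(min_sup)    # prune candidates below minimum support
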
Example #5
 def generate_frequent_itemsets(self, min_sup, nthreads, end, output_file, write_support = False):
     
     '''
     Step 1: Generate frequent item-sets with 1 item and write to file
     '''
     nTransactions = self.data_set.size()
     with open(output_file, 'w') as text_file:
         text_file.write(str(nTransactions))
         text_file.write('\n')
     
     
     self.generate_L1(min_sup)
     freq_itemsets_dict = self.L1.generate_itemset_dictionary()
     freq_itemsets_dict.ntransactions = nTransactions
     freq_itemsets_dict.save_2_file(output_file, 'a', write_support)
     freq_itemsets_dict.clear()
     
     '''
     Step 2: Generate frequent item-sets with more than 1 item and append to the file
     '''
     k = 2    
     L_k1 = self.L1
     
     while not L_k1.is_empty() and (end == -1 or k <= end):
         
         print('extracting item-sets with ' + str(k) + ' items ....')
         
         '''
         Divide data into many parts and create processes to generate frequent item-sets
         '''
         L_k = HashTable()
         chunks = L_k1.split(nthreads)
         processes = []
         
         # create one shared HashTable proxy per chunk so each worker process writes its own result
         C_ks = []
         BaseManager.register("AprioriHash", HashTable)
         manager = BaseManager()
         manager.start()
         for _ in range(len(chunks)):
             C_ks.append(manager.AprioriHash())

         index = 0
         for L_k_1_chunk in chunks:
             process_i = Process(target=Apriori.generate_Lk,
                                 args=(min_sup, L_k_1_chunk, C_ks[index], k))
             processes.append(process_i)
             index += 1
         
         # start all worker processes, then wait for all of them to complete
         for process_i in processes:
             process_i.start()
         for process_i in processes:
             process_i.join()
          
         '''
         Merge the results returned by the worker processes
         '''
         for new_C_k in C_ks:
             L_k.append(new_C_k)
         L_k1.clear()
         L_k1 = L_k
 
         '''
         Append frequent item-sets with k items to file
         '''
         freq_itemsets_dict = L_k1.generate_itemset_dictionary()
         
         print('Writing ' + str(freq_itemsets_dict.size()) + ' frequent item-sets to file....')
         freq_itemsets_dict.ntransactions = nTransactions
         freq_itemsets_dict.save_2_file(output_file, 'a', write_support)
         freq_itemsets_dict.clear()
         
         k += 1
         
     print('stop at k = ' + str(k))
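
A minimal usage sketch for Example #5; how the surrounding object is constructed and attached to its data set is not shown in these examples, and the parameter values below are illustrative assumptions.

    # assuming `apriori` already holds the data set (the setup is not shown in these examples)
    apriori.generate_frequent_itemsets(min_sup=0.02,          # illustrative support threshold
                                       nthreads=4,            # worker processes per level
                                       end=-1,                # -1: no upper bound on item-set size
                                       output_file='freq_itemsets.txt',
                                       write_support=True)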