def compute(self):
    # Assumes module-level imports of math and the helpers
    # string_2_itemset / itemset_2_string.
    statistic_summary = []
    class_list = self.class_dict.keys()
    for itemset_key, freq in self.freq_itemset_dict.itemsets.items():
        # Compute entropy for each item-set that does not contain a class item.
        itemset = string_2_itemset(itemset_key)
        if self.itemset_formatter(itemset):
            continue
        entropy_value = 0
        statistic_detail = {}
        flag = False
        for class_name in class_list:
            p = self.lookup_frequency(itemset, class_name) / freq
            statistic_detail[class_name] = p
            if p != 0:
                flag = True
                entropy_value += -p * math.log2(p)
        # Only record the item-set when at least one class has a non-zero share.
        if flag:
            statistic_detail['entropy'] = entropy_value
            statistic_detail['freq'] = freq
            statistic_summary.append((itemset_2_string(itemset), statistic_detail))
    return sorted(statistic_summary, key=lambda x: x[1]['entropy'])
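# Worked example (hypothetical numbers): for an item-set appearing in 10
# transactions, split 7/3 between two classes, the loop above accumulates
#   entropy = -(0.7 * log2(0.7)) - (0.3 * log2(0.3)) ≈ 0.881
# so item-sets dominated by a single class (low entropy) sort to the front
# of the returned summary.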
def generate_rules_for_class(self, general_summary, class_name):
    # Assumes module-level imports of numpy as np and scipy.stats as stats.
    special_summary = []
    for summary_detail in general_summary:
        if summary_detail[1][class_name] > 0:
            special_summary.append(summary_detail)
            # Compute the p-value with Fisher's exact test.
            item_set = string_2_itemset(summary_detail[0])
            satisfy_rule = self.freq_itemset_dict.get_frequency(summary_detail[0])
            no_satisfy_rule = self.freq_itemset_dict.ntransactions - satisfy_rule
            correct_predict = self.lookup_frequency(item_set, class_name)
            incorrect_predict = satisfy_rule - correct_predict
            belong_to_class = self.freq_itemset_dict.get_frequency(class_name)
            no_rule_belong_to_class = belong_to_class - correct_predict
            contingency_matrix = np.array(
                [[correct_predict, incorrect_predict],
                 [no_rule_belong_to_class,
                  no_satisfy_rule - no_rule_belong_to_class]])
            _, p_value = stats.fisher_exact(contingency_matrix)
            summary_detail[1]['p-value'] = p_value
    return special_summary
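# Sketch of the 2x2 table handed to stats.fisher_exact above (counts are
# hypothetical; rows = item-set present/absent, columns = in class / not in class):
#
#                        in class   not in class
#   item-set present         30            10     # correct / incorrect predictions
#   item-set absent          20           940     # class-only / neither
#
#   _, p = stats.fisher_exact(np.array([[30, 10], [20, 940]]))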
def split(self, nChunk):
    itemsets_names = self.itemsets.keys()
    nItemsets = len(itemsets_names)
    #print ('Number of frequent item-sets: ' + str(nItemsets))
    itemset_chunks = [[] for _ in range(nChunk)]
    size_of_chunk = nItemsets // nChunk + 1
    index = 0
    counter = 0
    for itemset_key in itemsets_names:
        if counter < size_of_chunk:
            itemset_chunks[index].append(string_2_itemset(itemset_key))
            counter += 1
        elif counter == size_of_chunk:
            index += 1
            itemset_chunks[index].append(string_2_itemset(itemset_key))
            counter = 1
    return itemset_chunks
def load_file(self, file_name):
    self.itemsets.clear()
    with open(file_name, "r") as text_file:
        self.ntransactions = int(text_file.readline())
        for line in text_file:
            subStrings = line.split(':')
            itemset_key = subStrings[0].strip()
            frequency = int(subStrings[1].strip())
            self.itemsets[itemset_key] = frequency
            m = len(string_2_itemset(itemset_key))
            if m > self.length_of_max_itemset:
                self.length_of_max_itemset = m
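# Assumed input layout for load_file, inferred from the parsing above: the
# first line holds the total number of transactions, then one
# "itemset : frequency" entry per line with comma-separated items, e.g.
#
#   1000
#   item_a : 120
#   item_a,item_b : 45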
def generate_network(self, special_summary, class_name):
    item_pairs_and_frequency = {}
    for summary_detail in special_summary:
        if summary_detail[1]['p-value'] <= 0.05:
            item_set = string_2_itemset(summary_detail[0])
            for i in range(len(item_set) - 1):
                for j in range(i + 1, len(item_set)):
                    combination = [item_set[i], item_set[j]]
                    combination_key = itemset_2_string(combination)
                    if combination_key in item_pairs_and_frequency:
                        continue
                    item_pairs_and_frequency[combination_key] = \
                        self.lookup_frequency(combination, class_name)
    return item_pairs_and_frequency
def generate_Lk_w(min_sup_src, L_k1, C_k_file, k, inclusive_items_dict):
    #print('generate candidates with ' + str(k) + ' items')
    file_writer = open(C_k_file, 'w')
    for key, hash_item_collection in L_k1.get_items():
        for index in range(hash_item_collection.size() - 1):
            index_th_item = hash_item_collection.get_item(index)
            if key == '':
                new_key = index_th_item.last_item
            else:
                new_key = key + ',' + index_th_item.last_item
            new_hash_collection = HashItemCollection()
            # Check if it is an infrequent item-set.
            previous_itemset = string_2_itemset(new_key)
            for item in hash_item_collection.get_items_from(index + 1):
                # Skip if the item-set contains an inclusive pair of items.
                if Apriori.checkInclusiveItems(previous_itemset,
                                               item.last_item,
                                               inclusive_items_dict):
                    continue
                # Create the new item-set and check its support via
                # tid-list intersection.
                new_item = HashItem(item.last_item)
                inter_items = set(index_th_item.tids).intersection(item.tids)
                if len(inter_items) >= min_sup_src:
                    new_item.add_tids(list(inter_items))
                    new_hash_collection.add_item(new_item)
            # Write the new item-sets to file if there are any.
            if new_hash_collection.size() > 0:
                file_writer.write(new_key)
                file_writer.write('\n')
                file_writer.write(new_hash_collection.serialize())
                file_writer.write('\n')
    file_writer.close()
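# Illustration with hypothetical tid-lists: extending a prefix via item 'b'
# carrying tids {1, 2, 5, 9} with item 'c' carrying tids {2, 5, 7} gives a
# candidate whose tid-list is the intersection {2, 5}; it is written out only
# when len({2, 5}) >= min_sup_src. Storing the intersected tid-list means
# support counting at deeper levels never rescans the raw transactions.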
def string_2_rule(s):
    subStrings = s.split(">")
    left = string_2_itemset(subStrings[0].strip())
    right = string_2_itemset(subStrings[1].strip())
    return AssociationRule(left, right)
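# Hedged usage sketch, assuming rules are serialized as "antecedent > consequent"
# with comma-separated items (the '>' separator is taken from the split above):
#
#   rule = string_2_rule('item_a,item_b > class_x')
#   # -> AssociationRule(['item_a', 'item_b'], ['class_x'])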