Example #1
0
    def compute(self):
        """Compute the class-probability entropy of every frequent item-set.

        For each item-set (item-sets that already contain a class item are
        skipped via self.itemset_formatter), the per-class probability is
        freq(itemset + class) / freq(itemset).  Item-sets with no class
        support at all are dropped.

        Returns a list of (itemset_string, detail_dict) pairs sorted by
        ascending entropy; detail_dict maps each class name to its
        probability plus the 'entropy' and 'freq' keys.
        """
        summary = []
        class_names = self.class_dict.keys()

        for itemset_key, freq in self.freq_itemset_dict.itemsets.items():
            itemset = string_2_itemset(itemset_key)
            # Skip item-sets flagged by the formatter (e.g. containing a class item).
            if self.itemset_formatter(itemset) == True:
                continue

            detail = {}
            entropy = 0
            any_class_support = False

            for class_name in class_names:
                p = self.lookup_frequency(itemset, class_name) / freq
                detail[class_name] = p
                if p != 0:
                    any_class_support = True
                    entropy += -p * math.log2(p)

            # Only record item-sets supported by at least one class.
            if any_class_support:
                detail['entropy'] = entropy
                detail['freq'] = freq
                summary.append((itemset_2_string(itemset), detail))

        return sorted(summary, key=lambda entry: entry[1]['entropy'])
Example #2
0
    def findInclusiveItemPairs(self, minsup, exclusive_item_filter):
        """Find pairs of frequent items where one almost always implies the other.

        Runs Apriori to get frequent 1-item (L1) and 2-item (L2) sets, then
        marks every pair whose co-occurrence count covers >= 99.9% of either
        item's individual count.

        :param minsup: minimum support passed to the Apriori generators.
        :param exclusive_item_filter: predicate; items for which it returns a
            truthy value are skipped entirely.
        :return: dict mapping the merged item-pair key (string) to True.
        """
        inclusive_items_dict = {}
        apriori_model = Apriori()

        # Frequent single items.
        L1 = apriori_model.generate_L1(self.data_set, minsup)
        freq_one_item_itemset_dict = L1.get_itemset_dictionary()
        freq_one_item_itemset_dict.ntransactions = self.data_set.size()

        # Frequent item pairs derived from L1.
        L2 = HashTable()
        apriori_model.generate_Lk(minsup, L1, L2, k=2)
        freq_two_item_itemset_dict = L2.get_itemset_dictionary()
        freq_two_item_itemset_dict.ntransactions = self.data_set.size()

        nitems = len(freq_one_item_itemset_dict.itemsets)
        all_items = list(freq_one_item_itemset_dict.itemsets.keys())
        for i in range(nitems - 1):
            first_item = all_items[i]
            if exclusive_item_filter(first_item): continue
            # Look up the frequency only after the filter check (avoids
            # wasted lookups for filtered items).
            nfirst = freq_one_item_itemset_dict.get_frequency(first_item[0])

            for j in range(i + 1, nitems):
                second_item = all_items[j]
                if exclusive_item_filter(second_item): continue

                merge_key = itemset_2_string(
                    merge_itemsets(first_item, second_item))
                nboth = freq_two_item_itemset_dict.get_frequency(merge_key)
                if nboth == 0: continue

                # BUG FIX: the dictionary object was being *called*
                # (freq_one_item_itemset_dict(...)); use get_frequency as on
                # the first item above.
                nsecond = freq_one_item_itemset_dict.get_frequency(
                    second_item[0])
                # "Inclusive" when the pair covers >= 99.9% of either item.
                if nboth / nfirst >= 0.999 or nboth / nsecond >= 0.999:
                    inclusive_items_dict[merge_key] = True
        return inclusive_items_dict
Example #3
0
    def _complement_condition(self, r1, r2):
        """Return the larger overlap ratio between two rules' left-hand sides.

        The ratio is freq(LHS1 union LHS2) divided by each rule's own LHS
        frequency; the maximum of the two is returned.
        """
        union_key = itemset_2_string(
            merge_itemsets(r1.left_items, r2.left_items))

        union_freq = self.get_frequency(union_key)
        first_lhs_freq = self.get_frequency(r1.lhs_string())
        second_lhs_freq = self.get_frequency(r2.lhs_string())

        return max(union_freq / first_lhs_freq, union_freq / second_lhs_freq)
Example #4
0
    def enumerate_subsets(self, bit_mask, item_set, position, rule_collection,
                          both_frequency):
        """Recursively enumerate LHS/RHS splits of item_set into rules.

        bit_mask[i] == True marks item i as belonging to the rule's LHS.
        Splits whose LHS confidence (both_frequency / freq(LHS)) falls below
        self.min_conf are pruned.  bit_mask is restored to all-True at the
        touched position before returning.
        """
        if position >= len(item_set):
            # Every item assigned: materialize the rule and check its format.
            lhs, rhs = [], []
            for flag, item in zip(bit_mask, item_set):
                (lhs if flag == True else rhs).append(item)

            if len(lhs) > 0 and len(rhs) > 0:
                rule = AssociationRule(lhs, rhs)
                if (self.rule_formatter is None
                        or self.rule_formatter(rule) == True):
                    rule_collection.add(rule)
            return

        # Try keeping the current item on the LHS, then moving it to the RHS.
        for keep_on_lhs in (True, False):
            bit_mask[position] = keep_on_lhs

            if not keep_on_lhs:
                # Confidence check for the shrunken LHS.
                lhs_itemset = [item_set[index]
                               for index in range(len(bit_mask))
                               if bit_mask[index] == True]
                lhs_frequency = self.freq_itemset_dict.get_frequency(
                    itemset_2_string(lhs_itemset))
                confidence = (both_frequency / lhs_frequency
                              if lhs_frequency > 0 else 0)

                if confidence < self.min_conf:
                    bit_mask[position] = True
                    continue

            self.enumerate_subsets(bit_mask, item_set, position + 1,
                                   rule_collection, both_frequency)
            bit_mask[position] = True
Example #5
0
    def subsets(self, bits, item_set, k, rule_collection, total_freq):
        """Recursively split item_set into LHS/RHS rule candidates.

        bits[i] == True places item i on the left-hand side.  A branch that
        moves item k to the RHS is explored only when the resulting LHS still
        meets self.min_conf.  bits[k] is left True on return.
        """
        n = len(item_set)
        if k >= n:
            # Base case: assemble the rule from the finished bit assignment.
            left = [item_set[i] for i in range(len(bits)) if bits[i] == True]
            right = [item_set[i] for i in range(len(bits)) if bits[i] != True]

            if len(left) > 0 and len(right) > 0:
                candidate = AssociationRule(left, right)
                if (self.rule_formatter is None
                        or self.rule_formatter(candidate) == True):
                    rule_collection.add(candidate)
            return

        # Branch 1: keep the k-th item on the LHS.
        bits[k] = True
        self.subsets(bits, item_set, k + 1, rule_collection, total_freq)

        # Branch 2: move the k-th item to the RHS, pruned by confidence.
        bits[k] = False
        left_itemset = [item_set[i] for i in range(len(bits))
                        if bits[i] == True]
        left_value = self.freq_itemset_dict.get_frequency(
            itemset_2_string(left_itemset))
        confident = total_freq / left_value if left_value > 0 else 0

        if confident >= self.min_conf:
            self.subsets(bits, item_set, k + 1, rule_collection, total_freq)
        bits[k] = True
Example #6
0
    def generate_rules(self, freq_itemsets_collection, output_file_name):
        """Generate association rules from frequent item-sets into a file.

        Iterates the collection, enumerating rule splits via self.subsets,
        and flushes the accumulated rules to output_file_name every 200
        qualifying item-sets (after redundancy removal).  Prints progress and
        a final redundant/total summary.
        """
        total_rules = 0
        remaining_rules = 0
        k = 0
        rule_collection = RulesCollection()

        # Truncate any previous output file.
        with open(output_file_name, 'w') as _:
            print('clear old file...')

        def flush_rules():
            # De-duplicate, append to disk and reset the in-memory batch,
            # updating the redundancy counters.
            nonlocal total_rules, remaining_rules
            total_rules += rule_collection.size()
            rule_collection.remove_redundancy(self.freq_itemset_dict)
            rule_collection.save(output_file_name, True)
            remaining_rules += rule_collection.size()
            rule_collection.clear()

        for itemset in freq_itemsets_collection:
            # A single item cannot be split into LHS and RHS.
            if len(itemset) == 1:
                continue

            if self.itemset_formatter is not None and \
            self.itemset_formatter(itemset) == False:
                continue

            k += 1
            if k % 200 == 0:
                print('writing some rule_collection to file: ' + str(k))
                flush_rules()

            total_freq = self.freq_itemset_dict.get_frequency(
                itemset_2_string(itemset))
            bits = [True] * len(itemset)
            self.subsets(bits, itemset, 0, rule_collection, total_freq)

        print('writing last rule_collection to file: ' + str(k))
        flush_rules()

        print('Finish for sub frequent item-sets!!!')
        print('Number of redundant rules ' +
              str(total_rules - remaining_rules) + '/' + str(total_rules))
Example #7
0
    def generate_network(self, special_summary, class_name):
        """Build item-pair frequencies for statistically significant item-sets.

        For every summary entry whose 'p-value' is <= 0.05, every unordered
        pair of items in that item-set is looked up (with class_name) and
        recorded once.

        :return: dict mapping pair key (string) to its class frequency.
        """
        pair_frequencies = {}
        for summary_entry in special_summary:
            # Skip entries that are not significant at the 0.05 level.
            if not (summary_entry[1]['p-value'] <= 0.05):
                continue

            items = string_2_itemset(summary_entry[0])
            for i in range(len(items) - 1):
                for j in range(i + 1, len(items)):
                    pair = [items[i], items[j]]
                    pair_key = itemset_2_string(pair)
                    if pair_key not in pair_frequencies:
                        pair_frequencies[pair_key] = self.lookup_frequency(
                            pair, class_name)

        return pair_frequencies
        '''
Example #8
0
    def generate_rules(self, freq_itemsets_collection, output_file_name):
        """Generate association rules from frequent item-sets into a file.

        Iterates the collection, enumerating rule splits via
        self.enumerate_subsets, and flushes accumulated rules to
        output_file_name every 200 qualifying item-sets (after redundancy
        removal), with a final flush after the loop.

        :param freq_itemsets_collection: iterable of frequent item-sets.
        :param output_file_name: path the rules are appended to (the file is
            truncated first).
        """
        total_rules = 0
        remaining_rules = 0
        k = 0
        rule_collection = RulesCollection()

        # Truncate any previous output file; the context manager guarantees
        # the handle is closed (was: x = open(...); x.close()).
        with open(output_file_name, 'w'):
            pass

        for itemset in freq_itemsets_collection:
            # A single item cannot be split into LHS and RHS.
            if len(itemset) == 1:
                continue

            if self.itemset_formatter is not None and self.itemset_formatter(
                    itemset) == False:
                continue

            # Periodically flush the batch: count, de-duplicate, append to
            # disk, recount survivors, reset.
            k += 1
            if k % 200 == 0:
                total_rules += rule_collection.size()
                rule_collection.remove_redundancy(self.freq_itemset_dict)
                rule_collection.save(output_file_name, True)
                remaining_rules += rule_collection.size()
                rule_collection.clear()

            # Enumerate all LHS/RHS splits of this item-set.
            both_frequency = self.freq_itemset_dict.get_frequency(
                itemset_2_string(itemset))
            bit_mask = [True] * len(itemset)
            self.enumerate_subsets(bit_mask, itemset, 0, rule_collection,
                                   both_frequency)

        # Final flush of whatever is left after the loop.
        total_rules += rule_collection.size()
        rule_collection.remove_redundancy(self.freq_itemset_dict)
        rule_collection.save(output_file_name, True)
        remaining_rules += rule_collection.size()
        rule_collection.clear()
 def itemset_string(self):
     """Return this object's item-set serialized with itemset_2_string."""
     return itemset_2_string(self.get_itemset())
 def rhs_string(self):
     """Return the rule's right-hand-side items as a string key."""
     rhs_items = self.right_items
     return itemset_2_string(rhs_items)
 def lhs_string(self):
     """Return the rule's left-hand-side items as a string key."""
     lhs_items = self.left_items
     return itemset_2_string(lhs_items)
Example #12
0
 def lookup_frequency(self, item_set, class_name):
     """Return the frequency of item_set combined with class_name (0 if absent)."""
     combined_key = itemset_2_string(merge_itemsets(item_set, [class_name]))
     if not self.freq_itemset_dict.exists(combined_key):
         return 0
     return self.freq_itemset_dict.get_frequency(combined_key)