    def extract_features_4_all_rules(self):
        left_features, right_features = self._get_feature_names()
        left_count = len(left_features)
        right_count = len(right_features)

        print('Writing the number of LHS and RHS features...')
        features_writer = open(self.files_info.non_redundant_rule_tmp_file, 'w')
        features_writer.write(str(left_count))
        features_writer.write('\n')
        features_writer.write(str(right_count))
        features_writer.write('\n')

        print('Starting extraction...')
        for i in range(self.nthreads):
            input_file = self.files_info.get_rule_file(i)

            with open(input_file, 'r') as rules_reader:
                for line in rules_reader:
                    rule = AssociationRule.string_2_rule(line.strip())
                    a = self._extract_features_4_itemset(rule.left_items, left_features)
                    b = self._extract_features_4_itemset(rule.right_items, right_features)
                    f_vector = np.concatenate((a, b))

                    # Write one (serialized rule, feature vector) pair per line.
                    features_writer.write(json.dumps((rule.serialize(), f_vector.tolist())))
                    features_writer.write('\n')
        features_writer.close()
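The helper _extract_features_4_itemset is not shown in this listing. A minimal one-hot sketch of what it plausibly does, assuming each feature name corresponds to an item and the returned vector is aligned with the feature-name list (the body below is an assumption, not the original implementation):

import numpy as np

def _extract_features_4_itemset(self, itemset, feature_names):
    # Hypothetical: mark 1.0 for every feature name present in the itemset.
    items = set(itemset)
    return np.array([1.0 if name in items else 0.0 for name in feature_names])

This shape is consistent with the caller, which concatenates the LHS and RHS vectors with np.concatenate and serializes them via .tolist().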
Example n. 2
    def extract_feature_vectors_(self):
        # print('Determining feature vectors...')
        # Earlier experiments computed explicit interestingness measures; the
        # three progressively smaller candidate sets are kept below for
        # reference.
        '''
        measures = [om.support, om.confidence, om.coverage, om.prevalence, om.recall, om.specificity, 
                    om.classificationError, om.lift, om.leverage, om.change_of_support, om.relative_risk, 
                    om.jaccard, om.certainty_factor, om.odd_ratio, om.yuleQ, om.yuleY, 
                    om.klosgen, om.conviction, om.weighting_dependency, 
                    om.collective_strength, om.jmeasure, 
                    om.one_way_support, om.two_ways_support, om.two_ways_support_variation, 
                    om.linear_coefficient, om.piatetsky_shapiro, om.loevinger,
                    om.information_gain, om.sebag_schoenauner, om.least_contradiction, 
                    om.odd_multiplier, om.counter_example_rate, om.zhang]
        '''
        '''
        measures = [om.support, om.confidence, om.coverage, om.prevalence, om.recall, om.specificity, 
                    om.classificationError, om.lift, om.leverage, om.change_of_support, 
                    om.jaccard, om.certainty_factor, 
                    om.klosgen, om.weighting_dependency, 
                    om.jmeasure, 
                    om.one_way_support, om.two_ways_support, 
                    om.piatetsky_shapiro,
                    om.information_gain, om.least_contradiction, 
                    om.counter_example_rate, om.zhang]
        '''
        '''    
        measures = [om.support, om.confidence, om.specificity, 
                    om.weighting_dependency, om.klosgen,
                    om.piatetsky_shapiro,
                    om.zhang]
        '''
        #measures = [om.support, om.confidence, om.zhang]
        observations_dict = self.load_freq_itemset_dictionary()
        #ntransactions = observations_dict.ntransactions

        features_writer = open(self.non_redundant_rule_tmp_file, 'w')
        for i in range(self.nthreads):
            input_file = self.rules_tmp_file + '.' + str(i)

            with open(input_file, 'r') as rules_reader:
                for line in rules_reader:

                    rule = AssociationRule.string_2_rule(line.strip())
                    '''
                    Disabled alternative: evaluate each measure on the
                    probabilities derived from the frequency tuple.

                    f_vector = []
                    lhs_frequency, rhs_frequency, both_frequency = observations_dict.get_frequency_tuple(rule)

                    for index in range(len(measures)):
                        value = measures[index](lhs_frequency / ntransactions,
                                                rhs_frequency / ntransactions,
                                                both_frequency / ntransactions)
                        f_vector.append(value)
                    '''
                    f_vector = rule.compute_probs(observations_dict)
                    #f_vector.append(len(rule.left_items))

                    features_writer.write(
                        json.dumps((rule.serialize(), f_vector)))
                    features_writer.write('\n')

        features_writer.close()
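rule.compute_probs is not shown here, but the commented-out code indicates the feature vector was originally built from interestingness measures evaluated on p(LHS), p(RHS) and p(LHS and RHS together). A sketch of three standard measures from that list, written as textbook definitions rather than the actual om implementations:

def support(p_lhs, p_rhs, p_both):
    # Fraction of transactions containing both sides of the rule.
    return p_both

def confidence(p_lhs, p_rhs, p_both):
    # P(RHS | LHS): how often the rule fires when its LHS occurs.
    return p_both / p_lhs if p_lhs > 0 else 0.0

def lift(p_lhs, p_rhs, p_both):
    # Observed co-occurrence relative to what independence would predict.
    denom = p_lhs * p_rhs
    return p_both / denom if denom > 0 else 0.0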
Example n. 3
    def load_association_rules(self):
        association_rules_list = []
        with open(self.non_redundant_rule_tmp_file, 'r') as rules_reader:
            for line in rules_reader:
                rule_text, _ = json.loads(line.strip())
                association_rules_list.append(
                    AssociationRule.string_2_rule(rule_text.strip()))
        return association_rules_list
Example n. 4
def preprocessRuleFeatureDict(rule_feature_dict):
    '''
    Parse the rules and separate them from their raw (un-normalized)
    feature vectors; also return the first feature column.
    '''
    rules, features = separateRulesAndFeatures(rule_feature_dict)
    rule_full_list = [AssociationRule.string_2_rule(x) for x in rules]

    return rule_full_list, features, features[:, 0]
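If min-max normalization of the returned features is wanted, a minimal sketch using scikit-learn (an assumed dependency; normalize_features is a hypothetical helper, not part of the original code):

from sklearn.preprocessing import MinMaxScaler

def normalize_features(features):
    # Rescale every feature column into [0, 1] independently.
    return MinMaxScaler().fit_transform(features)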
Example n. 5
    def enumerate_subsets(self, bit_mask, item_set, position, rule_collection,
                          both_frequency):
        '''
        Ran out of items --> create the rule and check the format criterion.
        '''
        if position >= len(item_set):
            lhs = []
            rhs = []

            for index in range(len(bit_mask)):
                if bit_mask[index]:
                    lhs.append(item_set[index])
                else:
                    rhs.append(item_set[index])

            if len(lhs) > 0 and len(rhs) > 0:
                rule = AssociationRule(lhs, rhs)

                if (self.rule_formatter is None
                        or self.rule_formatter(rule)):
                    rule_collection.add(rule)

            return

        '''
        Try both assignments for the position-th item: True keeps it in the
        LHS, False moves it to the RHS.
        '''
        for value in [True, False]:
            bit_mask[position] = value

            if not value:
                # Unvisited positions are still True, so the current LHS is
                # every item whose bit is True.
                lhs_itemset = []
                for index in range(len(bit_mask)):
                    if bit_mask[index]:
                        lhs_itemset.append(item_set[index])

                lhs_frequency = self.freq_itemset_dict.get_frequency(
                    itemset_2_string(lhs_itemset))
                confidence = 0
                if lhs_frequency > 0:
                    confidence = both_frequency / lhs_frequency

                # Moving more items to the RHS only shrinks the LHS, which
                # can only lower the confidence, so prune the whole subtree.
                if confidence < self.min_conf:
                    bit_mask[position] = True
                    continue

                self.enumerate_subsets(bit_mask, item_set, position + 1,
                                       rule_collection, both_frequency)
            else:
                self.enumerate_subsets(bit_mask, item_set, position + 1,
                                       rule_collection, both_frequency)

            bit_mask[position] = True
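The recursion expects the mask to start out all-True (every item initially in the LHS) and restores each bit on the way back up. A minimal driver sketch, with names taken from the snippet and the call site itself assumed:

rules = RulesCollection()
bit_mask = [True] * len(item_set)
# Enumerate every LHS/RHS split of item_set, pruning low-confidence subtrees.
self.enumerate_subsets(bit_mask, item_set, 0, rules, both_frequency)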
Example n. 6
    def generate_rules_spect(self, freq_itemsets_collection, output_file_name):
        total_rules = 0
        remaining_rules = 0
        k = 0
        rule_collection = RulesCollection()

        # Truncate the output file so the periodic appends start from scratch.
        x = open(output_file_name, 'w')
        x.close()

        for itemset in freq_itemsets_collection:
            '''
            First check whether the item-set can generate a rule at all.
            '''
            if len(itemset) == 1:
                continue

            if (self.itemset_formatter is not None
                    and not self.itemset_formatter(itemset)):
                continue
            '''
            Periodically flush the generated rule_collection to file.
            '''
            k += 1
            if k % 200 == 0:
                # print('writing some rule_collection to file: ' + str(k))
                total_rules += rule_collection.size()
                rule_collection.remove_redundancy(self.freq_itemset_dict)
                rule_collection.save(output_file_name, True)
                remaining_rules += rule_collection.size()
                rule_collection.clear()
            '''
            Generate the association rule: class items form the RHS,
            everything else the LHS.
            '''
            rhs = []
            lhs = []

            for item in itemset:
                if 'class@' in item:
                    rhs.append(item)
                else:
                    lhs.append(item)

            if len(lhs) > 0 and len(rhs) > 0:
                rule = AssociationRule(lhs, rhs)
                confidence = self.freq_itemset_dict.get_confidence(rule)
                if confidence >= self.min_conf:
                    rule_collection.add(rule)

        # Flush whatever is left after the loop.
        # print('writing last rule_collection to file: ' + str(k))
        total_rules += rule_collection.size()
        rule_collection.remove_redundancy(self.freq_itemset_dict)
        rule_collection.save(output_file_name, True)
        remaining_rules += rule_collection.size()
        rule_collection.clear()
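itemset_formatter acts as a pre-filter on itemsets before any rule is generated. One plausible predicate, assuming items are strings and class labels carry the 'class@' prefix used in the loop above (the predicate body is an assumption, not the original):

def itemset_formatter(itemset):
    # Keep only itemsets with exactly one class item, so each generated
    # rule has a single class label on its RHS.
    class_items = [item for item in itemset if 'class@' in item]
    return len(class_items) == 1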
Example n. 7
    def subsets(self, bits, item_set, k, rule_collection, total_freq):
        '''
        Ran out of items --> create the rule and check the format criterion.
        '''
        if k >= len(item_set):
            left = []
            right = []

            for index in range(len(bits)):
                if bits[index]:
                    left.append(item_set[index])
                else:
                    right.append(item_set[index])

            if len(left) > 0 and len(right) > 0:
                rule = AssociationRule(left, right)
                if (self.rule_formatter is None
                        or self.rule_formatter(rule)):
                    rule_collection.add(rule)

            return

        '''
        Try both assignments for the k-th item: True keeps it in the LHS,
        False moves it to the RHS.
        '''
        for value in [True, False]:
            bits[k] = value

            if not value:
                left_itemset = []
                for index in range(len(bits)):
                    if bits[index]:
                        left_itemset.append(item_set[index])

                left_value = self.freq_itemset_dict.get_frequency(
                    itemset_2_string(left_itemset))
                confidence = 0
                if left_value > 0:
                    confidence = total_freq / left_value

                # Prune: removing more items from the LHS only lowers the
                # confidence further.
                if confidence < self.min_conf:
                    bits[k] = True
                    continue
                self.subsets(bits, item_set, k + 1, rule_collection,
                             total_freq)
            else:
                self.subsets(bits, item_set, k + 1, rule_collection,
                             total_freq)

            bits[k] = True
Example n. 8
    def load_feature_vectors(self):
        data = []
        lengths = []

        with open(self.files_info.non_redundant_rule_tmp_file, 'r') as feature_reader:
            print('Loading the number of LHS and RHS features...')
            lhs_count = int(feature_reader.readline())
            rhs_count = int(feature_reader.readline())
            print('Loading feature vectors...')
            for line in feature_reader:
                rule_text, f_vector = json.loads(line.strip())
                rule = AssociationRule.string_2_rule(rule_text.strip())
                lengths.append(rule.length())
                data.append(f_vector)

        return np.array(data), lengths, lhs_count, rhs_count
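The two header counts written by extract_features_4_all_rules (the first example above) let a caller split each loaded vector back into its LHS and RHS parts. A short usage sketch (obj stands in for the owning object, an assumption):

data, lengths, lhs_count, rhs_count = obj.load_feature_vectors()
lhs_part = data[:, :lhs_count]                        # LHS feature columns
rhs_part = data[:, lhs_count:lhs_count + rhs_count]   # RHS feature columns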
Example n. 9
    def _extract_feature_vectors(self):

        #lhs_features, rhs_features = self._get_feature_names()

        observations_dict = self.load_freq_itemset_dictionary()
        features_writer = open(self.non_redundant_rule_tmp_file, 'w')

        for i in range(self.nthreads):
            input_file = self.rules_tmp_file + '.' + str(i)

            with open(input_file, 'r') as rules_reader:
                for line in rules_reader:

                    rule = AssociationRule.string_2_rule(line.strip())
                    f_vector = rule.compute_probs(observations_dict)
                    #lhs_vector = self._extract_one_feature_vector(rule.left_items, lhs_features)
                    #rhs_vector = self._extract_one_feature_vector(rule.right_items, rhs_features)
                    #f_vector = lhs_vector + rhs_vector

                    features_writer.write(
                        json.dumps((rule.serialize(), f_vector)))
                    features_writer.write('\n')

        features_writer.close()
Example n. 10
def preprocessRuleFeatureDict(rule_feature_dict):
    rules, features = separateRulesAndFeatures(rule_feature_dict)
    rule_full_list = [AssociationRule.string_2_rule(x) for x in rules]
    return rule_full_list, features, features[:, 0]
Example n. 11
def filter_association_rules(unexpected_rules, delta_1=0):
    rules = []
    for x in unexpected_rules:
        # x[0] is the serialized rule; x[2][0][1] is its score (the exact
        # tuple layout is inferred from this usage), kept only when it
        # exceeds delta_1.
        if x[2][0][1] > delta_1:
            rules.append(AssociationRule.string_2_rule(x[0]))
    return rules
Example n. 12
    def load_from_file(self, file_name):
        with open(file_name, "r") as text_file:
            for line in text_file:
                rule = AssociationRule.string_2_rule(line.strip())
                self.rules[line.strip()] = rule
Example n. 13
    def load_file(self, file_name):
        with open(file_name, "r") as text_file:
            for line in text_file:
                rule = AssociationRule.string_2_rule(line.strip())
                self.rules.append(rule)