def extract_features_4_all_rules(self):
    left_features, right_features = self._get_feature_names()
    left_count = len(left_features)
    right_count = len(right_features)

    print('Writing the number of LHS and RHS features')
    features_writer = open(self.files_info.non_redundant_rule_tmp_file, 'w')
    features_writer.write(str(left_count))
    features_writer.write('\n')
    features_writer.write(str(right_count))
    features_writer.write('\n')

    print('Starting extraction...')
    for i in range(self.nthreads):
        input_file = self.files_info.get_rule_file(i)
        with open(input_file, 'r') as rules_reader:
            for line in rules_reader:
                rule = AssociationRule.string_2_rule(line.strip())
                lhs_vector = self._extract_features_4_itemset(rule.left_items, left_features)
                rhs_vector = self._extract_features_4_itemset(rule.right_items, right_features)
                f_vector = np.concatenate((lhs_vector, rhs_vector))
                ''' Write one (serialized rule, feature vector) pair per line '''
                features_writer.write(json.dumps((rule.serialize(), f_vector.tolist())))
                features_writer.write('\n')
    features_writer.close()

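# Illustrative sketch (hypothetical helper, not part of this repo): one plausible way
# to turn an itemset into a binary indicator vector over a fixed list of feature
# names, which is the kind of per-side vector extract_features_4_all_rules
# concatenates for the LHS and RHS.
import numpy as np

def itemset_to_indicator(itemset, feature_names):
    # 1.0 where the feature occurs in the itemset, 0.0 otherwise
    return np.array([1.0 if name in itemset else 0.0 for name in feature_names])

# Example: itemset_to_indicator(['a', 'c'], ['a', 'b', 'c']) -> array([1., 0., 1.])
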
def extract_feature_vectors_(self):
    #print ('Determine feature vector ....')
    ''' Full list of candidate interestingness measures (kept for reference):
    measures = [om.support, om.confidence, om.coverage, om.prevalence, om.recall,
                om.specificity, om.classificationError, om.lift, om.leverage,
                om.change_of_support, om.relative_risk, om.jaccard,
                om.certainty_factor, om.odd_ratio, om.yuleQ, om.yuleY, om.klosgen,
                om.conviction, om.weighting_dependency, om.collective_strength,
                om.jmeasure, om.one_way_support, om.two_ways_support,
                om.two_ways_support_variation, om.linear_coefficient,
                om.piatetsky_shapiro, om.loevinger, om.information_gain,
                om.sebag_schoenauner, om.least_contradiction, om.odd_multiplier,
                om.counter_example_rate, om.zhang]
    '''
    ''' Reduced subset:
    measures = [om.support, om.confidence, om.coverage, om.prevalence, om.recall,
                om.specificity, om.classificationError, om.lift, om.leverage,
                om.change_of_support, om.jaccard, om.certainty_factor, om.klosgen,
                om.weighting_dependency, om.jmeasure, om.one_way_support,
                om.two_ways_support, om.piatetsky_shapiro, om.information_gain,
                om.least_contradiction, om.counter_example_rate, om.zhang]
    '''
    ''' Minimal subset:
    measures = [om.support, om.confidence, om.specificity, om.weighting_dependency,
                om.klosgen, om.piatetsky_shapiro, om.zhang]
    '''
    #measures = [om.support, om.confidence, om.zhang]

    observations_dict = self.load_freq_itemset_dictionary()
    #ntransactions = observations_dict.ntransactions
    features_writer = open(self.non_redundant_rule_tmp_file, 'w')
    for i in range(self.nthreads):
        input_file = self.rules_tmp_file + '.' + str(i)
        with open(input_file, 'r') as rules_reader:
            for line in rules_reader:
                rule = AssociationRule.string_2_rule(line.strip())
                ''' Measure-based alternative (kept for reference):
                f_vector = []
                lhs_frequency, rhs_frequency, both_frequency = observations_dict.get_frequency_tuple(rule)
                for index in range(len(measures)):
                    value = measures[index](lhs_frequency / ntransactions,
                                            rhs_frequency / ntransactions,
                                            both_frequency / ntransactions)
                    f_vector.append(value)
                '''
                f_vector = rule.compute_probs(observations_dict)
                #f_vector.append(len(rule.left_items))
                features_writer.write(json.dumps((rule.serialize(), f_vector)))
                features_writer.write('\n')
    features_writer.close()

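# Illustrative sketch (hypothetical helper, not part of this repo): the commented-out
# block above feeds each measure the antecedent, consequent and joint supports as
# fractions of the transaction count. A hand-rolled version of three common measures
# over those same fractions could look like this:
def example_measures(p_lhs, p_rhs, p_both):
    support = p_both                                    # P(LHS and RHS)
    confidence = p_both / p_lhs if p_lhs > 0 else 0.0   # P(RHS | LHS)
    lift = confidence / p_rhs if p_rhs > 0 else 0.0     # confidence / P(RHS)
    return [support, confidence, lift]
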
def load_association_rules(self):
    association_rules_list = []
    with open(self.non_redundant_rule_tmp_file, 'r') as rules_reader:
        for line in rules_reader:
            rule_text, _ = json.loads(line.strip())
            association_rules_list.append(
                AssociationRule.string_2_rule(rule_text.strip()))
    return association_rules_list

def preprocessRuleFeatureDict(rule_feature_dict):
    ''' Normalize feature using min-max scaler '''
    rules, features = separateRulesAndFeatures(rule_feature_dict)
    rule_full_list = [AssociationRule.string_2_rule(x) for x in rules]
    return rule_full_list, features, features[:, 0]

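# The docstring above mentions min-max scaling, but no scaling is applied in the
# snippet itself. A minimal sketch of column-wise min-max normalization with NumPy,
# assuming `features` is a 2-D array of rule feature vectors (illustrative only):
import numpy as np

def min_max_normalize(features):
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0  # avoid division by zero for constant columns
    return (features - col_min) / col_range
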
def enumerate_subsets(self, bit_mask, item_set, position, rule_collection, both_frequency):
    ''' Ran out of items --> create the rule and check the format criterion '''
    if position >= len(item_set):
        lhs = []
        rhs = []
        for index in range(len(bit_mask)):
            if bit_mask[index]:
                lhs.append(item_set[index])
            else:
                rhs.append(item_set[index])
        if len(lhs) > 0 and len(rhs) > 0:
            rule = AssociationRule(lhs, rhs)
            if self.rule_formatter is None or self.rule_formatter(rule):
                rule_collection.add(rule)
        return

    value_domain = [True, False]
    ''' True: keep the position-th item in the LHS; False: move it to the RHS '''
    for value in value_domain:
        bit_mask[position] = value
        if not value:
            # Pruning: deeper in this branch the LHS can only lose more items, which
            # raises its frequency and therefore lowers the confidence
            # both_frequency / lhs_frequency, so a bound already below min_conf
            # means the whole branch can be skipped.
            lhs_itemset = []
            for index in range(len(bit_mask)):
                if bit_mask[index]:
                    lhs_itemset.append(item_set[index])
            lhs_frequency = self.freq_itemset_dict.get_frequency(
                itemset_2_string(lhs_itemset))
            confidence = 0
            if lhs_frequency > 0:
                confidence = both_frequency / lhs_frequency
            if confidence < self.min_conf:
                bit_mask[position] = True
                continue
            self.enumerate_subsets(bit_mask, item_set, position + 1,
                                   rule_collection, both_frequency)
        else:
            self.enumerate_subsets(bit_mask, item_set, position + 1,
                                   rule_collection, both_frequency)
    bit_mask[position] = True

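# Worked example of the bit-mask enumeration above (standalone sketch, without the
# confidence pruning): every True/False assignment over the items yields one LHS/RHS
# split, and splits with an empty side are discarded.
from itertools import product

def all_splits(item_set):
    splits = []
    for mask in product([True, False], repeat=len(item_set)):
        lhs = [item for item, bit in zip(item_set, mask) if bit]
        rhs = [item for item, bit in zip(item_set, mask) if not bit]
        if lhs and rhs:
            splits.append((lhs, rhs))
    return splits

# all_splits(['a', 'b', 'c']) yields 6 candidate rules, e.g. (['a', 'b'], ['c']).
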
def generate_rules_spect(self, freq_itemsets_collection, output_file_name):
    total_rules = 0
    remaining_rules = 0
    k = 0
    rule_collection = RulesCollection()

    # Truncate the output file; batches of rules are appended to it below
    x = open(output_file_name, 'w')
    x.close()

    for itemset in freq_itemsets_collection:
        ''' Check first whether the item-set can generate a rule at all '''
        if len(itemset) == 1:
            continue
        if self.itemset_formatter is not None and not self.itemset_formatter(itemset):
            continue

        ''' Periodically flush the generated rule_collection to file '''
        k += 1
        if k % 200 == 0:
            #print ('writing some rule_collection to file: ' + str(k))
            total_rules += rule_collection.size()
            rule_collection.remove_redundancy(self.freq_itemset_dict)
            rule_collection.save(output_file_name, True)
            remaining_rules += rule_collection.size()
            rule_collection.clear()

        ''' Generate the association rule: class items go to the RHS, all other items to the LHS '''
        rhs = []
        lhs = []
        for item in itemset:
            if 'class@' in item:
                rhs.append(item)
            else:
                lhs.append(item)
        if len(lhs) > 0 and len(rhs) > 0:
            rule = AssociationRule(lhs, rhs)
            confidence = self.freq_itemset_dict.get_confidence(rule)
            if confidence >= self.min_conf:
                rule_collection.add(rule)

    #print ('writing last rule_collection to file: ' + str(k))
    total_rules += rule_collection.size()
    rule_collection.remove_redundancy(self.freq_itemset_dict)
    rule_collection.save(output_file_name, True)
    remaining_rules += rule_collection.size()
    rule_collection.clear()

def subsets(self, bits, item_set, k, rule_collection, total_freq):
    ''' Ran out of items --> create the rule and check the format criterion '''
    if k >= len(item_set):
        left = []
        right = []
        for index in range(len(bits)):
            if bits[index]:
                left.append(item_set[index])
            else:
                right.append(item_set[index])
        if len(left) > 0 and len(right) > 0:
            rule = AssociationRule(left, right)
            if self.rule_formatter is None or self.rule_formatter(rule):
                rule_collection.add(rule)
        return

    value_domain = [True, False]
    ''' True: keep the k-th item in the LHS; False: move it to the RHS '''
    for value in value_domain:
        bits[k] = value
        if not value:
            # Same confidence-bound pruning as enumerate_subsets above
            left_itemset = []
            for index in range(len(bits)):
                if bits[index]:
                    left_itemset.append(item_set[index])
            left_value = self.freq_itemset_dict.get_frequency(
                itemset_2_string(left_itemset))
            confidence = 0
            if left_value > 0:
                confidence = total_freq / left_value
            if confidence < self.min_conf:
                bits[k] = True
                continue
            self.subsets(bits, item_set, k + 1, rule_collection, total_freq)
        else:
            self.subsets(bits, item_set, k + 1, rule_collection, total_freq)
    bits[k] = True

def load_feature_vectors(self):
    data = []
    lengths = []
    with open(self.files_info.non_redundant_rule_tmp_file, 'r') as feature_reader:
        print('Loading number of LHS and RHS features...')
        lhs_count = int(feature_reader.readline())
        rhs_count = int(feature_reader.readline())

        print('Loading feature vectors... ')
        for line in feature_reader:
            rule_text, f_vector = json.loads(line.strip())
            rule = AssociationRule.string_2_rule(rule_text.strip())
            lengths.append(rule.length())
            data.append(f_vector)
    return np.array(data), lengths, lhs_count, rhs_count

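# Illustrative usage (assuming the file was produced by extract_features_4_all_rules
# above): the first lhs_count columns describe each rule's LHS and the remaining
# rhs_count columns its RHS, so the matrix can be split back per side.
#
#   data, lengths, lhs_count, rhs_count = self.load_feature_vectors()
#   lhs_part = data[:, :lhs_count]
#   rhs_part = data[:, lhs_count:lhs_count + rhs_count]
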
def _extract_feature_vectors(self):
    #lhs_features, rhs_features = self._get_feature_names()
    observations_dict = self.load_freq_itemset_dictionary()
    features_writer = open(self.non_redundant_rule_tmp_file, 'w')
    for i in range(self.nthreads):
        input_file = self.rules_tmp_file + '.' + str(i)
        with open(input_file, 'r') as rules_reader:
            for line in rules_reader:
                rule = AssociationRule.string_2_rule(line.strip())
                f_vector = rule.compute_probs(observations_dict)
                #lhs_vector = self._extract_one_feature_vector(rule.left_items, lhs_features)
                #rhs_vector = self._extract_one_feature_vector(rule.right_items, rhs_features)
                #f_vector = lhs_vector + rhs_vector
                features_writer.write(json.dumps((rule.serialize(), f_vector)))
                features_writer.write('\n')
    features_writer.close()

def filter_association_rules(unexpected_rules, delta_1=0):
    ''' Keep only the rules whose score exceeds the delta_1 threshold '''
    rules = []
    for x in unexpected_rules:
        # x[0] holds the serialized rule; x[2][0][1] is the score compared against delta_1
        if x[2][0][1] > delta_1:
            rules.append(AssociationRule.string_2_rule(x[0]))
    return rules

def load_from_file(self, file_name):
    with open(file_name, "r") as text_file:
        for line in text_file:
            rule = AssociationRule.string_2_rule(line)
            self.rules[line.strip()] = rule

def load_file(self, file_name):
    with open(file_name, "r") as text_file:
        for line in text_file:
            rule = AssociationRule.string_2_rule(line)
            self.rules.append(rule)