def __get_next_rule__(self, tokens, instances):
    """Greedily learn the single best-scoring rule pattern.

    Starting from an empty pattern, repeatedly tries inserting every
    candidate token at every position of the current best pattern and
    keeps the insertion that maximizes ``self.rule_score_fn`` (ties are
    broken toward the candidate covering more instances). Stops when a
    full pass yields no improvement.

    Args:
        tokens: iterable of candidate tokens to insert into the pattern.
        instances: list of ``(x, y)`` pairs; ``x`` is matched against the
            rule, ``y`` is the gold label.

    Returns:
        A ``(best_rule, un_matched_instances, remaining_tokens)`` triple,
        or ``(None, None, None)`` when no acceptable rule is found.
    """
    # Guard: zip(*[]) would raise ValueError; no instances => no rule.
    if not instances:
        return None, None, None

    best_pattern = []
    best_match_count = -1
    best_score = float('-inf')
    un_matched_instances = []
    _, ys = zip(*instances)

    improved = True
    while improved:
        improved = False
        current_best_pattern = best_pattern[:]
        # One slot before/after every element of the current pattern.
        insertion_indices = range(len(current_best_pattern) + 1)
        for token in tokens:
            for insertion_index in insertion_indices:
                new_pattern = (best_pattern[:insertion_index]
                               + [token]
                               + best_pattern[insertion_index:])
                new_rule = Rule(new_pattern)
                un_matched = []
                predictions = []
                match_count = 0
                for x, y in instances:
                    if new_rule.matches(x):
                        match_count += 1
                        predictions.append(self.positive_label)
                    else:
                        predictions.append(0)
                        un_matched.append((x, y))
                score = self.rule_score_fn(ys, predictions)
                if score >= best_score and match_count >= self.min_positive_rules_covered:
                    # If tied, always prefer ones that match more instances
                    if score == best_score and match_count <= best_match_count:
                        continue
                    current_best_pattern = new_pattern
                    best_match_count = match_count
                    best_score = score
                    improved = True
                    un_matched_instances = un_matched
        best_pattern = current_best_pattern

    if len(best_pattern) == 0:
        return None, None, None

    best_rule = Rule(best_pattern)
    # Single-argument print() call: identical output under Python 2 and 3.
    print("\tNew rule added: {0}\n\t\tRule Score: {1}\n\t\tMatches: {2}".format(
        best_rule, best_score, best_match_count))

    # Compute remaining tokens: keep only tokens still frequent enough
    # among the unmatched positive instances to seed the next rule.
    un_matched_positives, un_matched_negatives = self.__partition_by_class__(
        un_matched_instances)
    positive_doc_freq = compute_document_frequency(un_matched_positives)
    remaining_tokens = self.__above_doc_freq__(
        positive_doc_freq, self.min_positive_rules_covered)
    return best_rule, un_matched_instances, remaining_tokens
def __get_next_rule__(self, tokens, instances):
    """Learn the next rule by greedy token insertion.

    Evaluates every candidate token at every insertion point of the
    current pattern, keeping the highest-scoring pattern according to
    ``self.rule_score_fn`` (ties go to the pattern covering more
    instances), and repeats until no insertion improves the score.

    Returns ``(rule, unmatched_instances, remaining_tokens)`` or
    ``(None, None, None)`` when no acceptable rule exists.
    """
    pattern = []
    top_matches = -1
    top_score = float('-inf')
    leftover = []
    _, labels = zip(*instances)

    keep_going = True
    while keep_going:
        keep_going = False
        candidate_pattern = pattern[:]
        slots = range(len(candidate_pattern) + 1)
        for tok in tokens:
            for slot in slots:
                trial = pattern[:slot] + [tok] + pattern[slot:]
                trial_rule = Rule(trial)
                missed = []
                preds = []
                covered = 0
                for features, label in instances:
                    if trial_rule.matches(features):
                        covered += 1
                        preds.append(self.positive_label)
                    else:
                        preds.append(0)
                        missed.append((features, label))
                trial_score = self.rule_score_fn(labels, preds)
                # Reject anything worse than the incumbent or under the
                # minimum coverage threshold.
                if trial_score < top_score or covered < self.min_positive_rules_covered:
                    continue
                # On a score tie, only strictly better coverage wins.
                if trial_score == top_score and covered <= top_matches:
                    continue
                candidate_pattern = trial
                top_matches = covered
                top_score = trial_score
                keep_going = True
                leftover = missed
        pattern = candidate_pattern

    if not pattern:
        return None, None, None

    rule = Rule(pattern)
    print("\tNew rule added: {0}\n\t\tRule Score: {1}\n\t\tMatches: {2}".format(
        rule, top_score, top_matches))

    # Restrict the token pool for the next rule to tokens that remain
    # frequent among the unmatched positive instances.
    positives, negatives = self.__partition_by_class__(leftover)
    doc_freq = compute_document_frequency(positives)
    usable_tokens = self.__above_doc_freq__(
        doc_freq, self.min_positive_rules_covered)
    return rule, leftover, usable_tokens