def __get_next_rule__(self, tokens, instances):
    """Greedily build the next rule by inserting one token at a time.

    Starting from an empty pattern, each round tries every token in
    ``tokens`` at every insertion position of the current pattern, wraps the
    candidate pattern in ``Rule`` and scores it with
    ``self.rule_score_fn(ys, predictions)``.  The prediction for an instance
    is ``self.positive_label`` when the candidate matches it and ``0``
    otherwise.  The best candidate of a round becomes the pattern extended
    in the next round; the search stops after a round that accepts nothing.

    A candidate is accepted when it matches at least
    ``self.min_positive_rules_covered`` instances and either scores strictly
    higher than the best so far, or ties on score while matching more
    instances.

    Side effect: prints a short summary of the chosen rule to stdout.

    Args:
        tokens: Candidate tokens.  Iterated once per round, so it must be
            re-iterable (a list, not a one-shot generator).
        instances: ``(x, y)`` pairs; ``x`` is handed to ``Rule.matches`` and
            ``y`` is collected as the true label.  Iterated once per
            candidate, so it must be re-iterable as well.

    Returns:
        ``(best_rule, un_matched_instances, remaining_tokens)``, or
        ``(None, None, None)`` when ``instances`` is empty or no candidate
        was accepted.  ``un_matched_instances`` holds the ``(x, y)`` pairs
        the chosen rule does not match.  ``remaining_tokens`` is whatever
        ``self.__above_doc_freq__`` returns for the document frequencies of
        the unmatched positives -- presumably the tokens still frequent
        enough to seed another rule (confirm against that helper).
    """
    # zip(*[]) yields nothing, so unpacking it directly would raise
    # ValueError; with no instances there is no rule to learn.
    columns = list(zip(*instances))
    if not columns:
        return None, None, None
    _, ys = columns

    best_pattern = []
    best_match_count = -1
    best_score = float('-inf')
    un_matched_instances = []

    improved = True
    while improved:
        improved = False
        # Candidates of this round all extend `best_pattern`; the winner is
        # only promoted once the round is over.
        round_best_pattern = best_pattern[:]

        for token in tokens:
            for insertion_index in range(len(best_pattern) + 1):
                new_pattern = (best_pattern[:insertion_index] + [token]
                               + best_pattern[insertion_index:])
                new_rule = Rule(new_pattern)

                un_matched = []
                predictions = []
                match_count = 0
                for x, y in instances:
                    if new_rule.matches(x):
                        match_count += 1
                        predictions.append(self.positive_label)
                    else:
                        predictions.append(0)
                        un_matched.append((x, y))

                score = self.rule_score_fn(ys, predictions)
                if not (score >= best_score
                        and match_count >= self.min_positive_rules_covered):
                    continue

                # If tied, always prefer ones that match more instances
                if score == best_score and match_count <= best_match_count:
                    continue

                round_best_pattern = new_pattern
                best_match_count = match_count
                best_score = score
                un_matched_instances = un_matched
                improved = True

        best_pattern = round_best_pattern

    if not best_pattern:
        return None, None, None

    best_rule = Rule(best_pattern)

    # Single parenthesised argument: prints identically whether `print` is
    # the Python 2 statement or the Python 3 function.
    print("\tNew rule added: {0}\n\t\tRule Score: {1}\n\t\tMatches: {2}".format(
        best_rule, best_score, best_match_count))

    # Compute remaining tokens from the positives the new rule left uncovered.
    un_matched_positives, _ = self.__partition_by_class__(un_matched_instances)
    positive_doc_freq = compute_document_frequency(un_matched_positives)
    remaining_tokens = self.__above_doc_freq__(positive_doc_freq, self.min_positive_rules_covered)

    return best_rule, un_matched_instances, remaining_tokens
Beispiel #2
0
    def __get_next_rule__(self, tokens, instances):
        """Greedily build the next rule by inserting one token at a time.

        Starting from an empty pattern, each round tries every token in
        ``tokens`` at every insertion position of the current pattern, wraps
        the candidate pattern in ``Rule`` and scores it with
        ``self.rule_score_fn(ys, predictions)``.  The prediction for an
        instance is ``self.positive_label`` when the candidate matches it and
        ``0`` otherwise.  The best candidate of a round becomes the pattern
        extended in the next round; the search stops after a round that
        accepts nothing.

        A candidate is accepted when it matches at least
        ``self.min_positive_rules_covered`` instances and either scores
        strictly higher than the best so far, or ties on score while matching
        more instances.

        Side effect: prints a short summary of the chosen rule to stdout.

        Args:
            tokens: Candidate tokens.  Iterated once per round, so it must
                be re-iterable (a list, not a one-shot generator).
            instances: ``(x, y)`` pairs; ``x`` is handed to ``Rule.matches``
                and ``y`` is collected as the true label.  Iterated once per
                candidate, so it must be re-iterable as well.

        Returns:
            ``(best_rule, un_matched_instances, remaining_tokens)``, or
            ``(None, None, None)`` when ``instances`` is empty or no
            candidate was accepted.  ``un_matched_instances`` holds the
            ``(x, y)`` pairs the chosen rule does not match.
            ``remaining_tokens`` is whatever ``self.__above_doc_freq__``
            returns for the document frequencies of the unmatched positives
            -- presumably the tokens still frequent enough to seed another
            rule (confirm against that helper).
        """
        # zip(*[]) yields nothing, so unpacking it directly would raise
        # ValueError; with no instances there is no rule to learn.
        columns = list(zip(*instances))
        if not columns:
            return None, None, None
        _, ys = columns

        best_pattern = []
        best_match_count = -1
        best_score = float('-inf')
        un_matched_instances = []

        improved = True
        while improved:
            improved = False
            # Candidates of this round all extend `best_pattern`; the winner
            # is only promoted once the round is over.
            round_best_pattern = best_pattern[:]

            for token in tokens:
                for insertion_index in range(len(best_pattern) + 1):
                    new_pattern = (best_pattern[:insertion_index] + [token]
                                   + best_pattern[insertion_index:])
                    new_rule = Rule(new_pattern)

                    un_matched = []
                    predictions = []
                    match_count = 0
                    for x, y in instances:
                        if new_rule.matches(x):
                            match_count += 1
                            predictions.append(self.positive_label)
                        else:
                            predictions.append(0)
                            un_matched.append((x, y))

                    score = self.rule_score_fn(ys, predictions)
                    if not (score >= best_score
                            and match_count >= self.min_positive_rules_covered):
                        continue

                    # If tied, always prefer ones that match more instances
                    if score == best_score and match_count <= best_match_count:
                        continue

                    round_best_pattern = new_pattern
                    best_match_count = match_count
                    best_score = score
                    un_matched_instances = un_matched
                    improved = True

            best_pattern = round_best_pattern

        if not best_pattern:
            return None, None, None

        best_rule = Rule(best_pattern)

        # Single parenthesised argument: prints identically whether `print`
        # is the Python 2 statement or the Python 3 function.
        print("\tNew rule added: {0}\n\t\tRule Score: {1}\n\t\tMatches: {2}".format(
            best_rule, best_score, best_match_count))

        # Compute remaining tokens from the positives the new rule left
        # uncovered.
        un_matched_positives, _ = self.__partition_by_class__(
            un_matched_instances)
        positive_doc_freq = compute_document_frequency(un_matched_positives)
        remaining_tokens = self.__above_doc_freq__(
            positive_doc_freq, self.min_positive_rules_covered)

        return best_rule, un_matched_instances, remaining_tokens