Example #1
def max_item(dictionary):
    """
    This is conceptually like an argmax function -- it finds the key whose
    corresponding value is the largest in the dictionary.
    """
    the_max = t4k.Max()
    for key, value in dictionary.items():
        the_max.add(value, key)
    large_val, large_key = the_max.get()
    return large_key, large_val
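All five examples exercise the same small accumulator, t4k.Max. For readers without the t4k package, here is a minimal stand-in, inferred purely from the calls made in these examples (add() takes a key and an optional payload, get() returns the best pair, max_key() returns the best key); the real class may behave differently.

class Max(object):
    """
    Minimal sketch of t4k.Max as inferred from the usage in these examples;
    the actual t4k implementation may differ.  Tracks the largest key seen
    so far, along with an optional associated value.
    """

    def __init__(self):
        self.best_key = None
        self.best_value = None

    def add(self, key, value=None):
        # Keep this (key, value) pair if the key beats the current maximum.
        if self.best_key is None or key > self.best_key:
            self.best_key = key
            self.best_value = value

    def get(self):
        # Returns (None, None) if nothing was added, which matches the
        # None check performed in Example #2.
        return self.best_key, self.best_value

    def max_key(self):
        return self.best_key

With this stand-in, max_item({'a': 3, 'b': 7}) returns ('b', 7).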
Example #2
def find_closest_near_matching_token(annotated_doc, sentence_id, token_id,
                                     lemma):

    # Convert the expected token location into an absolute token index.
    absolute_token_id = annotated_doc.absolutize([(sentence_id, token_id,
                                                   token_id + 1)])[0][1]

    max_overlap = t4k.Max()
    for distance in range(MAX_TOKEN_NEAR_MATCH_DISTANCE + 1):

        try:
            check_token_id = absolute_token_id + distance
            check_token = annotated_doc.tokens[check_token_id]
            if check_token['pos'].startswith('VB'):
                found_lemma = check_token['lemma']
                overlap_amount = character_overlap(lemma, found_lemma)
                max_overlap.add(overlap_amount, check_token_id)
        except IndexError:
            pass

        check_token_id = absolute_token_id - distance
        if check_token_id >= 0:
            check_token = annotated_doc.tokens[check_token_id]
            if check_token['pos'].startswith('VB'):
                found_lemma = check_token['lemma']
                overlap_amount = character_overlap(lemma, found_lemma)
                max_overlap.add(overlap_amount, check_token_id)

    sentence_lemmas = ' '.join(
        [t['lemma'] for t in annotated_doc.get_sentence_tokens(sentence_id)])

    max_overlap_amount, max_overlap_token_id = max_overlap.get()

    # Handle the case where no candidate verbs were found among the tokens
    # searched.
    if max_overlap_token_id is None:
        return None

    token = annotated_doc.tokens[max_overlap_token_id]
    central_token = annotated_doc.tokens[absolute_token_id]
    print(central_token)
    print('found max overlap: %.2f %s %s %s: \n\n%s' %
          (max_overlap_amount, lemma, central_token['lemma'], token['lemma'],
           sentence_lemmas))
    return token
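The helper character_overlap is defined elsewhere in this codebase. A plausible sketch, assuming it returns the Jaccard overlap between the character sets of the two lemmas (the actual definition may differ):

def character_overlap(lemma_a, lemma_b):
    # Hypothetical stand-in: Jaccard similarity between the sets of
    # characters in the two lemmas.  The real helper may use a different
    # notion of overlap.
    chars_a, chars_b = set(lemma_a), set(lemma_b)
    if not (chars_a or chars_b):
        return 0.0
    return len(chars_a & chars_b) / float(len(chars_a | chars_b))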
Example #3
    def _align(self, expected_attributions, found_attributions):

        # Before attempting to do alignments, make a map from each
        # attribution to its sentences.
        found_attrs_lookup = AttrSentenceLookup(found_attributions)

        # Find best predicted attribution for each reference attribution.
        all_alignments = defaultdict(dict)
        best_alignments = {}
        for expected_attr_id in expected_attributions:

            # Find eligible extracted attributions that overlap with
            # the same sentences (if any)
            expected_attr = expected_attributions[expected_attr_id]
            exp_sentences = expected_attr.get_sentence_ids()
            eligible_found_attrs = found_attrs_lookup.lookup(exp_sentences)

            # There may be no overlap at all with this attribution...
            if len(eligible_found_attrs) == 0:
                best_alignments[expected_attr_id] = None

            # If there is exactly one eligible predicted attribution, then
            # select it to be the best match.
            elif len(eligible_found_attrs) == 1:
                found_attr_id = list(eligible_found_attrs)[0]
                found_attr = found_attributions[found_attr_id]
                overlap = attribution_overlap(expected_attr, found_attr)
                best_alignments[expected_attr_id] = (found_attr_id, overlap)
                all_alignments[expected_attr_id][found_attr_id] = overlap

            # If there are multiple eligible attributions, take the one
            # with the highest overall overlap score.
            else:
                maxx = t4k.Max()
                for found_attr_id in eligible_found_attrs:
                    found_attr = found_attributions[found_attr_id]
                    overlap = attribution_overlap(expected_attr, found_attr)
                    all_alignments[expected_attr_id][found_attr_id] = overlap
                    maxx.add(overlap['overall'], (found_attr_id, overlap))
                overall, (best_found_attr_id, overlap) = maxx.get()
                best_alignments[expected_attr_id] = (best_found_attr_id,
                                                     overlap)

        return all_alignments, best_alignments
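Both returned structures are keyed by the expected attribution's id. Assuming attribution_overlap returns a dict of component scores including an 'overall' entry (as the maxx.add call above implies), a hypothetical consumer might look like this; the names evaluator, expected, and found are illustrative only.

# all_alignments:  expected_attr_id -> {found_attr_id: overlap, ...}
# best_alignments: expected_attr_id -> (found_attr_id, overlap) or None
all_alignments, best_alignments = evaluator._align(expected, found)
for expected_attr_id, best in best_alignments.items():
    if best is None:
        print('%s: no overlapping prediction' % expected_attr_id)
    else:
        found_attr_id, overlap = best
        print('%s -> %s (overall %.2f)' %
              (expected_attr_id, found_attr_id, overlap['overall']))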
Example #4
def calculate_best_score(scored_typed, positive=set([1]), negative=set([-1])):
    """
    This function helps to convert a scoring function into a classification
    function.  Given scores for items that are each truly "positive" or
    "negative", find the threshold that yields the highest F1 score when
    items scoring above it are labelled "positive" and items scoring at or
    below it are labelled "negative".

    INPUTS
        ``scored_typed`` should be a list of tuples, where the first element
        of each tuple is an item's score and the second is the item's true
        class, which should be a member of ``positive`` or ``negative`` (by
        default 1 or -1).

    OUTPUTS
        ``(best_f1, threshold)`` where ``best_f1`` is the highest F1 score
        achieved, obtained when ``threshold`` is used to label items
        according to their assigned scores.
    """

    # Tally up the number of positive and negative examples having given scores
    # This simplifies finding the best threshold if there are repeated scores.
    labels_by_score = defaultdict(lambda: {1:0, -1:0})
    for score, label in scored_typed:
        binary_label = binarize(label, positive, negative)
        labels_by_score[score][binary_label] += 1

    # Get a sorted list of the *unique* scores
    sorted_scores = sorted(labels_by_score.keys())

    # Count the total number of positive and negative examples.
    num_pos = sum(v[1] for v in labels_by_score.values())
    num_neg = sum(v[-1] for v in labels_by_score.values())

    # We start with the threshold below the minimum score, so that every item
    # is labelled positive.  The number of true positives is then just the
    # total number of positives.
    true_pos = num_pos
    false_pos = num_neg
    initial_f1 = f1(true_pos, false_pos, num_pos)
    initial_pointer = -1

    maximum = t4k.Max()
    maximum.add(initial_f1, initial_pointer)
    for pointer, score in enumerate(sorted_scores):

        # Determine the effect of moving the threshold just *above* this score
        # Any positives at this score are now mis-labelled as negatives
        true_pos -= labels_by_score[score][1]

        # Any negatives at this score are now correctly labelled as negatives
        false_pos -= labels_by_score[score][-1]

        # Recalculate the F1 score now.
        this_f1 = f1(true_pos, false_pos, num_pos)

        # If this is an improvement over the previous best value, keep it
        maximum.add(this_f1, pointer)

    best_f1, best_pointer = maximum.get()
    if best_pointer == -1:
        threshold = min(sorted_scores) - 1
    elif best_pointer == len(sorted_scores)-1:
        threshold = max(sorted_scores) + 1
    else:
        threshold = (
            sorted_scores[best_pointer] + sorted_scores[best_pointer+1]) / 2.0

    return best_f1, threshold
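The f1 helper is not shown above; here is a hedged sketch consistent with the call signature f1(true_pos, false_pos, num_pos) and the standard F1 definition. The real helper may differ in how it handles edge cases.

def f1(true_pos, false_pos, num_pos):
    # Sketch of the f1 helper assumed by calculate_best_score: standard F1
    # computed from true positives, false positives, and the total number
    # of positive items.  Returns 0.0 rather than dividing by zero at the
    # degenerate extremes.
    predicted_pos = true_pos + false_pos
    if predicted_pos == 0 or num_pos == 0:
        return 0.0
    precision = true_pos / float(predicted_pos)
    recall = true_pos / float(num_pos)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

With this sketch (and assuming binarize passes through labels already in the positive and negative sets), calculate_best_score([(0.9, 1), (0.8, 1), (0.3, -1), (0.2, -1)]) returns (1.0, 0.55): a perfect F1 at the midpoint threshold between 0.3 and 0.8.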
Example #5
    def test_constituency_token_spans(self):
        """
        Test that sentence constituents are still absolutely addressed, and 
        that all other constituents have been converted to sentence-relative 
        addressing.
        """

        # Choose anything except the first sentence.  We need to make sure that 
        # the constituents' token_spans are addressed relative to the start of
        # the sentence
        TEST_SENTENCE_INDEX = 1

        # First check that the sentence constituent has correct absolute token
        # span
        found_token_span = self.doc.sentences[TEST_SENTENCE_INDEX]['token_span']
        sent_abs_start, sent_abs_end = 36, 68
        expected_token_span = [(None, sent_abs_start, sent_abs_end)]
        self.assertEqual(found_token_span, expected_token_span)

        
        # Now check that all other constituents have correct sentence-relative
        # token spans
        nodes_in_dfs_order = pr.spans.get_dfs_constituents(
            self.doc.sentences[TEST_SENTENCE_INDEX])

        max_end = t4k.Max()

        past_first_token = False
        for depth, node in nodes_in_dfs_order:

            if depth == 0:
                continue

            self.assertEqual(len(node['token_span']), 1)
            sentence_id, start, end = node['token_span'][0]
            max_end.add(end)

            last_end = None

            # All token_spans should be addressed to the test sentence's ID
            self.assertEqual(sentence_id, TEST_SENTENCE_INDEX)

            # Every constituent up to and including the first token should
            # start at index zero
            if not past_first_token:
                self.assertEqual(start, 0)

            for child in pr.spans.get_constituency_children(node):
                self.assertEqual(len(child['token_span']), 1)
                token_span = child['token_span'][0]
                child_sentence_id, child_start, child_end = token_span
                self.assertEqual(child_sentence_id, sentence_id)

                # One child should pick up where the preceding child left off
                # (Except for the first child, which has no preceding child)
                if last_end is not None:
                    self.assertEqual(child_start, last_end)

                self.assertTrue(child_end <= end)
                last_end = child_end

            if last_end is not None:
                self.assertEqual(last_end, end)

            if node['constituent_type'] == 'token':
                past_first_token = True

        self.assertEqual(max_end.max_key(), sent_abs_end - sent_abs_start)