def max_item(dictionary):
    """
    This is conceptually like an argmax function -- it finds the key whose
    corresponding value is the largest in the dictionary.

    Returns a ``(key, value)`` tuple.  If ``dictionary`` is empty, the
    result is whatever ``t4k.Max().get()`` yields with no entries
    (elsewhere in this file that appears to be ``(None, None)`` --
    TODO confirm against t4k).
    """
    the_max = t4k.Max()
    # .items() works under both Python 2 and Python 3; .iteritems() was
    # removed in Python 3.
    for key, value in dictionary.items():
        the_max.add(value, key)
    large_val, large_key = the_max.get()
    return large_key, large_val
def find_closest_near_matching_token(annotated_doc, sentence_id, token_id, lemma):
    """
    Search near the token at ``(sentence_id, token_id)`` for the verb token
    whose lemma has the greatest character overlap with ``lemma``.

    Tokens are scanned outward in both directions, up to
    MAX_TOKEN_NEAR_MATCH_DISTANCE positions away from the expected
    location.  Only tokens whose POS tag starts with 'VB' are considered.
    Returns the best-matching token dict, or None if no verb token was
    found within range.  Prints debug information when a match is found.
    """
    # Absolutize the location at which we're expecting the token.
    absolute_token_id = annotated_doc.absolutize(
        [(sentence_id, token_id, token_id + 1)])[0][1]

    max_overlap = t4k.Max()
    for distance in range(MAX_TOKEN_NEAR_MATCH_DISTANCE + 1):

        # Look ``distance`` tokens to the right of the expected position.
        # Walking off the end of the document is not an error.
        try:
            check_token_id = absolute_token_id + distance
            check_token = annotated_doc.tokens[check_token_id]
            if check_token['pos'].startswith('VB'):
                overlap_amount = character_overlap(lemma, check_token['lemma'])
                max_overlap.add(overlap_amount, check_token_id)
        except IndexError:
            pass

        # Look ``distance`` tokens to the left.  Skip distance 0 (already
        # checked above) and guard against negative indices, which would
        # silently wrap around to the end of the token list.
        check_token_id = absolute_token_id - distance
        if distance > 0 and check_token_id >= 0:
            check_token = annotated_doc.tokens[check_token_id]
            if check_token['pos'].startswith('VB'):
                overlap_amount = character_overlap(lemma, check_token['lemma'])
                max_overlap.add(overlap_amount, check_token_id)

    max_overlap_amount, max_overlap_token_id = max_overlap.get()

    # Handle the case where there weren't even any candidates among the
    # tokens searched.
    if max_overlap_token_id is None:
        return None

    token = annotated_doc.tokens[max_overlap_token_id]
    central_token = annotated_doc.tokens[absolute_token_id]

    # Debug output.  Joining the sentence's lemmas is only needed here, so
    # it is done after the early return above.  The single-argument
    # print(...) call form behaves identically under Python 2 and 3.
    sentence_lemmas = ' '.join(
        t['lemma'] for t in annotated_doc.get_sentence_tokens(sentence_id))
    print(central_token)
    print('found max overlap: %.2f %s %s %s: \n\n%s' % (
        max_overlap_amount, lemma, central_token['lemma'], token['lemma'],
        sentence_lemmas))

    return token
def _align(self, expected_attributions, found_attributions):
    """
    Pair each reference (expected) attribution with the extracted (found)
    attributions that share sentences with it.  Returns two mappings:
    ``all_alignments[expected_id][found_id] -> overlap`` for every eligible
    pairing, and ``best_alignments[expected_id]`` holding the single best
    ``(found_id, overlap)`` pair, or None when no extracted attribution
    touches the same sentences.
    """
    # Index the found attributions by the sentences they occupy, so
    # candidates for each expected attribution can be collected quickly.
    sentence_index = AttrSentenceLookup(found_attributions)

    all_alignments = defaultdict(dict)
    best_alignments = {}

    for exp_id in expected_attributions:
        exp_attr = expected_attributions[exp_id]

        # Candidates are extracted attributions overlapping the same
        # sentences (if any).
        candidates = sentence_index.lookup(exp_attr.get_sentence_ids())

        # No extracted attribution shares any sentence with this one.
        if not candidates:
            best_alignments[exp_id] = None
            continue

        # Exactly one candidate: it wins by default.
        if len(candidates) == 1:
            found_id = next(iter(candidates))
            overlap = attribution_overlap(exp_attr, found_attributions[found_id])
            all_alignments[exp_id][found_id] = overlap
            best_alignments[exp_id] = (found_id, overlap)
            continue

        # Several candidates: keep the one with the best overall overlap.
        ranking = t4k.Max()
        for found_id in candidates:
            overlap = attribution_overlap(exp_attr, found_attributions[found_id])
            all_alignments[exp_id][found_id] = overlap
            ranking.add(overlap['overall'], (found_id, overlap))
        _, (best_found_id, best_overlap) = ranking.get()
        best_alignments[exp_id] = (best_found_id, best_overlap)

    return all_alignments, best_alignments
def calculate_best_score(scored_typed, positive=frozenset([1]), negative=frozenset([-1])):
    """
    This function helps to convert from a scoring function to a
    classification function.  Given some function which provides scores to
    items that are either "positive" or "negative", find the best threshold
    score that gives the highest f1 score, when used to label any items
    whose score is higher as "positive" and lower as "negative".

    INPUTS

    ``scored_typed`` should be a non-empty list of tuples of scored items,
    where the first element of the tuple is the score, and the second
    element is the true class of the item, which must be a member of
    ``positive`` or ``negative`` (by default the labels 1 and -1).

    OUTPUTS

    ``(best_f1, threshold)`` where ``best_f1`` is the best f1 value,
    achieved when ``threshold`` is used to label items according to their
    assigned scores.
    """
    # NOTE: the defaults are frozensets (immutable) to avoid the
    # mutable-default-argument pitfall; they are only read, never mutated.

    # Tally up the number of positive and negative examples having given
    # scores.  This simplifies finding the best threshold if there are
    # repeated scores.
    labels_by_score = defaultdict(lambda: {1: 0, -1: 0})
    for score, label in scored_typed:
        binary_label = binarize(label, positive, negative)
        labels_by_score[score][binary_label] += 1

    # Get a sorted list of the *unique* scores.
    sorted_scores = sorted(labels_by_score)

    # Start with the threshold lower than the minimum score, then gradually
    # raise it, keeping track of the performance metric.
    num_pos = sum(v[1] for v in labels_by_score.values())
    num_neg = sum(v[-1] for v in labels_by_score.values())

    # With the threshold below the minimum score, every item is labelled
    # positive, so the initial true positives are all positives and the
    # initial false positives are all negatives.
    true_pos = num_pos
    false_pos = num_neg
    maximum = t4k.Max()
    # Pointer -1 represents "threshold below all scores".
    maximum.add(f1(true_pos, false_pos, num_pos), -1)

    for pointer, score in enumerate(sorted_scores):
        # Determine the effect of moving the threshold just *above* this
        # score.  Any positives at this score are now mis-labelled as
        # negatives...
        true_pos -= labels_by_score[score][1]
        # ...and any negatives at this score are now correctly labelled as
        # negatives.
        false_pos -= labels_by_score[score][-1]
        # Keep this threshold position if it improves on the best f1 so far.
        maximum.add(f1(true_pos, false_pos, num_pos), pointer)

    best_f1, best_pointer = maximum.get()

    # Convert the winning pointer back into an actual threshold value.
    if best_pointer == -1:
        threshold = min(sorted_scores) - 1
    elif best_pointer == len(sorted_scores) - 1:
        threshold = max(sorted_scores) + 1
    else:
        threshold = (
            sorted_scores[best_pointer] + sorted_scores[best_pointer+1]) / 2.0

    return best_f1, threshold
def test_constituency_token_spans(self): """ Test that sentence constituents are still absolutely addressed, and that all other constituents have been converted to sentence-relative addressing. """ # Choose anything except the first sentence. We need to make sure that # the constituents' token_spans are addressed relative to the start of # the sentence TEST_SENTENCE_INDEX = 1 # First check that the sentence constituent has correct absolute token # span found_token_span = self.doc.sentences[TEST_SENTENCE_INDEX]['token_span'] sent_abs_start, sent_abs_end = 36, 68 expected_token_span = [(None, sent_abs_start, sent_abs_end)] self.assertEqual(found_token_span, expected_token_span) # Now check that all other constituents have correct sentence-relative # token spans nodes_in_dfs_order = pr.spans.get_dfs_constituents( self.doc.sentences[TEST_SENTENCE_INDEX]) max_end = t4k.Max() past_first_token = False for depth, node in nodes_in_dfs_order: if depth == 0: continue self.assertEqual(len(node['token_span']), 1) sentence_id, start, end = node['token_span'][0] max_end.add(end) last_end = None #print depth, node['constituent_type'] #print 'start, end\t', start, end, last_end # All token_spans should be addressed to the test sentence's ID self.assertEqual(sentence_id, TEST_SENTENCE_INDEX) # The first constituents and tokens should have an index of zero if not past_first_token: self.assertEqual(start, 0) for child in pr.spans.get_constituency_children(node): self.assertEqual(len(child['token_span']), 1) token_span = child['token_span'][0] child_sentence_id, child_start, child_end = token_span self.assertEqual(child_sentence_id, sentence_id) # One child should pick up where the preceeding child left off # (Except for the first child, which has no preceeding child) if last_end is not None: self.assertEqual(child_start, last_end) self.assertTrue(child_end <= end) last_end = child_end #print ' '.join(t4k.strings([], # '\tchild_start, child_end, last_end\t', # child_start, child_end, 
last_end #)) if last_end is not None: #print 'did final child line up?\t', last_end, end self.assertEqual(last_end, end) if node['constituent_type'] == 'token': past_first_token = True self.assertEqual(max_end.max_key(), sent_abs_end - sent_abs_start)