def test_spans_overlapping_tokens():
    """Query spans reaching into several tokens return every token touched."""
    sentence = 'This is a sentence.'
    offsets = [(0, 4), (5, 7), (8, 9), (10, 18), (18, 19)]
    overlap = TokenOverlap(sentence, offsets)

    # Both queries start inside/at "is" and end inside "sentence".
    expected = [(5, 7), (8, 9), (10, 18)]
    for begin, end in ((6, 11), (5, 15)):
        assert overlap.overlapping_tokens(begin, end) == expected
def test_empty_overlap_between_tokens():
    """Spans lying in the gaps between tokens, or empty spans, hit no token."""
    sentence = 'This is a sentence.'
    offsets = [(0, 4), (5, 7), (8, 9), (10, 18), (18, 19)]
    overlap = TokenOverlap(sentence, offsets)

    # (4, 5) and (7, 8) cover single spaces; (18, 18) is a zero-length span.
    for begin, end in ((4, 5), (7, 8), (18, 18)):
        assert len(overlap.overlapping_tokens(begin, end)) == 0
def test_empty_overlap_at_beginning():
    """Spans confined to the leading whitespace must not overlap any token."""
    # Three leading spaces: the token offsets below start at index 3 and end
    # at 22, so the text must be padded accordingly for them to be valid.
    text = '   This is a sentence.'
    tokens = [(3, 7), (8, 10), (11, 12), (13, 21), (21, 22)]
    # Guard the fixture itself: every offset pair must slice out its token.
    assert [text[start:end] for start, end in tokens] == [
        'This', 'is', 'a', 'sentence', '.',
    ]

    to = TokenOverlap(text, tokens)
    assert len(to.overlapping_tokens(0, 2)) == 0
    assert len(to.overlapping_tokens(1, 3)) == 0
    assert len(to.overlapping_tokens(0, 3)) == 0
def test_identical_spans():
    """Querying with a token's exact offsets returns exactly that token."""
    sentence = 'This is a sentence.'
    offsets = [(0, 4), (5, 7), (8, 9), (10, 18), (18, 19)]
    overlap = TokenOverlap(sentence, offsets)

    for span in offsets:
        hits = overlap.overlapping_tokens(*span)
        assert len(hits) == 1
        assert hits[0] == span
def _compute_tp_total(self, input_gen):
    """Accumulate true-positive and total counts for each annotator pair.

    For every document yielded by ``input_gen()``, each 2-combination of its
    annotation files is passed to ``self._eval_func``. The returned ``tp``
    is added to slot 0 and both ``exp`` and ``pred`` are added to slot 1 via
    ``self._increment_counts``, indexed by annotator pair and document.

    Args:
        input_gen: Zero-argument callable returning an iterable of documents.
            Each document is accessed through ``txt_path``, ``ann_files``
            and ``doc_id``.

    Raises:
        AssertionError: If more documents are yielded than
            ``len(self._documents)``.
    """
    for doc_index, document in enumerate(input_gen()):
        assert doc_index < len(self._documents), \
            'Input generator yields more documents than expected!'

        # Token information is only built when a tokenizer was configured;
        # otherwise the evaluation function receives ``tokens=None``.
        to = None
        if self._token_func:
            text = read(document.txt_path)
            tokens = list(self._token_func(text))
            to = TokenOverlap(text, tokens)

        for anno_file_1, anno_file_2 in combinations(document.ann_files, 2):
            tp, exp, pred = self._eval_func(anno_file_1.ann_path,
                                            anno_file_2.ann_path,
                                            tokens=to)
            pair_idx = self._pair2idx[(anno_file_1.annotator_id,
                                       anno_file_2.annotator_id)]
            doc_idx = self._doc2idx[document.doc_id]
            self._increment_counts(tp, pair_idx, doc_idx, 0)
            # Both sides of the comparison feed the same "total" slot (1).
            self._increment_counts(exp, pair_idx, doc_idx, 1)
            self._increment_counts(pred, pair_idx, doc_idx, 1)
def find_all_matches_and_mismatches(self, input_gen):
    """Collect agreements and disagreements for every annotator pair.

    For each document yielded by ``input_gen()`` and each 2-combination of
    its annotation files, ``self._eval_func`` is called and one result dict
    is built with the keys ``doc_id`` (position of the document in the
    generator output), ``text_file``, ``ann_file``, ``annotator_1``,
    ``annotator_2``, ``matches`` and ``mismatches``.

    Returns:
        list of the result dicts, in processing order.
    """
    collected = []
    for doc_index, document in enumerate(input_gen()):
        assert doc_index < len(self._documents), 'Input generator yields more documents than expected!'

        # The raw text is always needed for extracting annotation strings;
        # token information is only built when a tokenizer is configured.
        text = read(document.txt_path)
        to = None
        if self._token_func:
            to = TokenOverlap(text, list(self._token_func(text)))

        # TODO: the following solution robustly works for 2 annotators.
        #       in case of more than 2 annotators, it would be very nice
        #       to further consolidate the results
        for first, second in combinations(document.ann_files, 2):
            tp, exp, pred = self._eval_func(first.ann_path, second.ann_path, tokens=to)

            # Derive false positives / false negatives from exp & pred.
            if self._token_func is None:
                fp = pred.difference(exp)
                fn = exp.difference(pred)
            else:
                fp = counter2list(Counter(pred) - Counter(exp))
                fn = counter2list(Counter(exp) - Counter(pred))

            # Metadata identifying the document and the compared pair.
            record = {
                'doc_id': doc_index,
                'text_file': os.path.basename(document.txt_path),
                'ann_file': first.ann_path.name,
                'annotator_1': first.annotator_id,
                'annotator_2': second.annotator_id,
            }

            # True positives: annotations on which both annotators agree.
            record['matches'] = self._format_text_tuples_for_pretty_printing(
                self._extract_sorted_annotation_text_tuples(tp, text),
                skip_str_conversion=[0, 1],
            )

            # Marked by annotator 2 but not by annotator 1 ...
            only_second = self._extract_sorted_annotation_text_tuples(
                fp, text, annotator=second.annotator_id)
            # ... and marked by annotator 1 but not by annotator 2.
            only_first = self._extract_sorted_annotation_text_tuples(
                fn, text, annotator=first.annotator_id)

            disagreements = sorted(only_second + only_first, key=lambda item: item[2])
            record['mismatches'] = self._format_text_tuples_for_pretty_printing(
                disagreements, skip_str_conversion=[0, 1, 2])

            collected.append(record)
    return collected