Ejemplo n.º 1
0
def test_spans_overlapping_tokens():
    """Query spans that cut into tokens return every token they touch."""
    sentence = 'This is a sentence.'
    offsets = [(0, 4), (5, 7), (8, 9), (10, 18), (18, 19)]
    finder = TokenOverlap(sentence, offsets)

    # Both queries begin at/inside "is" and end inside "sentence", so the
    # same three tokens are expected for each of them.
    touched = [(5, 7), (8, 9), (10, 18)]
    for begin, end in [(6, 11), (5, 15)]:
        assert finder.overlapping_tokens(begin, end) == touched
Ejemplo n.º 2
0
def test_empty_overlap_between_tokens():
    """Query spans that cover no token character yield no tokens."""
    sentence = 'This is a sentence.'
    offsets = [(0, 4), (5, 7), (8, 9), (10, 18), (18, 19)]
    finder = TokenOverlap(sentence, offsets)

    # (4, 5) and (7, 8) are the blanks between words; (18, 18) is a
    # zero-width span at the boundary of the last two tokens.
    for begin, end in [(4, 5), (7, 8), (18, 18)]:
        assert len(finder.overlapping_tokens(begin, end)) == 0
Ejemplo n.º 3
0
def test_empty_overlap_at_beginning():
    """Query spans inside the leading whitespace yield no tokens."""
    sentence = '   This is a sentence.'
    offsets = [(3, 7), (8, 10), (11, 12), (13, 21), (21, 22)]
    finder = TokenOverlap(sentence, offsets)

    # The first token starts at offset 3, so none of these spans reach it.
    for begin, end in [(0, 2), (1, 3), (0, 3)]:
        assert len(finder.overlapping_tokens(begin, end)) == 0
Ejemplo n.º 4
0
def test_identical_spans():
    """Querying with a token's exact offsets returns only that token."""
    sentence = 'This is a sentence.'
    offsets = [(0, 4), (5, 7), (8, 9), (10, 18), (18, 19)]
    finder = TokenOverlap(sentence, offsets)

    for span in offsets:
        found = finder.overlapping_tokens(*span)
        assert len(found) == 1
        assert found[0] == span
Ejemplo n.º 5
0
 def _compute_tp_total(self, input_gen):
     exp_total = []
     pred_total = []
     for doc_index, document in enumerate(input_gen()):
         assert doc_index < len(
             self._documents
         ), 'Input generator yields more documents than expected!'
         to = None
         if self._token_func:
             text = read(document.txt_path)
             tokens = list(self._token_func(text))
             to = TokenOverlap(text, tokens)
         for anno_file_1, anno_file_2 in combinations(
                 document.ann_files, 2):
             #                 tp, exp, pred, exp_list, pred_list = self._eval_func(anno_file_1.ann_path, anno_file_2.ann_path, tokens=to)
             tp, exp, pred = self._eval_func(anno_file_1.ann_path,
                                             anno_file_2.ann_path,
                                             tokens=to)
             #                 exp_total.append(exp_list)
             #                 pred_total.append(pred_list)
             pair_idx = self._pair2idx[(anno_file_1.annotator_id,
                                        anno_file_2.annotator_id)]
             doc_idx = self._doc2idx[document.doc_id]
             self._increment_counts(tp, pair_idx, doc_idx, 0)
             self._increment_counts(exp, pair_idx, doc_idx, 1)
             self._increment_counts(pred, pair_idx, doc_idx, 1)
Ejemplo n.º 6
0
 def find_all_matches_and_mismatches(self, input_gen):
     """Collect agreements and disagreements for each annotation-file pair.

     Every document yielded by ``input_gen()`` has its text read, and each
     pair of its annotation files is compared with ``self._eval_func``.

     Args:
         input_gen: Zero-argument callable returning an iterable of
             documents providing ``txt_path`` and ``ann_files``.

     Returns:
         A list with one dict per (document, annotation-file pair) holding
         the keys 'doc_id', 'text_file', 'ann_file', 'annotator_1',
         'annotator_2', 'matches' and 'mismatches'. Note that 'doc_id' is
         the position of the document in the generator output.

     Raises:
         AssertionError: If more documents are yielded than there are
             entries in ``self._documents``.
     """
     collected = []
     for position, doc in enumerate(input_gen()):
         assert position < len(self._documents), 'Input generator yields more documents than expected!'
         raw_text = read(doc.txt_path)
         overlap = None
         if self._token_func:
             overlap = TokenOverlap(raw_text, list(self._token_func(raw_text)))
         # TODO: the following solution works robustly for 2 annotators.
         #       With more than 2 annotators it would be very nice to
         #       consolidate the results further.
         for first, second in combinations(doc.ann_files, 2):
             agreed, expected, predicted = self._eval_func(
                 first.ann_path, second.ann_path, tokens=overlap)
             # Turn exp & pred into false positives / false negatives.
             if self._token_func is None:
                 false_pos = predicted.difference(expected)
                 false_neg = expected.difference(predicted)
             else:
                 # Multiset difference, so repeated items are kept.
                 false_pos = counter2list(Counter(predicted) - Counter(expected))
                 false_neg = counter2list(Counter(expected) - Counter(predicted))
             # Metadata identifying the document and the annotator pair.
             entry = {
                 'doc_id': position,
                 'text_file': os.path.basename(doc.txt_path),
                 'ann_file': first.ann_path.name,
                 'annotator_1': first.annotator_id,
                 'annotator_2': second.annotator_id,
             }
             # True positives: annotations on which the pair agrees.
             entry['matches'] = self._format_text_tuples_for_pretty_printing(
                 self._extract_sorted_annotation_text_tuples(agreed, raw_text),
                 skip_str_conversion=[0, 1],
             )
             # Marked by annotator 2 but not by annotator 1 ...
             only_second = self._extract_sorted_annotation_text_tuples(
                 false_pos, raw_text, annotator=second.annotator_id)
             # ... and marked by annotator 1 but not by annotator 2.
             only_first = self._extract_sorted_annotation_text_tuples(
                 false_neg, raw_text, annotator=first.annotator_id)
             merged = sorted(only_second + only_first, key=lambda item: item[2])
             entry['mismatches'] = self._format_text_tuples_for_pretty_printing(
                 merged, skip_str_conversion=[0, 1, 2])
             collected.append(entry)
     return collected