def test_file_4_use_ml(self):
    """Check section spans found in 'test_get_section_spans_1.txt'.

    ``get_section_spans`` is called without ``use_ml``, i.e. in its default
    detection mode. The test pins the total number of sections, verifies
    that ``skip_empty_headers=True`` leaves no section whose title is
    ``None``, and compares the second titled section against a reference
    dict.
    """
    text = self.get_text('test_get_section_spans_1.txt')

    # Every section, with or without a title.
    all_sections = list(get_section_spans(text))
    self.assertEqual(len(all_sections), 207)

    # Titled sections only: none may come back with a ``None`` title.
    titled_sections = list(get_section_spans(text, skip_empty_headers=True))
    untitled = [
        section for section in titled_sections if section['title'] is None
    ]
    self.assertEqual(len(untitled), 0)

    actual_second = titled_sections[1]
    expected_second = {
        'start': 2280,
        'end': 2340,
        'title': 'SECTION 2',
        'title_start': 2280,
        'title_end': 2289,
        'level': 1,
        'abs_level': 3,
        'text': 'SECTION 2. Letters of Credit........................... 15\n',
    }
    self.assertDictEqual(actual_second, expected_second)
def test_file_4_use_ml(self):
    """Check section spans found in 'test_get_section_spans_1.txt'.

    ``get_section_spans`` is called without ``use_ml``, i.e. in its default
    detection mode. The test prints every detected section for diagnostics,
    pins the total number of sections, verifies that
    ``skip_empty_headers=True`` leaves no section whose title is ``None``,
    and compares the second titled section against a reference
    ``DocumentSection``.
    """
    text = self.get_text('test_get_section_spans_1.txt')

    # Every section, with or without a title; dump them to stdout so a
    # count mismatch can be diagnosed from the test log.
    all_sections = list(get_section_spans(text))
    print(f'{len(all_sections)} sections are found')
    for section in all_sections:
        print(f'Section #{section.start}, "{section.title}"')
    self.assertEqual(len(all_sections), 207)

    # Titled sections only: none may come back with a ``None`` title.
    titled_sections = list(get_section_spans(text, skip_empty_headers=True))
    untitled = [
        section for section in titled_sections if section.title is None
    ]
    self.assertEqual(len(untitled), 0)

    actual_second = titled_sections[1]
    expected_second = DocumentSection(
        start=2280,
        end=2340,
        title='SECTION 2',
        title_start=2280,
        title_end=2289,
        level=1,
        abs_level=3,
        text='SECTION 2. Letters of Credit........................... 15\n',
    )
    self.assertEqual(actual_second, expected_second)
def test_file_4_use_regex(self):
    """Check section spans found with ``use_ml=False``.

    Runs ``get_section_spans`` on 'test_get_section_spans_1.txt' with the
    ML mode switched off, pins the total number of sections and compares
    the third section against a reference dict.
    """
    text = self.get_text('test_get_section_spans_1.txt')

    # Every section, with or without a title.
    found_sections = list(get_section_spans(text, use_ml=False))
    self.assertEqual(len(found_sections), 554)

    actual_third = found_sections[2]
    expected_third = {
        'start': 1378,
        'end': 1438,
        'title': 'SECTION 1',
        'title_start': 1378,
        'title_end': 1387,
        'level': 2,
        'abs_level': 3,
        'text': 'SECTION 1. Amount and Terms of Credit.................. 1\n',
    }
    self.assertDictEqual(actual_third, expected_third)
def get_error(self):
    """Return the summed squared relative error of detected section counts.

    For each reference file the sections are detected with ``use_ml=True``
    and counted. The relative deviation from the expected count,
    ``(found - expected) / expected``, is squared and accumulated over all
    files.

    Returns:
        The sum of the squared relative deviations.
    """
    # Expected number of sections per reference file.
    expected_counts = {
        '1582586_2015-08-31': 23,
        'test_get_section_spans_1.txt': 207,
    }

    total_error = 0
    for file_name, expected in expected_counts.items():
        text = self.get_text(file_name)
        found = len(list(get_section_spans(text, use_ml=True)))
        relative_delta = (found - expected) / expected
        total_error += relative_delta * relative_delta
    return total_error
def test_file_4_use_regex(self):
    """Check section spans found with ``use_ml=False``.

    Runs ``get_section_spans`` on 'test_get_section_spans_1.txt' with the
    ML mode switched off, pins the total number of sections and compares
    the third section against a reference ``DocumentSection``.
    """
    text = self.get_text('test_get_section_spans_1.txt')

    # Every section, with or without a title.
    found_sections = list(get_section_spans(text, use_ml=False))
    self.assertEqual(len(found_sections), 554)

    actual_third = found_sections[2]
    expected_third = DocumentSection(
        start=1378,
        end=1438,
        title='SECTION 1',
        title_start=1378,
        title_end=1387,
        level=2,
        abs_level=3,
        text='SECTION 1. Amount and Terms of Credit.................. 1\n',
    )
    self.assertEqual(actual_third, expected_third)
def test_title_start_end(self):
    """Check that title offsets always slice out the section title.

    For every section of 'skewed_document.txt' (detected with
    ``use_ml=False``, ``return_text=False``, ``skip_empty_headers=True``)
    the substring ``text[title_start:title_end]`` must equal ``title``,
    both before and after ``find_section_titles`` has been applied.
    """
    text = self.get_text(
        'lexnlp/nlp/en/tests/test_sections/skewed_document.txt')
    sentence_spans = get_sentence_span_list(text)
    sections = list(get_section_spans(
        text,
        use_ml=False,
        return_text=False,
        skip_empty_headers=True,
    ))
    self.assertGreater(len(sections), 3)

    def assert_titles_match_offsets():
        # The title coordinates must point at exactly the title string.
        for section in sections:
            sliced_title = text[section.title_start:section.title_end]
            self.assertEqual(section.title, sliced_title)

    # Title coordinates before enhancing titles ...
    assert_titles_match_offsets()

    # ... and after enhancing. The return value is not used, so
    # find_section_titles presumably updates ``sections`` in place.
    find_section_titles(sections, sentence_spans, text)
    assert_titles_match_offsets()
def test_bad_text(self):
    """The one-word input 'text' must produce an empty list of sections."""
    found_sections = list(get_section_spans('text'))
    self.assertEqual(found_sections, [])