Beispiel #1
0
 def test_counts_correct_amount_of_paragraphs_for_complex_56(self):
     """Zipping the complex_PDF_5 and complex_PDF_6 paragraphs gives 69 items."""
     # Each fixture goes through the same sanitize -> build pipeline.
     paragraphs_5 = interleave.build_paragraphs(
         interleave.sanitize_text(self.complex_PDF_5))
     paragraphs_6 = interleave.build_paragraphs(
         interleave.sanitize_text(self.complex_PDF_6))
     zipped = interleave.zip_sentences(paragraphs_5, paragraphs_6)
     self.assertEqual(69, len(zipped))
Beispiel #2
0
    def test_zip_sentences_to_tuple(self):
        """Zipping two identical short texts must equal ``processed_text``."""
        # Both inputs are the short_text fixture prefixed with a blank line.
        first_text = '\n\n' + self.short_text
        second_text = '\n\n' + self.short_text

        expected = self.processed_text
        first_paragraphs = interleave.build_paragraphs(
            interleave.sanitize_text(first_text))
        second_paragraphs = interleave.build_paragraphs(
            interleave.sanitize_text(second_text))
        zipped = interleave.zip_sentences(first_paragraphs, second_paragraphs)
        self.assertEqual(expected, zipped)
 def test_edge_valid_nonroman_string(self):
     """The edge_complex_7 fixture must be built into four paragraphs."""
     raw_text = '\n\n' + self.edge_complex_7
     sanitized = interleave.sanitize_text(raw_text)
     paragraphs = interleave.build_paragraphs(sanitized)
     self.assertEqual(4, len(paragraphs))
 def test_edge_marks_bad_paragraph_parse(self):
     """The edge_bad_parse fixture yields three items, one a parse-error marker."""
     raw_text = '\n\n' + self.edge_bad_parse
     sanitized = interleave.sanitize_text(raw_text)
     paragraphs = interleave.build_paragraphs(sanitized)
     self.assertEqual(3, len(paragraphs))
     # The marker must be present as a member of the built result.
     self.assertIn('2. PARSE ERROR', paragraphs)
 def test_edge_case_ignore_trailing_tables(self):
     """The table title must not appear in the first sanitized element."""
     sanitized = interleave.sanitize_text(self.edge_final_paragraph)
     # NOTE(review): only element 0 is inspected; confirm that later
     # elements do not need the same check.
     first_element = sanitized[0]
     self.assertNotIn(self.table_title, first_element)
 def test_edge_case_isolate_first_paragraph(self):
     """The edge_first_paragraph fixture must build into exactly one item."""
     sanitized = interleave.sanitize_text(self.edge_first_paragraph)
     paragraphs = interleave.build_paragraphs(sanitized)
     self.assertEqual(1, len(paragraphs))
 def test_edge_case_missing_paragraphs(self):
     """The edge_missing_paragraphs fixture must build into seven paragraphs."""
     raw_text = '\n\n' + self.edge_missing_paragraphs
     sanitized = interleave.sanitize_text(raw_text)
     paragraphs = interleave.build_paragraphs(sanitized)
     expected_count = 7
     self.assertEqual(expected_count, len(paragraphs))
 def test_edge_case_line_starts_with_numeric_sentence_end(self):
     """The edge_numbers fixture must build into seven paragraphs."""
     raw_text = '\n\n' + self.edge_numbers
     sanitized = interleave.sanitize_text(raw_text)
     paragraphs = interleave.build_paragraphs(sanitized)
     expected_count = 7
     self.assertEqual(expected_count, len(paragraphs))
Beispiel #9
0
 def test_counts_up_to_1000_paragraphs(self):
     """The thousand_paragraphs fixture must build into 1000 items."""
     sanitized = interleave.sanitize_text(self.thousand_paragraphs)
     paragraphs = interleave.build_paragraphs(sanitized)
     self.assertEqual(1000, len(paragraphs))
Beispiel #10
0
 def test_edge_case_strip_EPA_sigblock_2(self):
     """EPA_signature_2 must be absent from the newline-joined sanitized text."""
     sanitized_parts = interleave.sanitize_text(self.edge_EPA_sigblock_2)
     joined_text = '\n'.join(sanitized_parts)
     self.assertNotIn(self.EPA_signature_2, joined_text)
Beispiel #11
0
 def test_removes_nbsp_formfeed_page_breaks(self):
     """Sanitized text must contain no NBSP (chr 160) or form feed (chr 12).

     The sanitized output is joined into a single string before checking:
     a membership test on a sequence of strings only matches whole
     elements, so a character embedded inside an element would otherwise
     go unnoticed and the assertions would pass vacuously.
     """
     sanitized_text = '\n'.join(interleave.sanitize_text(self.nbsp_formfeed))
     self.assertNotIn(chr(160), sanitized_text)  # non-breaking space
     self.assertNotIn(chr(12), sanitized_text)  # form feed / page break
Beispiel #12
0
 def test_removes_page_headers(self):
     """The joined sanitized headers fixture must not match the header pattern."""
     sanitized_text = '\n'.join(interleave.sanitize_text(self.headers))
     # Form feed, 'C', any characters, then a digit followed by a newline.
     header_pattern = r'(\fC.*\d\n)'
     self.assertNotRegex(sanitized_text, header_pattern)
Beispiel #13
0
 def test_removes_page_numbers(self):
     """The joined sanitized pagenumbers fixture must hold no page-number line."""
     sanitized_text = '\n'.join(interleave.sanitize_text(self.pagenumbers))
     # A line consisting of digits followed by a single trailing space.
     page_number_pattern = r'\n\d+ \n'
     self.assertNotRegex(sanitized_text, page_number_pattern)
Beispiel #14
0
 def test_splits_multipage_paragraphs(self):
     """Building multipage_text must equal the split_multipage_text fixture."""
     raw_text = '\n\n' + self.multipage_text
     sanitized = interleave.sanitize_text(raw_text)
     paragraphs = interleave.build_paragraphs(sanitized)
     self.assertEqual(self.split_multipage_text, paragraphs)
Beispiel #15
0
 def test_splits_short_sentences(self):
     """Building short_text must equal the split_simple_text fixture."""
     raw_text = '\n\n' + self.short_text
     sanitized = interleave.sanitize_text(raw_text)
     paragraphs = interleave.build_paragraphs(sanitized)
     self.assertEqual(self.split_simple_text, paragraphs)