Ejemplo n.º 1
0
    def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
        """
        Replace single line breaks inside ordinary paragraphs with one space.

        Each paragraph span is taken from ``result.labels['paragraphs']``; when
        that label is missing or empty the whole text is handled as a single
        paragraph. A paragraph is considered a list, and its line breaks are
        kept, unless the number of non-blank lines that do NOT match
        ``self.re_list_start`` exceeds one third of its non-blank lines
        (rounded up). Tables should be preserved as well, but presently a
        paragraph cannot be recognized as a table (if the source is a PDF file).

        :param result: MarkedUpText containing the resulting plain text
        """
        # NOTE(review): the spans are read before any replacement happens; this
        # presumably relies on replace_by_regex keeping later offsets valid --
        # confirm against MarkedUpText.replace_by_regex.
        spans = result.labels.get('paragraphs') or [(0, len(result.text))]
        for span_start, span_end in spans:
            chunk = result.text[span_start:span_end]
            filled_rows = [row for row in chunk.split('\n') if row.strip()]
            if not filled_rows:
                # nothing but blank lines: nothing to join
                continue

            # count the rows that look like list items
            item_rows = sum(
                1 for row in filled_rows if self.re_list_start.match(row))
            plain_rows = len(filled_rows) - item_rows
            tolerated_plain_rows = math.ceil(len(filled_rows) / 3)
            if plain_rows <= tolerated_plain_rows:
                # mostly list items: keep the line breaks as they are
                continue

            result.replace_by_regex(
                self.re_single_newline, ' ', span_start, span_end)
Ejemplo n.º 2
0
    def test_replace_by_regex_extra_end(self):
        """
        Collapse whitespace runs (including a trailing one) to single spaces
        and check that the 'p' label spans are shifted to the new offsets.
        """
        source = 'A text   with extra   spaces.   '
        marked = MarkedUpText(source, labels={'p': [(7, 12), (22, 29)]})
        marked.replace_by_regex(re.compile(r'\s+'), ' ')

        self.assertEqual('A text with extra spaces. ', marked.text)

        shifted = marked.labels['p']
        for position, expected in enumerate([(6, 10), (18, 25)]):
            self.assertEqual(expected, shifted[position])
    def test_replace_by_regex_extra_longer(self):
        """
        Collapse three whitespace runs to single spaces and check that all
        three 'p' label spans are shifted to the new offsets.
        """
        source = 'A text   with extra   spaces, and   more spaces'
        spans = [(7, 12), (22, 32), (41, 46)]
        marked = MarkedUpText(source, labels={'p': spans})
        marked.replace_by_regex(re.compile(r'\s+'), ' ')

        self.assertEqual('A text with extra spaces, and more spaces',
                         marked.text)

        shifted = marked.labels['p']
        for position, expected in enumerate([(6, 10), (18, 28), (35, 40)]):
            self.assertEqual(expected, shifted[position])
Ejemplo n.º 4
0
    def test_replace_by_regex_limited(self):
        """
        Compare an unbounded replace_by_regex call with range-limited ones:
        limiting to the full text length must give the same text, limiting to
        the first half must give a different text.
        """
        text = """
        <p>Here (Improve  text segmentation   (section / page / paragraph / sentence), section 1.1 Use 
        markup from document parser) I described Tika’s   output in XHTML. In short:
        </p>
        """
        labels = {'p': [(7, 12), (22, 28)]}
        reg = re.compile(r'\s+')

        def fresh_markup():
            # every run gets its own copy of the label lists
            copied = {key: list(spans) for key, spans in labels.items()}
            return MarkedUpText(text, labels=copied)

        unbounded = fresh_markup()
        unbounded.replace_by_regex(reg, ' ')

        whole_range = fresh_markup()
        whole_range.replace_by_regex(reg, ' ', 0, len(text))
        self.assertEqual(unbounded.text, whole_range.text)

        first_half = fresh_markup()
        first_half.replace_by_regex(reg, ' ', 0, len(text) // 2)
        self.assertNotEqual(unbounded.text, first_half.text)
Ejemplo n.º 5
0
 def test_replace_by_regex_none(self):
     """Check that a pattern with no match leaves the text unchanged."""
     original = 'A text   with extra   spaces.'
     marked = MarkedUpText(original)
     unmatched = re.compile(r'AbC')
     marked.replace_by_regex(unmatched, ' ')
     self.assertEqual(original, marked.text)