def correct_if_corrupted(
    self,
    text: str,
    transformations: Optional[List[Tuple[Tuple[int, int],
                                         Tuple[int, int]]]] = None
) -> str:
    """Return ``text``, repairing its line breaks if it looks corrupted.

    A fresh ``ParsedTextQualityEstimator`` scores the text. The text is
    returned untouched unless the estimate's ``corrupted_prob`` is not
    below 50 *and* its ``extra_line_breaks_prob`` is above 50; in that
    case the result of ``self.correct_line_breaks`` is returned instead.

    NOTE(review): both scores are compared against the literal 50, so
    they are presumably on a 0-100 scale -- confirm with the estimator.

    Args:
        text: The text to inspect and, if needed, correct.
        transformations: Forwarded unchanged as the ``transformations``
            keyword argument of ``self.correct_line_breaks``; this
            method never reads it itself.

    Returns:
        Either the original ``text`` or the line-break-corrected text.
    """
    quality_estimator = ParsedTextQualityEstimator()
    quality = quality_estimator.estimate_text(text)

    # Guard 1: the text is not considered corrupted at all.
    if quality.corrupted_prob < 50:
        return text
    # Guard 2: corrupted, but superfluous line breaks are not the cause.
    if not quality.extra_line_breaks_prob > 50:
        return text

    # Reuse the estimator that already analysed the text.
    return self.correct_line_breaks(
        text, quality_estimator, transformations=transformations)
Ejemplo n.º 2
0
    def correct_line_breaks(
            self,
            text: str,
            estimator: ParsedTextQualityEstimator = None) -> str:
        """Rebuild the text line by line, normalizing unnecessary breaks.

        Every line held by the estimator is visited in order. When
        ``estimator.check_line_followed_by_unnecessary_break`` reports
        the line's index, ``self.normalize_line_ending`` is applied to
        that line before its ``text`` and ``ending`` are emitted.

        Args:
            text: Source text. It is only used when ``estimator`` is
                ``None``; otherwise the output is built purely from
                ``estimator.lines``.
            estimator: An estimator whose ``lines`` are already
                populated. If ``None``, a new one is created and
                ``split_text_on_lines(text)`` is called on it.

        Returns:
            The concatenation of each line's ``text`` and ``ending``.
        """
        if estimator is None:
            # No pre-analysed estimator was supplied: split the text here.
            estimator = ParsedTextQualityEstimator()
            estimator.split_text_on_lines(text)

        # Collect fragments and join once instead of growing a string
        # with ``+=`` on every iteration.
        fragments = []
        for index, line in enumerate(estimator.lines):
            if estimator.check_line_followed_by_unnecessary_break(index):
                # Called for its effect on ``line``; the (possibly
                # updated) text/ending are read right after.
                self.normalize_line_ending(line)
            fragments.append(line.text)
            fragments.append(line.ending)
        return ''.join(fragments)
Ejemplo n.º 3
0
 def test_estimate_text_abusing_headers(self):
     """Check the extra-line-break score for the abusing-headers sample.

     Loads ``parsing/text_abusing_headers.txt``, pre-processes it, runs
     the quality estimator on the result and asserts that
     ``extra_line_breaks_prob`` is below 50.
     """
     raw_document = load_resource_document(
         'parsing/text_abusing_headers.txt', 'utf-8')
     document = pre_process_document(raw_document)

     quality = ParsedTextQualityEstimator().estimate_text(document)

     self.assertLess(quality.extra_line_breaks_prob, 50)