def test_estimate_text_abusing_headers(self):
     """Estimate line-break quality for the 'text_abusing_headers' sample.

     The resource is loaded, run through ``pre_process_document`` and then
     estimated; the resulting ``extra_line_breaks_prob`` must be below 50.
     """
     raw_document = load_resource_document(
         'parsing/text_abusing_headers.txt', 'utf-8')
     cleaned_document = pre_process_document(raw_document)
     quality = ParsedTextQualityEstimator().estimate_text(cleaned_document)
     self.assertLess(quality.extra_line_breaks_prob, 50)
    def correct_line_breaks(
        self,
        text: str,
        estimator: Optional[ParsedTextQualityEstimator] = None,
        transformations: Optional[List[Tuple[Tuple[int, int],
                                             Tuple[int, int]]]] = None
    ) -> str:
        """Rebuild ``text`` with unnecessary line breaks normalized.

        Every line held by ``estimator`` is emitted as ``line.text`` followed
        by ``line.ending``. When the estimator reports that a line is followed
        by an unnecessary break, its ending is first rewritten through
        ``self.normalize_line_ending``.

        :param text: the text to correct; it is only split into lines here
            when no ``estimator`` is passed.
        :param estimator: an estimator whose ``lines`` are already populated.
            When ``None`` a new one is created and ``split_text_on_lines`` is
            called on ``text``.
        :param transformations: optional list that receives one
            ``((start, old_end), (start, new_end))`` entry for every line
            whose ending changed length. ``start`` is the offset at which the
            ending begins, ``old_end`` / ``new_end`` are that offset plus the
            ending length before / after normalization.
        :return: the concatenation of all line texts and (possibly
            normalized) endings.

        NOTE(review): offsets are accumulated from the original line lengths,
        i.e. they are positions in the input text provided the estimator's
        lines cover it contiguously -- confirm against the consumer of
        ``transformations``.
        """
        if estimator is None:
            estimator = ParsedTextQualityEstimator()
            estimator.split_text_on_lines(text)

        pieces = []
        # Offset of the current line's first character. It has to advance
        # after every line; leaving it at 0 made each recorded range relative
        # to the start of the text instead of the line's real position.
        total_len = 0

        for indx, line in enumerate(estimator.lines):
            # Remember the ending length before it is (possibly) rewritten.
            ending_len = len(line.ending)

            if estimator.check_line_followed_by_unnecessary_break(indx):
                self.normalize_line_ending(line)

            if transformations is not None and ending_len != len(line.ending):
                line_start = total_len + len(line.text)
                old_end = line_start + ending_len
                new_end = line_start + len(line.ending)
                transformations.append(
                    ((line_start, old_end), (line_start, new_end)))

            pieces.append(line.text)
            pieces.append(line.ending)
            # Advance by the length the line had before normalization.
            total_len += len(line.text) + ending_len
        return ''.join(pieces)
Example #3
0
 def correct_if_corrupted(self, text: str) -> str:
     """Correct line breaks in ``text`` when the estimator flags it.

     The text is returned unchanged when ``corrupted_prob`` is below 50, or
     when ``extra_line_breaks_prob`` is not above 50. Otherwise the result of
     ``correct_line_breaks`` (reusing the same estimator) is returned.
     """
     quality_estimator = ParsedTextQualityEstimator()
     estimate = quality_estimator.estimate_text(text)
     # Both thresholds must be crossed before any correction is attempted.
     if estimate.corrupted_prob < 50 or not estimate.extra_line_breaks_prob > 50:
         return text
     return self.correct_line_breaks(text, quality_estimator)
Example #4
0
    def test_estimate_dense_text(self):
        """Compare line-break estimates for two parses of a malformatted PDF.

        The 'default' parse must score above 50 for
        ``extra_line_breaks_prob``; the 'stripper' parse must score below 30.
        The same estimator instance is used for both texts.
        """
        default_parse = load_resource_document(
            'parsing/pdf_malformat_parsed_default.txt', 'utf-8')
        estimator = ParsedTextQualityEstimator()
        default_quality = estimator.estimate_text(default_parse)
        self.assertGreater(default_quality.extra_line_breaks_prob, 50)

        stripper_parse = load_resource_document(
            'parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
        stripper_quality = estimator.estimate_text(stripper_parse)
        self.assertLess(stripper_quality.extra_line_breaks_prob, 30)
Example #5
0
 def correct_if_corrupted(self,
                          text: str,
                          transformations: Optional[List[Tuple[Tuple[int, int], Tuple[int, int]]]] = None
                          ) -> str:
     """Correct line breaks in ``text`` when the estimator flags it.

     The text is returned unchanged when ``corrupted_prob`` is below 50, or
     when ``extra_line_breaks_prob`` is not above 50. Otherwise the result of
     ``correct_line_breaks`` is returned; the same estimator and the
     ``transformations`` list are forwarded to it.
     """
     quality_estimator = ParsedTextQualityEstimator()
     estimate = quality_estimator.estimate_text(text)
     # Both thresholds must be crossed before any correction is attempted.
     if estimate.corrupted_prob < 50 or not estimate.extra_line_breaks_prob > 50:
         return text
     return self.correct_line_breaks(
         text, quality_estimator, transformations=transformations)
    def test_estimate_fishy_header(self):
        text = """
Notwithstanding anything in this Section (B) of Article IV to the contrary, in the event any such disruption to Shmenant's operations and use of the demised premises is attributable to Landlord's negligence, or that of its agents, contractors, servants or employees, or is attributable to a breach by Landlord of its obligations under this lease, and if such disruption shall materially impair Shmenant's use of the demised premises for a period in excess of five (5) business days in duration, then a just proportion of the Rent, according to the nature and extent of the impairment to Shmenant's operation and use of the demised premises shall abate for any such period of time from the date of disruption which is in excess of said five (5) business days in duration.



ARTICLE V


RENT"""
        estimator = ParsedTextQualityEstimator()
        estim = estimator.estimate_text(text)
        self.assertLess(estim.extra_line_breaks_prob, 50)
Example #7
0
    def correct_line_breaks(
            self,
            text: str,
            estimator: ParsedTextQualityEstimator = None) -> str:
        """Rebuild ``text`` with unnecessary line breaks normalized.

        Each line held by ``estimator`` contributes its ``text`` followed by
        its ``ending``. Lines that the estimator reports as followed by an
        unnecessary break are first passed to ``self.normalize_line_ending``.
        When no estimator is supplied, a new one is created and ``text`` is
        split into lines with ``split_text_on_lines``.
        """
        if estimator is None:
            estimator = ParsedTextQualityEstimator()
            estimator.split_text_on_lines(text)

        chunks = []
        for position, entry in enumerate(estimator.lines):
            if estimator.check_line_followed_by_unnecessary_break(position):
                self.normalize_line_ending(entry)
            # Read text/ending only after the ending may have been rewritten.
            chunks.extend((entry.text, entry.ending))
        return ''.join(chunks)