def test_estimate_text_abusing_headers(self):
    """
    Check the estimate for the ``text_abusing_headers.txt`` resource.

    The resource is loaded as UTF-8 and passed through
    ``pre_process_document`` before estimation; the reported
    ``extra_line_breaks_prob`` is expected to be below 50.
    """
    raw_text = load_resource_document(
        'lexnlp/utils/parsing/text_abusing_headers.txt', 'utf-8')
    cleaned_text = pre_process_document(raw_text)
    quality = ParsedTextQualityEstimator().estimate_text(cleaned_text)
    self.assertLess(quality.extra_line_breaks_prob, 50)
Example #2
0
    def correct_if_corrupted(self, text: str) -> str:
        """
        Estimate the quality of ``text`` and repair it when it looks corrupted.

        The text is returned unchanged when the estimated ``corrupted_prob``
        is below 50. Otherwise, if ``extra_line_breaks_prob`` is above 50,
        the result of ``correct_line_breaks`` is returned; in any other case
        the text is returned as is.

        Illustration of the intended repair. Given the input:
            1.1 Etymology

            Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical
            Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at

            Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a
            Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered

            the undoubtable source.
        the two unnecessary double line breaks are expected to be removed:
            1.1 Etymology

            Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical
            Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at
            Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a
            Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered
            the undoubtable source.

        :param text: a text possibly containing unnecessary double line breaks, see above
        :return: the corrected text, or the original text when no correction applies
        """
        estimator = ParsedTextQualityEstimator()
        quality = estimator.estimate_text(text)

        # Text that is unlikely to be corrupted is left untouched.
        if quality.corrupted_prob < 50:
            return text

        # The estimator that produced the estimate is handed over to the
        # line-break correction step.
        return (self.correct_line_breaks(text, estimator)
                if quality.extra_line_breaks_prob > 50
                else text)
    def test_estimate_dense_text(self):
        """
        Compare estimates for two parsed variants of a malformatted PDF.

        The ``pdf_malformat_parsed_default.txt`` resource is expected to get
        an ``extra_line_breaks_prob`` above 50, while the
        ``pdf_malformat_parsed_stripper.txt`` resource is expected to stay
        below 30. The same estimator instance is used for both texts.
        """
        default_text = load_resource_document(
            "lexnlp/utils/parsing/pdf_malformat_parsed_default.txt", 'utf-8')
        estimator = ParsedTextQualityEstimator()
        default_quality = estimator.estimate_text(default_text)
        self.assertGreater(default_quality.extra_line_breaks_prob, 50)

        stripper_text = load_resource_document(
            'lexnlp/utils/parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
        stripper_quality = estimator.estimate_text(stripper_text)
        self.assertLess(stripper_quality.extra_line_breaks_prob, 30)
    def test_estimate_fishy_header(self):
        text = """
Notwithstanding anything in this Section (B) of Article IV to the contrary, in the event any such disruption to Shmenant's operations and use of the demised premises is attributable to Landlord's negligence, or that of its agents, contractors, servants or employees, or is attributable to a breach by Landlord of its obligations under this lease, and if such disruption shall materially impair Shmenant's use of the demised premises for a period in excess of five (5) business days in duration, then a just proportion of the Rent, according to the nature and extent of the impairment to Shmenant's operation and use of the demised premises shall abate for any such period of time from the date of disruption which is in excess of said five (5) business days in duration.



ARTICLE V


RENT"""
        estimator = ParsedTextQualityEstimator()
        estim = estimator.estimate_text(text)
        self.assertLess(estim.extra_line_breaks_prob, 50)
Example #5
0
    def correct_line_breaks(self, text: str,
                            estimator: ParsedTextQualityEstimator = None) -> str:
        """
        Rebuild the text from the estimator's lines, normalizing the endings
        of lines that are followed by an unnecessary break.

        :param text: the source text; it is only split into lines here when
            no estimator is passed
        :param estimator: an optional estimator whose ``lines`` are used as
            is. When omitted, a new estimator is created and
            ``split_text_on_lines(text)`` is called on it.
            NOTE(review): a passed estimator is presumably expected to have
            already processed the same ``text`` - confirm against callers.
        :return: the concatenation of ``text`` and ``ending`` of every line,
            in order
        """
        if estimator is None:
            estimator = ParsedTextQualityEstimator()
            estimator.split_text_on_lines(text)

        # Collect the pieces and join them once instead of growing a string
        # with "+=" on every iteration.
        pieces = []
        for indx, line in enumerate(estimator.lines):
            if estimator.check_line_followed_by_unnecessary_break(indx):
                # The return value is not used, so the call presumably
                # updates the line in place before it is read below.
                self.normalize_line_ending(line)
            pieces.append(line.text)
            pieces.append(line.ending)
        return ''.join(pieces)