Python ParsedTextQualityEstimator Examples

Programming Language: Python

Namespace/Package Name: apps.task.utils.nlp.parsed_text_quality_estimator

Examples at hotexamples.com: 7

Python ParsedTextQualityEstimator - 7 examples found. These are the top-rated real-world Python examples of apps.task.utils.nlp.parsed_text_quality_estimator.ParsedTextQualityEstimator, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

estimate_text(5)

ParsedTextQualityEstimator(4)

check_line_followed_by_unnecessary_break(2)

split_text_on_lines(2)

Example #1

Show file

File: test_parsed_text_quality_estimator.py Project: tx-anin/lexpredict-contraxsuite

 def test_estimate_text_abusing_headers(self):
     """Check the 'text_abusing_headers' fixture scores below 50 for extra line breaks.

     The resource is loaded as UTF-8 and run through ``pre_process_document``
     before it is handed to the estimator.
     """
     raw_document = load_resource_document(
         'parsing/text_abusing_headers.txt', 'utf-8')
     prepared = pre_process_document(raw_document)
     quality = ParsedTextQualityEstimator().estimate_text(prepared)
     self.assertLess(quality.extra_line_breaks_prob, 50)

Example #2

Show file

File: parsed_text_corrector.py Project: zajacm/lexpredict-contraxsuite

    def correct_line_breaks(
        self,
        text: str,
        estimator: Optional['ParsedTextQualityEstimator'] = None,
        transformations: Optional[List[Tuple[Tuple[int, int],
                                             Tuple[int, int]]]] = None
    ) -> str:
        """Rebuild ``text`` from the estimator's lines, normalizing unnecessary breaks.

        Every line whose index is reported by
        ``estimator.check_line_followed_by_unnecessary_break`` has its ending
        rewritten by ``self.normalize_line_ending``; the lines are then
        concatenated (text followed by ending) in order.

        :param text: source text; only used to split into lines when no
            estimator is supplied.
        :param estimator: an estimator whose ``lines`` are already populated.
            When ``None`` a new one is created and ``split_text_on_lines(text)``
            is called on it.
        :param transformations: optional list that is appended to in place.
            For each line whose ending length changed, one entry
            ``((start, old_end), (start, new_end))`` is added, where ``start``
            is the offset of the line ending and the two ends reflect the
            ending length before and after normalization.
        :return: the concatenated, corrected text.
        """
        if estimator is None:
            estimator = ParsedTextQualityEstimator()
            estimator.split_text_on_lines(text)

        pieces = []
        # Offset of the current line's first character, counted over the
        # lines *before* correction (text + original ending of each line).
        # NOTE(review): assumes estimator.lines tile the input text without
        # gaps, so this equals the offset in ``text`` - confirm.
        source_pos = 0

        for indx, line in enumerate(estimator.lines):  # TypedLineOrPhrase
            old_ending_len = len(line.ending)

            if estimator.check_line_followed_by_unnecessary_break(indx):
                self.normalize_line_ending(line)
            new_ending_len = len(line.ending)

            if transformations is not None and old_ending_len != new_ending_len:
                ending_start = source_pos + len(line.text)
                # NOTE(review): both ranges share the same start offset, as
                # before; confirm whether consumers expect the second range
                # in corrected-text coordinates instead.
                transformations.append(
                    ((ending_start, ending_start + old_ending_len),
                     (ending_start, ending_start + new_ending_len)))

            pieces.append(line.text)
            pieces.append(line.ending)
            # The running offset has to move past every processed line;
            # without this each entry would be computed as if its line
            # started at offset 0.
            source_pos += len(line.text) + old_ending_len

        return ''.join(pieces)

Example #3

Show file

 def correct_if_corrupted(self, text: str) -> str:
     """Return ``text``, with its line breaks corrected when the estimate calls for it.

     ``correct_line_breaks`` is applied only when the estimate's
     ``corrupted_prob`` is not below 50 and its ``extra_line_breaks_prob``
     is above 50; in every other case ``text`` is returned unchanged.
     """
     quality_checker = ParsedTextQualityEstimator()
     report = quality_checker.estimate_text(text)
     needs_repair = (not report.corrupted_prob < 50
                     and report.extra_line_breaks_prob > 50)
     # The estimator instance that produced the estimate is handed on to
     # the correction step.
     return self.correct_line_breaks(text, quality_checker) if needs_repair else text

Example #4

Show file

    def test_estimate_dense_text(self):
        """Compare extra-line-break scores of the two 'pdf_malformat_parsed' fixtures.

        The '..._default' fixture must score above 50 and the
        '..._stripper' fixture below 30. One estimator instance is used
        for both documents.
        """
        default_parse = load_resource_document(
            'parsing/pdf_malformat_parsed_default.txt', 'utf-8')
        estimator = ParsedTextQualityEstimator()
        self.assertGreater(
            estimator.estimate_text(default_parse).extra_line_breaks_prob, 50)

        stripper_parse = load_resource_document(
            'parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
        self.assertLess(
            estimator.estimate_text(stripper_parse).extra_line_breaks_prob, 30)

Example #5

Show file

 def correct_if_corrupted(self,
                          text: str,
                          transformations: Optional[List[Tuple[Tuple[int, int], Tuple[int, int]]]] = None
                          ) -> str:
     """Return ``text``, with its line breaks corrected when the estimate calls for it.

     ``correct_line_breaks`` is applied only when the estimate's
     ``corrupted_prob`` is not below 50 and its ``extra_line_breaks_prob``
     is above 50; otherwise ``text`` is returned unchanged.

     :param text: the text to check.
     :param transformations: forwarded as-is to ``correct_line_breaks``
         when a correction is performed.
     :return: the original or the corrected text.
     """
     quality_checker = ParsedTextQualityEstimator()
     report = quality_checker.estimate_text(text)
     if not report.corrupted_prob < 50 and report.extra_line_breaks_prob > 50:
         # Same estimator instance that produced the estimate is passed on.
         return self.correct_line_breaks(
             text, quality_checker, transformations=transformations)
     return text

Example #6

Show file

File: test_parsed_text_quality_estimator.py Project: tx-anin/lexpredict-contraxsuite

    def test_estimate_fishy_header(self):
        text = """
Notwithstanding anything in this Section (B) of Article IV to the contrary, in the event any such disruption to Shmenant's operations and use of the demised premises is attributable to Landlord's negligence, or that of its agents, contractors, servants or employees, or is attributable to a breach by Landlord of its obligations under this lease, and if such disruption shall materially impair Shmenant's use of the demised premises for a period in excess of five (5) business days in duration, then a just proportion of the Rent, according to the nature and extent of the impairment to Shmenant's operation and use of the demised premises shall abate for any such period of time from the date of disruption which is in excess of said five (5) business days in duration.



ARTICLE V


RENT"""
        estimator = ParsedTextQualityEstimator()
        estim = estimator.estimate_text(text)
        self.assertLess(estim.extra_line_breaks_prob, 50)

Example #7

Show file

    def correct_line_breaks(
            self,
            text: str,
            estimator: ParsedTextQualityEstimator = None) -> str:
        """Rebuild ``text`` from the estimator's lines, normalizing unnecessary breaks.

        When no estimator is given, a new one is created and
        ``split_text_on_lines(text)`` is called on it. Each line whose index
        is reported by ``check_line_followed_by_unnecessary_break`` is passed
        to ``self.normalize_line_ending``; the result is every line's text
        followed by its ending, concatenated in order.
        """
        if estimator is None:
            estimator = ParsedTextQualityEstimator()
            estimator.split_text_on_lines(text)

        fragments = []
        for position, current in enumerate(estimator.lines):
            if estimator.check_line_followed_by_unnecessary_break(position):
                self.normalize_line_ending(current)
            # Read text and ending only after the optional normalization.
            fragments.extend((current.text, current.ending))
        return ''.join(fragments)