def test_estimate_text_abusing_headers(self):
    # Load the 'text_abusing_headers' sample and run it through the same
    # pre-processing step before estimating its quality.
    raw_document = load_resource_document(
        'parsing/text_abusing_headers.txt', 'utf-8')
    prepared = pre_process_document(raw_document)

    quality = ParsedTextQualityEstimator().estimate_text(prepared)

    # The extra-line-breaks score for this sample is expected to stay below 50.
    self.assertLess(quality.extra_line_breaks_prob, 50)
def correct_line_breaks(
        self,
        text: str,
        estimator: Optional['ParsedTextQualityEstimator'] = None,
        transformations: Optional[List[Tuple[Tuple[int, int], Tuple[int, int]]]] = None
) -> str:
    """
    Rebuild the text line by line, normalizing the ending of every line the
    estimator reports as being followed by an unnecessary break.

    :param text: text to correct. It is split into lines here only when no
        estimator is supplied.
    :param estimator: estimator whose ``lines`` are processed. When None, a
        new ParsedTextQualityEstimator is created and ``text`` is split with
        it; a supplied estimator is used as is.
    :param transformations: optional list that is extended in place with one
        ``((start, old_end), (start, new_end))`` entry for every line ending
        whose length was changed. ``start`` is the offset of that ending,
        counted by summing the lengths of all preceding lines with their
        original endings (i.e. an offset in the uncorrected text, assuming
        the lines concatenate back to it); ``old_end`` / ``new_end`` add the
        original / normalized ending length to ``start``.
    :return: the concatenation of every line's text and (possibly
        normalized) ending.
    """
    if estimator is None:
        estimator = ParsedTextQualityEstimator()
        estimator.split_text_on_lines(text)

    pieces = []
    # Offset of the current line's first character, measured with the
    # original (pre-normalization) line endings.
    total_len = 0
    for indx, line in enumerate(estimator.lines):  # TypedLineOrPhrase
        # Remember the ending length before normalization may change it.
        ending_len = len(line.ending)
        if estimator.check_line_followed_by_unnecessary_break(indx):
            self.normalize_line_ending(line)
            if transformations is not None and ending_len != len(line.ending):
                line_start = total_len + len(line.text)
                old_end = line_start + ending_len
                new_end = line_start + len(line.ending)
                transformations.append(
                    ((line_start, old_end), (line_start, new_end)))

        pieces.append(line.text)
        pieces.append(line.ending)
        # Advance past this line; without this every recorded offset would
        # be computed as if the line started at position 0.
        total_len += len(line.text) + ending_len
    return ''.join(pieces)
def correct_if_corrupted(self, text: str) -> str:
    """
    Return ``text`` with line breaks corrected, but only when the quality
    estimate flags it.

    The text is returned unchanged when its ``corrupted_prob`` is below 50,
    or when its ``extra_line_breaks_prob`` does not exceed 50. Otherwise the
    result of ``correct_line_breaks`` (called with the same estimator that
    produced the estimate) is returned.

    :param text: text to inspect and possibly correct
    :return: the original or the corrected text
    """
    estimator = ParsedTextQualityEstimator()
    quality = estimator.estimate_text(text)

    needs_fix = (not (quality.corrupted_prob < 50)
                 and quality.extra_line_breaks_prob > 50)
    if needs_fix:
        return self.correct_line_breaks(text, estimator)
    return text
def test_estimate_dense_text(self):
    # First sample: 'pdf_malformat_parsed_default' is expected to score
    # above 50 for extra line breaks.
    default_parse = load_resource_document(
        'parsing/pdf_malformat_parsed_default.txt', 'utf-8')
    estimator = ParsedTextQualityEstimator()
    default_quality = estimator.estimate_text(default_parse)
    self.assertGreater(default_quality.extra_line_breaks_prob, 50)

    # Second sample: 'pdf_malformat_parsed_stripper', estimated with the same
    # estimator instance, is expected to score below 30.
    stripper_parse = load_resource_document(
        'parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
    stripper_quality = estimator.estimate_text(stripper_parse)
    self.assertLess(stripper_quality.extra_line_breaks_prob, 30)
def correct_if_corrupted(self,
                         text: str,
                         transformations: Optional[List[Tuple[Tuple[int, int], Tuple[int, int]]]] = None
                         ) -> str:
    """
    Return ``text`` with line breaks corrected, but only when the quality
    estimate flags it.

    The text is returned unchanged when its ``corrupted_prob`` is below 50,
    or when its ``extra_line_breaks_prob`` does not exceed 50. Otherwise the
    result of ``correct_line_breaks`` (called with the same estimator that
    produced the estimate) is returned.

    :param text: text to inspect and possibly correct
    :param transformations: optional list handed through unchanged to
        ``correct_line_breaks`` via its ``transformations`` keyword
    :return: the original or the corrected text
    """
    estimator = ParsedTextQualityEstimator()
    quality = estimator.estimate_text(text)

    needs_fix = (not (quality.corrupted_prob < 50)
                 and quality.extra_line_breaks_prob > 50)
    if needs_fix:
        return self.correct_line_breaks(
            text, estimator, transformations=transformations)
    return text
def test_estimate_fishy_header(self):
    # A long paragraph followed by an upper-case heading ("ARTICLE V RENT").
    sample = """ Notwithstanding anything in this Section (B) of Article IV to the contrary, in the event any such disruption to Shmenant's operations and use of the demised premises is attributable to Landlord's negligence, or that of its agents, contractors, servants or employees, or is attributable to a breach by Landlord of its obligations under this lease, and if such disruption shall materially impair Shmenant's use of the demised premises for a period in excess of five (5) business days in duration, then a just proportion of the Rent, according to the nature and extent of the impairment to Shmenant's operation and use of the demised premises shall abate for any such period of time from the date of disruption which is in excess of said five (5) business days in duration. ARTICLE V RENT"""

    quality = ParsedTextQualityEstimator().estimate_text(sample)

    # The extra-line-breaks score for this sample is expected to stay below 50.
    self.assertLess(quality.extra_line_breaks_prob, 50)
def correct_line_breaks(self,
                        text: str,
                        estimator: ParsedTextQualityEstimator = None) -> str:
    """
    Rebuild the text line by line, normalizing the ending of every line the
    estimator reports as being followed by an unnecessary break.

    :param text: text to correct. It is split into lines here only when no
        estimator is supplied.
    :param estimator: estimator whose ``lines`` are processed. When None, a
        new ParsedTextQualityEstimator is created and ``text`` is split with
        it; a supplied estimator is used as is.
    :return: the concatenation of every line's text and (possibly
        normalized) ending.
    """
    if estimator is None:
        estimator = ParsedTextQualityEstimator()
        estimator.split_text_on_lines(text)

    pieces = []
    for position, current in enumerate(estimator.lines):
        if estimator.check_line_followed_by_unnecessary_break(position):
            # Adjusts the line's ending in place before it is emitted.
            self.normalize_line_ending(current)
        pieces.extend((current.text, current.ending))
    return ''.join(pieces)