def unescape(result: MarkedUpText) -> None:
    """
    A "private" helper that substitutes every character reference found by
    ``_charref`` in the plain text with the string produced by
    ``_replace_charref`` (e.g. an HTML code with the symbol it stands for).

    The text is rebuilt in one pass. Whenever a replacement has a different
    length than the fragment it replaces, the (old span, new span) pair is
    recorded and finally handed over to ``result.apply_transformations``.

    :param result: MarkedUpText containing resulting plain text;
                   its ``text`` attribute is updated in place
    """
    source = result.text
    pieces = []
    shifts = []  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
    cursor = 0

    for found in _charref.finditer(source):
        substitute = _replace_charref(found)
        begin, finish = found.span()
        new_finish = begin + len(substitute)
        # only length-changing replacements move anything
        if new_finish != finish:
            shifts.append(((begin, finish), (begin, new_finish)))
        pieces.append(source[cursor:begin])
        pieces.append(substitute)
        cursor = finish

    # the tail after the last reference (or the whole text if none was found)
    pieces.append(source[cursor:])
    result.text = ''.join(pieces)

    if shifts:
        result.apply_transformations(shifts)
def process_inner_tag(self,
                      result: MarkedUpText,
                      tag_label: str,
                      tag_regex: Pattern,
                      make_text_function: Callable[[Any], str]) -> None:
    """
    A "private" method that finds tags inside the plain text and replaces
    each of them with its plain-text representation wrapped into a pair
    of markers.

    For every match of ``tag_regex`` the text returned by
    ``make_text_function`` is padded with a space on the left unless the
    match starts the text or is preceded by a character from
    ``self.str_spaces``, and with a space on the right unless the match ends
    the text or is followed by a character from
    ``self.str_phrase_separators``. The padded text is then put between
    ``MarkedUpText.get_marker(tag_label, True)`` and
    ``MarkedUpText.get_marker(tag_label, False)``.

    :param result: a MarkedUpText variable with plain text to process;
                   its ``text`` attribute is replaced with the new text
    :param tag_label: label passed to ``MarkedUpText.get_marker``
    :param tag_regex: compiled pattern matching the tag to find
    :param make_text_function: callable that receives the match object and
                               returns the plain text for the tag
    """
    text = result.text
    text_len = len(text)
    pieces = []
    last_stop = 0

    for match in tag_regex.finditer(text):
        link_markup = make_text_function(match)
        src_s, src_e = match.start(), match.end()

        # ensure spaces between text and link text
        starts_space = src_s == 0 or text[src_s - 1] in self.str_spaces
        # match.end() is exclusive: the character that follows the tag is
        # text[src_e], and the tag ends the text when src_e == len(text)
        ends_space = (src_e >= text_len
                      or text[src_e] in self.str_phrase_separators)
        if not starts_space:
            link_markup = ' ' + link_markup
        if not ends_space:
            link_markup += ' '

        pieces.append(text[last_stop:src_s])
        pieces.append(MarkedUpText.get_marker(tag_label, True))
        pieces.append(link_markup)
        pieces.append(MarkedUpText.get_marker(tag_label, False))
        last_stop = src_e

    pieces.append(text[last_stop:])
    result.text = ''.join(pieces)
def check_ocr_text(self, result: MarkedUpText) -> None:
    """
    A "private" method that checks text obtained from embedded images.
    The method decides whether to leave or to delete these pieces of text
    depending on ``self.settings.ocr_sets``:

    - NEVER_STORE: nothing is counted or changed;
    - STORE_ALWAYS: statistics are updated, the text is left as is;
    - STORE_IF_NO_OTHER_TEXT: image text is removed when the length of the
      remaining (non-image) text reaches
      ``self.settings.ocr_vector_text_min_length``;
    - STORE_IF_MORE_TEXT: image text is removed when the remaining text is
      longer than the image text.

    Side effect: ``self.parse_stat.parsed_ocr_text_len`` is set to the value
    returned by ``count_text_in_images`` and that value is subtracted from
    ``self.parse_stat.parsed_text_len``.

    :param result: MarkedUpText, containing resulting plain text
    """
    ocr_sets = self.settings.ocr_sets
    if ocr_sets == OcrTextStoreSettings.NEVER_STORE:
        return

    stat = self.parse_stat
    stat.parsed_ocr_text_len = self.count_text_in_images(result)
    stat.parsed_text_len -= stat.parsed_ocr_text_len

    if ocr_sets == OcrTextStoreSettings.STORE_ALWAYS:
        return

    # remove some of OCR-d text fragments or remove all of them or just quit
    remove_ocrs = False
    if ocr_sets == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT and \
            stat.parsed_text_len >= self.settings.ocr_vector_text_min_length:
        remove_ocrs = True
    if ocr_sets == OcrTextStoreSettings.STORE_IF_MORE_TEXT and \
            stat.parsed_text_len > stat.parsed_ocr_text_len:
        remove_ocrs = True

    if not remove_ocrs:
        return

    # do remove text that was obtained from images;
    # remove_text_in_images takes the MarkedUpText object and edits its
    # text in place, so the object itself (not result.text) is passed and
    # the return value is not used
    self.remove_text_in_images(result)
def remove_text_in_images(self, result: MarkedUpText):
    """
    Cut every block delimited by the 'images' block markers (the markers
    themselves included) out of ``result.text``.

    The text is scanned once, left to right. An opening marker that has no
    closing marker after it stops the scan and is left untouched together
    with everything that follows it.

    :param result: MarkedUpText whose ``text`` is updated in place;
                   ``text`` is not reassigned if no block was removed
    """
    im_op, im_cl = MarkedUpText.BLOCK_MARKERS['images']
    text = result.text
    kept = []
    cursor = 0

    while True:
        p_start = text.find(im_op, cursor)
        if p_start < 0:
            break
        p_end = text.find(im_cl, p_start + len(im_op))
        if p_end < 0:
            # unterminated block: no closing marker exists past this point,
            # so there is nothing more to remove (retrying the search here
            # would find the same opening marker again and never finish)
            break
        kept.append(text[cursor:p_start])
        cursor = p_end + len(im_cl)

    if kept:
        kept.append(text[cursor:])
        result.text = ''.join(kept)
def process_inner_tag(self,
                      result: MarkedUpText,
                      tag_regex: Pattern,
                      make_text_function: Callable[[Any], str]) -> None:
    """
    A "private" method that finds tags inside the plain text, replaces each
    of them with its plain-text representation and stores the labels found
    in ``result``.

    For every match of ``tag_regex`` the text returned by
    ``make_text_function`` is padded with a space on the left unless the
    match starts the text or is preceded by a character from
    ``self.str_spaces``, and with a space on the right unless the match ends
    the text or is followed by a character from
    ``self.str_phrase_separators``. Replacements that change the length are
    reported to ``result.apply_transformations`` as (old span, new span)
    pairs, and a (start, end) pair per match is appended to
    ``result.labels['a']``.

    :param result: a MarkedUpText variable with plain text to process
    :param tag_regex: compiled pattern matching the tag to find
    :param make_text_function: callable that receives the match object and
                               returns the plain text for the tag
    """
    text = result.text
    text_len = len(text)
    pieces = []
    transformations = []  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
    new_labels = []
    last_stop = 0

    for match in tag_regex.finditer(text):
        link_markup = make_text_function(match)
        src_s, src_e = match.start(), match.end()

        # ensure spaces between text and link text
        starts_space = src_s == 0 or text[src_s - 1] in self.str_spaces
        # match.end() is exclusive: the character that follows the tag is
        # text[src_e], and the tag ends the text when src_e == len(text)
        ends_space = (src_e >= text_len
                      or text[src_e] in self.str_phrase_separators)
        if not starts_space:
            link_markup = ' ' + link_markup
        if not ends_space:
            link_markup += ' '

        end_e = src_s + len(link_markup)
        if end_e != src_e:
            transformations.append(((src_s, src_e), (src_s, end_e)))
        # NOTE(review): the label span starts at the match position in the
        # source text; after earlier length-changing replacements this may
        # differ from the position in the new text - confirm what
        # coordinates result.labels is expected to hold
        new_labels.append((src_s, end_e))

        pieces.append(text[last_stop:src_s])
        pieces.append(link_markup)
        last_stop = src_e

    pieces.append(text[last_stop:])
    result.text = ''.join(pieces)

    if transformations:
        result.apply_transformations(transformations)
    if new_labels:
        # labels are added after the transformations were applied
        if 'a' not in result.labels:
            result.labels['a'] = new_labels
        else:
            result.labels['a'] = result.labels['a'] + new_labels
def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
    """
    Removes linebreaks in the middle of the sentence. Usually, single
    linebreaks within a paragraph should be deleted and replaced
    with one space character. But we preserve the linebreaks if the
    paragraph is a list or a table. Unfortunately, presently we can't
    recognize a paragraph as a table (if the source is a PDF file).

    A paragraph is the text between the 'paragraphs' block markers. It is
    treated as a list (and left untouched) unless the number of its
    non-blank lines that do not match ``self.re_list_start`` exceeds one
    third (rounded up) of all its non-blank lines. In every other paragraph
    the matches of ``self.re_single_newline`` are replaced with a space.

    :param result: MarkedUpText containing resulted plain text; its ``text``
                   is reassigned only if at least one paragraph was changed
    """
    paragraph_op, paragraph_cl = MarkedUpText.BLOCK_MARKERS['paragraphs']
    text = result.text
    chunks = []
    copied_upto = 0  # everything before this index is already in chunks
    search_from = 0

    while True:
        p_start = text.find(paragraph_op, search_from)
        if p_start < 0:
            break
        body_start = p_start + len(paragraph_op)
        body_end = text.find(paragraph_cl, body_start)
        if body_end < 0:
            # no closing marker after this point: no complete paragraph left
            break
        # offsets always refer to the untouched source text, so they stay
        # valid even when a replacement changes the paragraph length
        search_from = body_end + len(paragraph_cl)

        par_text = text[body_start:body_end]
        par_lines = [line for line in par_text.split('\n') if line.strip()]
        if not par_lines:
            continue

        # if lines make a list then don't remove line breaks
        list_lines = sum(
            1 for line in par_lines if self.re_list_start.match(line))
        max_breaks_allowed = math.ceil(len(par_lines) / 3)
        if len(par_lines) - list_lines <= max_breaks_allowed:
            continue

        # remove extra "\n" inside the paragraph body
        chunks.append(text[copied_upto:body_start])
        chunks.append(self.re_single_newline.sub(' ', par_text))
        copied_upto = body_end

    if chunks:
        chunks.append(text[copied_upto:])
        result.text = ''.join(chunks)