def get_word_boundaries(
    self, mention: LTTextLine
) -> List[Tuple[str, float, float, float, float]]:
    """Split a line of text into words and compute a bounding box per word.

    The text of ``mention`` is split on whitespace, and the ``LTChar``
    children of ``mention`` are consumed in order to cover each word. Every
    consumed glyph widens the box of the word it belongs to.

    :param mention: a line of text; iterating it yields its layout children,
        of which only ``LTChar`` instances are used
    :return: one five-element record ``[word, y0, x0, y1, x1]`` per word, in
        reading order. Coordinates are stored y-before-x. If the glyphs run
        out before a word is fully matched, a warning is logged and that
        word's coordinates keep the values accumulated so far (the initial
        infinities when no glyph was matched at all).
    """
    mention_text = mention.get_text()

    # Only LTChar children are collected; each entry is
    # (text, y0, x0, y1, x1) -- note the swap from bbox's (x0, y0, x1, y1).
    mention_chars: List[Tuple[str, float, float, float, float]] = []
    for obj in mention:
        if isinstance(obj, LTChar):
            x0, y0, x1, y1 = obj.bbox
            mention_chars.append((obj.get_text(), y0, x0, y1, x1))

    words = []
    num_chars = len(mention_chars)
    char_idx = 0
    # str.split() without arguments splits on any run of Unicode whitespace.
    for word in mention_text.split():
        curr_word = [word, float("Inf"), float("Inf"), float("-Inf"), float("-Inf")]
        len_idx = 0
        while len_idx < len(word):
            if char_idx >= num_chars:
                # Glyphs and text disagree; stop instead of raising IndexError.
                self.log.warning(
                    "Ran out of characters while matching word ({})".format(word)
                )
                break
            char, y0, x0, y1, x1 = mention_chars[char_idx]
            char_idx += 1
            # Skip every glyph that split() treats as a separator (space,
            # no-break space, tab, ...), otherwise it would be merged into
            # the next word and shift the alignment.
            if char.isspace():
                continue
            # A glyph's text may be longer than one character, so compare
            # against a slice of the same length rather than a single char.
            if word[len_idx:len_idx + len(char)] != char:
                self.log.warning("Out of order ({}, {})".format(word, char))
            curr_word[1] = min(curr_word[1], y0)
            curr_word[2] = min(curr_word[2], x0)
            curr_word[3] = max(curr_word[3], y1)
            curr_word[4] = max(curr_word[4], x1)
            len_idx += len(char)
        words.append(curr_word)
    return words
def _clean_textline(item: LTTextLine) -> Optional[LTTextLine]:
    """Annotate a text line with its cleaned text and font, or reject it.

    The line's text is passed through ``keep_allowed_chars`` and stripped of
    surrounding whitespace. When nothing remains the line is rejected;
    otherwise the result is stored on the line together with the font name
    and size reported by ``_font_of_mention``.

    :param item: the text line to clean; it is modified in place
    :return: ``item`` with ``clean_text``, ``font_name`` and ``font_size``
        set, or ``None`` when the cleaned text is empty
    """
    stripped = keep_allowed_chars(item.get_text()).strip()

    # Skip empty and invalid lines
    if not stripped:
        return None

    # TODO: add subscript detection and use latex underscore
    # or superscript
    item.clean_text = stripped
    font = _font_of_mention(item)
    item.font_name, item.font_size = font
    return item
def get_word_boundaries(
    self, mention: LTTextLine
) -> List[Tuple[str, float, float, float, float]]:
    """Split a line of text into words.

    The text of ``mention`` is split on whitespace, and the ``LTChar``
    children of ``mention`` are consumed in order to cover each word. Every
    consumed glyph widens the bounding box of the word it belongs to.

    :param mention: a line of text
    :return: a list of words, each a five-element record
        ``[word, y0, x0, y1, x1]`` (coordinates stored y-before-x). If the
        glyphs run out before a word is fully matched, a warning is logged
        and that word's coordinates keep the values accumulated so far (the
        initial infinities when no glyph was matched at all).
    """
    mention_text = mention.get_text()

    # Only LTChar children are collected; each entry is
    # (text, y0, x0, y1, x1) -- note the swap from bbox's (x0, y0, x1, y1).
    mention_chars: List[Tuple[str, float, float, float, float]] = []
    for obj in mention:
        if isinstance(obj, LTChar):
            x0, y0, x1, y1 = obj.bbox
            mention_chars.append((obj.get_text(), y0, x0, y1, x1))

    words = []
    num_chars = len(mention_chars)
    char_idx = 0
    # str.split() without arguments splits on any run of Unicode whitespace.
    for word in mention_text.split():
        curr_word = [word, float("Inf"), float("Inf"), float("-Inf"), float("-Inf")]
        len_idx = 0
        while len_idx < len(word):
            if char_idx >= num_chars:
                # Glyphs and text disagree; stop instead of raising IndexError.
                logger.warning(
                    "Ran out of characters while matching word ({})".format(word)
                )
                break
            char, y0, x0, y1, x1 = mention_chars[char_idx]
            char_idx += 1
            # Skip every glyph that split() treats as a separator, not only
            # " " and "\xa0"; any other whitespace glyph would otherwise be
            # merged into the next word and shift the alignment.
            if char.isspace():
                continue
            # A glyph's text may be longer than one character, so compare
            # against a slice of the same length.
            if word[len_idx:len_idx + len(char)] != char:
                logger.warning("Out of order ({}, {})".format(word, char))
            curr_word[1] = min(curr_word[1], y0)
            curr_word[2] = min(curr_word[2], x0)
            curr_word[3] = max(curr_word[3], y1)
            curr_word[4] = max(curr_word[4], x1)
            len_idx += len(char)
        words.append(curr_word)
    return words