Example #1
0
 def get_word_boundaries(
     self, mention: LTTextLine
 ) -> List[Tuple[str, float, float, float, float]]:
     """Split a line of text into words and compute each word's bounding box.

     The line's text is split on whitespace, then the ``LTChar`` children of
     ``mention`` are consumed in order and their boxes are merged into the
     word they belong to.

     :param mention: a line of text; only its ``LTChar`` children are used
         for geometry
     :return: one ``[word, y0, x0, y1, x1]`` entry (a list) per word, where
         the coordinates are the min/max over the glyph boxes of that word.
         If the glyphs run out before every word is matched, a warning is
         logged and only the words that received at least one glyph are
         returned.
     """
     # Glyphs in iteration order, stored as (text, y0, x0, y1, x1).
     mention_chars: List[Tuple[str, float, float, float, float]] = []
     for obj in mention:
         if isinstance(obj, LTChar):
             x0, y0, x1, y1 = obj.bbox
             mention_chars.append((obj.get_text(), y0, x0, y1, x1))

     words = []
     char_idx = 0
     # str.split() without arguments splits on any run of whitespace, so the
     # glyph loop below must skip every whitespace glyph, not only " ".
     for word in mention.get_text().split():
         curr_word = [word, float("Inf"), float("Inf"), float("-Inf"), float("-Inf")]
         len_idx = 0
         while len_idx < len(word) and char_idx < len(mention_chars):
             char, y0, x0, y1, x1 = mention_chars[char_idx]
             char_idx += 1
             if char.isspace():
                 continue
             # A glyph may hold several characters (e.g. a ligature), so
             # compare against a slice of the same length.
             if word[len_idx : len_idx + len(char)] != char:
                 self.log.warning("Out of order ({}, {})".format(word, char))
             curr_word[1] = min(curr_word[1], y0)
             curr_word[2] = min(curr_word[2], x0)
             curr_word[3] = max(curr_word[3], y1)
             curr_word[4] = max(curr_word[4], x1)
             len_idx += len(char)
         if len_idx < len(word):
             # Glyphs ran out before the word was complete: stop instead of
             # indexing past the end of mention_chars.
             self.log.warning(
                 "Ran out of characters while matching word ({})".format(word)
             )
             if len_idx > 0:
                 words.append(curr_word)
             break
         words.append(curr_word)
     return words
Example #2
0
def _clean_textline(item: LTTextLine) -> Optional[LTTextLine]:
    """Annotate a text line with its cleaned text and font, or reject it.

    :param item: the text line to clean; it is modified in place
    :return: ``item`` with ``clean_text``, ``font_name`` and ``font_size``
        set, or ``None`` when nothing is left after filtering and stripping
    """
    stripped = keep_allowed_chars(item.get_text()).strip()
    if not stripped:
        # Skip empty and invalid lines
        return None

    # TODO: add subscript detection and use latex underscore
    # or superscript
    item.clean_text = stripped
    font_name, font_size = _font_of_mention(item)
    item.font_name = font_name
    item.font_size = font_size
    return item
Example #3
0
    def get_word_boundaries(
            self, mention: LTTextLine
    ) -> List[Tuple[str, float, float, float, float]]:
        """Split a line of text into words with their bounding boxes.

        The text of ``mention`` is split on whitespace; the ``LTChar``
        children are then walked in order and each glyph's box is merged
        into the word it belongs to.

        :param mention: a line of text
        :return: a list of words, each a ``[word, y0, x0, y1, x1]`` list
            whose coordinates are the min/max over that word's glyph boxes.
            If the glyphs run out before every word is matched, a warning
            is logged and only the words that received at least one glyph
            are returned.
        """
        # Glyphs in iteration order, stored as (text, y0, x0, y1, x1).
        mention_chars: List[Tuple[str, float, float, float, float]] = []
        for obj in mention:
            if isinstance(obj, LTChar):
                x0, y0, x1, y1 = obj.bbox
                mention_chars.append((obj.get_text(), y0, x0, y1, x1))

        total_chars = len(mention_chars)
        words = []
        char_idx = 0
        # str.split() without arguments splits on any whitespace, so every
        # whitespace glyph (not just " " and "\xa0") has to be skipped below
        # to keep words and glyphs aligned.
        for word in mention.get_text().split():
            curr_word = [
                word,
                float("Inf"),
                float("Inf"),
                float("-Inf"),
                float("-Inf"),
            ]
            len_idx = 0
            while len_idx < len(word):
                if char_idx >= total_chars:
                    break
                char, y0, x0, y1, x1 = mention_chars[char_idx]
                char_idx += 1
                if char.isspace():
                    continue
                # A glyph can carry more than one character (ligatures).
                if word[len_idx:len_idx + len(char)] != char:
                    logger.warning("Out of order ({}, {})".format(
                        word, char))
                curr_word[1] = min(curr_word[1], y0)
                curr_word[2] = min(curr_word[2], x0)
                curr_word[3] = max(curr_word[3], y1)
                curr_word[4] = max(curr_word[4], x1)
                len_idx += len(char)
            if len_idx < len(word):
                # Glyphs were exhausted mid-word: stop here rather than
                # raising IndexError on mention_chars.
                logger.warning(
                    "Ran out of characters while matching word ({})".format(
                        word))
                if len_idx > 0:
                    words.append(curr_word)
                break
            words.append(curr_word)
        return words