# Example #1
 def iter_model_data_for_layout_document(  # pylint: disable=too-many-locals
         self,
         layout_document: LayoutDocument) -> Iterable[LayoutModelData]:
     """Yield model data for every layout token of ``layout_document``.

     Walks blocks -> lines -> tokens in document order, building a
     ``ContextAwareLayoutTokenFeatures`` per token (with per-line and
     per-document counters) and delegating the actual model-data
     generation to ``iter_model_data_for_context_layout_token_features``.
     Yields nothing for an empty document.
     """
     font_size_feature = RelativeFontSizeFeature(
         layout_document.iter_all_tokens())
     indentation_feature = LineIndentationStatusFeature()
     # Total text length of each line, keyed by the line object's id().
     line_text_length_by_id = {
         id(line): sum(len(tok.text) for tok in line.tokens)
         for block in layout_document.iter_all_blocks()
         for line in block.lines
     }
     if not line_text_length_by_id:
         LOGGER.debug('empty layout document')
         return
     max_line_text_length = max(line_text_length_by_id.values())
     total_token_count = sum(1 for _ in layout_document.iter_all_tokens())
     prev_token: Optional[LayoutToken] = None
     token_offset_in_document = 0
     for block in layout_document.iter_all_blocks():
         lines = block.lines
         for line_index, line in enumerate(lines):
             indentation_feature.on_new_line()
             tokens = line.tokens
             line_tokens_text = ''.join(tok.text for tok in tokens)
             char_offset_in_line = 0
             for token_index, token in enumerate(tokens):
                 features = ContextAwareLayoutTokenFeatures(
                     token,
                     layout_line=line,
                     previous_layout_token=prev_token,
                     document_features_context=self.document_features_context,
                     token_index=token_index,
                     token_count=len(tokens),
                     document_token_index=token_offset_in_document,
                     document_token_count=total_token_count,
                     line_index=line_index,
                     line_count=len(lines),
                     concatenated_line_tokens_text=line_tokens_text,
                     max_concatenated_line_tokens_length=max_line_text_length,
                     line_token_position=char_offset_in_line,
                     relative_font_size_feature=font_size_feature,
                     line_indentation_status_feature=indentation_feature)
                 yield from self.iter_model_data_for_context_layout_token_features(
                     features)
                 prev_token = token
                 char_offset_in_line += len(token.text)
                 token_offset_in_document += 1
# Example #2
 def iter_line_features(  # pylint: disable=too-many-locals
         self, layout_document: LayoutDocument
 ) -> Iterable[SegmentationLineFeatures]:
     """Yield segmentation features for every line of ``layout_document``.

     NOTE: the *same* ``SegmentationLineFeatures`` instance is mutated in
     place and re-yielded for each line; consumers must extract what they
     need before advancing the iterator.

     Repetitive-pattern detection: the first line of blocks near the top
     or bottom of each page (likely headers/footers) is reduced to a text
     pattern; a pattern occurring more than once marks its lines as
     repetitive.
     """
     segmentation_line_features = SegmentationLineFeatures(
         document_features_context=self.document_features_context)
     previous_token: Optional[LayoutToken] = None
     segmentation_line_features.document_token_count = sum(
         len(line.tokens) for block in layout_document.iter_all_blocks()
         for line in block.lines)
     # Fixed: local names were misspelled "candididate".
     pattern_candidate_block_iterable = (
         block for page in layout_document.pages
         for block_index, block in enumerate(page.blocks)
         if block_index < 2 or block_index > len(page.blocks) - 2)
     pattern_candidate_line_iterable = (
         block.lines[0] for block in pattern_candidate_block_iterable
         if block.lines and block.lines[0].tokens)
     all_pattern_by_line_id = {
         id(line): get_text_pattern(line.text)
         for line in pattern_candidate_line_iterable
     }
     LOGGER.debug('all_pattern_by_line_id: %s', all_pattern_by_line_id)
     pattern_by_line_id = {
         key: value
         for key, value in all_pattern_by_line_id.items() if len(value) >=
         8  # Java GROBID sometimes counts an additional trailing space
     }
     pattern_counter = Counter(pattern_by_line_id.values())
     LOGGER.debug('pattern_counter: %s', pattern_counter)
     seen_repetitive_patterns: Set[str] = set()
     document_token_index = 0
     # Hoisted out of the loops: compile the retokenization pattern once.
     retokenize_pattern = re.compile(r" |\t|\f|\u00A0")
     for page in layout_document.pages:
         blocks = page.blocks
         segmentation_line_features.page_blocks = blocks
         for block_index, block in enumerate(blocks):
             segmentation_line_features.page_block_index = block_index
             block_lines = block.lines
             if not block_lines:
                 # Guard: max() below would raise ValueError on an
                 # empty block (previously an unhandled crash).
                 continue
             segmentation_line_features.block_lines = block_lines
             block_line_texts = [line.text for line in block_lines]
             max_block_line_text_length = max(
                 len(text) for text in block_line_texts)
             first_block_token = next(iter(block.iter_all_tokens()), None)
             assert first_block_token
             for line_index, line in enumerate(block_lines):
                 segmentation_line_features.document_token_index = document_token_index
                 document_token_index += len(line.tokens)
                 segmentation_line_features.layout_line = line
                 segmentation_line_features.block_line_index = line_index
                 segmentation_line_features.max_block_line_text_length = (
                     max_block_line_text_length)
                 line_text = block_line_texts[line_index]
                 # re.split always returns at least one element, so the
                 # previous "if not retokenized_token_texts" check was
                 # dead code and has been removed.
                 retokenized_token_texts = retokenize_pattern.split(line_text)
                 if self.use_first_token_of_block:
                     # Java GROBID uses the first token in the block
                     token = first_block_token
                 elif line.tokens:
                     token = line.tokens[0]
                 else:
                     # Guard: a token-less line previously raised
                     # IndexError; skip it instead.
                     continue
                 segmentation_line_features.layout_token = token
                 segmentation_line_features.line_text = line_text
                 segmentation_line_features.concatenated_line_tokens_text = line_text
                 segmentation_line_features.token_text = retokenized_token_texts[
                     0].strip()
                 segmentation_line_features.second_token_text = (
                     retokenized_token_texts[1]
                     if len(retokenized_token_texts) >= 2 else '')
                 segmentation_line_features.previous_layout_token = previous_token
                 line_pattern = pattern_by_line_id.get(id(line), '')
                 LOGGER.debug('line_pattern: %r', line_pattern)
                 segmentation_line_features.is_repetitive_pattern = (
                     pattern_counter[line_pattern] > 1)
                 segmentation_line_features.is_first_repetitive_pattern = (
                     segmentation_line_features.is_repetitive_pattern
                     and line_pattern not in seen_repetitive_patterns)
                 if segmentation_line_features.is_first_repetitive_pattern:
                     seen_repetitive_patterns.add(line_pattern)
                 yield segmentation_line_features
                 previous_token = token