예제 #1
0
    def flush_block(self) -> None:
        """
        Convert the buffered text into a ``TextBlock`` and reset the buffers.

        While ``in_body`` is 0 no block is produced: the stripped text buffer
        is passed to ``set_title`` if the last start tag was ``title``
        (case-insensitive), and the buffer is cleared. Otherwise the token
        buffer is tokenized and the words, the words seen while
        ``in_anchor_text`` is set, and the lines produced by wrapping at 80
        characters are counted. A token buffer that is blank, or that yields
        nothing but anchor start/end markers, is discarded without emitting a
        block.
        """
        if self.in_body == 0:
            if self.last_start_tag.lower() == "title":
                self.set_title(self.text_buffer.strip())
            self.clear_text_buffer()
            return
        if not self.token_buffer.strip():
            self.clear_text_buffer()
            return

        line_limit = 80
        word_count = 0
        linked_word_count = 0
        wrapped_lines = 0
        words_on_line = 0
        counted_tokens = 0
        # Starts at -1 so the separator space added with the first word is
        # not counted.
        line_length = -1

        for tok in self.tokenize(self.token_buffer):
            if tok == SpecialTokens.ANCHOR_TEXT_START:
                self.in_anchor_text = True
                continue
            if tok == SpecialTokens.ANCHOR_TEXT_END:
                self.in_anchor_text = False
                continue

            # Every token other than the two anchor markers is counted,
            # whether or not it is a word.
            counted_tokens += 1
            if not self.is_word(tok):
                continue

            word_count += 1
            words_on_line += 1
            if self.in_anchor_text:
                linked_word_count += 1
            line_length += len(tok) + 1
            if line_length > line_limit:
                # The word does not fit: it opens a new line on its own.
                wrapped_lines += 1
                line_length = len(tok)
                words_on_line = 1

        # Only anchor markers were found, so there is nothing to emit.
        if counted_tokens == 0:
            self.clear_text_buffer()
            return

        if wrapped_lines == 0:
            # Nothing wrapped: treat the whole text as a single line.
            words_in_wrapped_lines = word_count
            wrapped_lines = 1
        else:
            # Leave out the words on the last, unfinished line.
            words_in_wrapped_lines = word_count - words_on_line

        block = TextBlock(
            self.text_buffer.strip(),
            self.current_contained_text_elements,
            word_count,
            linked_word_count,
            words_in_wrapped_lines,
            wrapped_lines,
            self.offset_blocks,
        )
        self.current_contained_text_elements = set()
        self.offset_blocks += 1
        self.clear_text_buffer()
        block.tag_level = self.block_tag_level
        self.add_text_block(block)
        self.block_tag_level = -1
예제 #2
0
 def add_text_block(self, tb: TextBlock) -> None:
     """
     Label ``tb`` from the current parser state and store it.

     The last non-``None`` entry of ``font_size_stack`` (if any) is added as
     a ``font-<size>`` label, then every entry of every stack in
     ``label_stacks`` is applied through its ``add_to`` method, and finally
     the block is appended to ``text_blocks``.
     """
     innermost_size = next(
         (size for size in self.font_size_stack[::-1] if size is not None),
         None,
     )
     if innermost_size is not None:
         tb.add_label(f"font-{innermost_size}")

     pending = (entry for stack in self.label_stacks for entry in stack)
     for entry in pending:
         entry.add_to(tb)

     self.text_blocks.append(tb)
예제 #3
0
def test_merge():
    """Check the text, counts, densities, labels and offsets after ``merge_next``."""
    # Arrange: two consecutive blocks, each carrying a different label.
    head = TextBlock("AA BB CC ", {0}, 3, 3, 3, 1, 0)
    tail = TextBlock("DD EE FF GG HH II JJ .", {1}, 6, 0, 6, 2, 1)
    head.add_labels(DefaultLabels.MIGHT_BE_CONTENT)
    tail.add_labels(DefaultLabels.ARTICLE_METADATA)

    # Act: fold the second block into the first.
    head.merge_next(tail)

    # Assert: texts are joined by a newline and the counters are summed.
    assert head.text == "AA BB CC \nDD EE FF GG HH II JJ ."
    assert head.num_words == 9
    assert head.num_words_in_anchor_text == 3
    # 3 linked words out of 9 -> one third, compared to 7 decimal places.
    assert round(abs(head.link_density - 1.0 / 3.0), 7) == 0
    assert head.text_density == 3
    assert head.labels == {
        DefaultLabels.MIGHT_BE_CONTENT,
        DefaultLabels.ARTICLE_METADATA,
    }
    assert head.offset_blocks_start == 0
    assert head.offset_blocks_end == 1
예제 #4
0
    def classify(self, prev_block: TextBlock, curr_block: TextBlock,
                 next_block: TextBlock) -> bool:
        """
        Decide whether ``curr_block`` is content from link and text densities.

        The decision is a fixed tree of threshold comparisons on the link
        density and text density of the previous, current and next block.
        The result is stored in ``curr_block.is_content``.

        :param prev_block: block preceding ``curr_block``.
        :param curr_block: block being classified; its ``is_content`` is updated.
        :param next_block: block following ``curr_block``.
        :return: ``True`` if ``curr_block.is_content`` was changed by this call,
            ``False`` if it already had the computed value.
        """
        if curr_block.link_density <= 0.333333:
            if prev_block.link_density <= 0.555556:
                if curr_block.text_density <= 9:
                    if next_block.text_density <= 10:
                        if prev_block.text_density <= 4:
                            is_content = False
                        else:
                            is_content = True
                    else:
                        is_content = True
                else:
                    if next_block.text_density == 0:
                        is_content = False
                    else:
                        is_content = True
            else:
                if next_block.text_density <= 11:
                    is_content = False
                else:
                    is_content = True
        else:
            is_content = False

        # A change is reported only when the stored flag differs from the new
        # decision; comparing for sameness here would invert the result.
        changes = curr_block.is_content != is_content
        curr_block.is_content = is_content

        return changes
예제 #5
0
    def classify(self, prev_block: TextBlock, curr_block: TextBlock,
                 next_block: TextBlock) -> bool:
        """
        Decide whether ``curr_block`` is content from link density and word counts.

        The decision is a fixed tree of threshold comparisons on the link
        density and number of words of the previous, current and next block.
        The result is stored in ``curr_block.is_content``.

        :param prev_block: block preceding ``curr_block``.
        :param curr_block: block being classified; its ``is_content`` is updated.
        :param next_block: block following ``curr_block``.
        :return: ``True`` if ``curr_block.is_content`` was changed by this call,
            ``False`` if it already had the computed value.
        """
        if curr_block.link_density <= 0.333333:
            if prev_block.link_density <= 0.555556:
                if curr_block.num_words <= 16:
                    if next_block.num_words <= 15:
                        if prev_block.num_words <= 4:
                            is_content = False
                        else:
                            is_content = True
                    else:
                        is_content = True
                else:
                    is_content = True
            else:
                if curr_block.num_words <= 40:
                    if next_block.num_words <= 17:
                        is_content = False
                    else:
                        is_content = True
                else:
                    is_content = True
        else:
            is_content = False

        # A change is reported only when the stored flag differs from the new
        # decision; comparing for sameness here would invert the result.
        changes = curr_block.is_content != is_content
        curr_block.is_content = is_content

        return changes
예제 #6
0
    def classify(self, prev_block: TextBlock, curr_block: TextBlock,
                 next_block: TextBlock) -> bool:
        """
        Decide whether ``curr_block`` is content using three alternative rules.

        The block is content when any of these holds:

        1. it has a non-zero link density and the next block has more than
           11 words;
        2. it has more than 19 words;
        3. the next block has more than 6 words, neither neighbour contains
           links, and at least one of the three blocks is long enough
           (current > 6, previous > 7 or next > 19 words).

        The result is stored in ``curr_block.is_content``.

        :param prev_block: block preceding ``curr_block``.
        :param curr_block: block being classified; its ``is_content`` is updated.
        :param next_block: block following ``curr_block``.
        :return: ``True`` if ``curr_block.is_content`` was changed by this call,
            ``False`` if it already had the computed value.
        """
        cond1 = curr_block.link_density > 0 and next_block.num_words > 11
        cond2 = curr_block.num_words > 19
        cond3 = (
            next_block.num_words > 6
            and next_block.link_density == 0
            and prev_block.link_density == 0
            and (curr_block.num_words > 6 or prev_block.num_words > 7 or next_block.num_words > 19)
        )
        is_content = cond1 or cond2 or cond3

        # A change is reported only when the stored flag differs from the new
        # decision; comparing for sameness here would invert the result.
        changes = curr_block.is_content != is_content
        curr_block.is_content = is_content

        return changes
예제 #7
0
    def process(self, doc: TextDocument) -> bool:
        """
        Split every block whose text contains a ``NEWLINE_REGEX`` match.

        Blocks that do not split are kept as they are. A block that splits
        into two or more pieces is replaced by one new ``TextBlock`` per
        piece, built from the piece's text alone and given the original
        block's ``is_content`` flag and labels.

        :param doc: document whose ``text_blocks`` are examined; the list is
            replaced only when at least one block was split.
        :return: ``True`` if any block was split, otherwise ``False``.
        """
        rebuilt = []
        did_split = False

        for block in doc.text_blocks:
            pieces = self.NEWLINE_REGEX.split(block.text)
            if len(pieces) < 2:
                rebuilt.append(block)
                continue

            did_split = True
            content_flag = block.is_content
            inherited_labels = block.labels
            for piece in pieces:
                child = TextBlock(piece)
                child.is_content = content_flag
                child.add_labels(inherited_labels)
                rebuilt.append(child)

        if did_split:
            doc.text_blocks = rebuilt
        return did_split
예제 #8
0
def make_doc(words_arr,
             num_anchor_words_arr=None,
             is_content_arr=None,
             label_arr=None):
    """
    Build a ``TextDocument`` with one ``TextBlock`` per entry of ``words_arr``.

    :param words_arr: per block, either an int (the block text is that many
        leading entries of ``default_words`` joined by spaces) or the block
        text itself.
    :param num_anchor_words_arr: optional per-block anchor word counts; a
        missing sequence or missing index yields 0.
    :param is_content_arr: optional per-block ``is_content`` values; blocks
        without an entry keep whatever ``TextBlock`` sets initially.
    :param label_arr: optional per-block labels; each entry may be ``None``
        (no label), a list of labels, or a single label.
    :return: the assembled ``TextDocument``.
    """
    built = []
    for position, spec in enumerate(words_arr):
        if isinstance(spec, int):
            text, word_total = ' '.join(default_words[:spec]), spec
        else:
            # NOTE(review): for literal text the word count is the number of
            # spaces, i.e. one less than the number of space-separated words
            # in text without a trailing space -- confirm this is intended.
            text, word_total = spec, spec.count(' ')

        # Optional per-block inputs: a missing list (None) raises TypeError,
        # a list that is too short raises IndexError.
        try:
            anchor_total = num_anchor_words_arr[position]
        except (TypeError, IndexError):
            anchor_total = 0

        block = TextBlock(text, set(), word_total, anchor_total, 0, 0, position)

        try:
            block.is_content = is_content_arr[position]
        except (TypeError, IndexError):
            pass

        try:
            entry = label_arr[position]
            if isinstance(entry, list):
                for single_label in entry:
                    block.add_label(single_label)
            elif entry is not None:
                block.add_label(entry)
        except (TypeError, IndexError):
            pass

        built.append(block)

    return TextDocument(built)
예제 #9
0
 def add_labels_to(self, text_block: TextBlock):
     """Pass this object's ``labels`` to ``text_block.add_labels``."""
     own_labels = self.labels
     text_block.add_labels(own_labels)