Beispiel #1
0
def test_merge():
    """Check the state of a TextBlock after ``merge_next`` absorbs a second block.

    Two blocks carrying different labels are merged; the assertions pin the
    resulting text, word counts, densities, label set and block offsets of
    the block that received the merge.
    """
    merged = TextBlock("AA BB CC ", {0}, 3, 3, 3, 1, 0)
    absorbed = TextBlock("DD EE FF GG HH II JJ .", {1}, 6, 0, 6, 2, 1)

    # Give each block its own label so the merged label set can be checked.
    merged.add_labels(DefaultLabels.MIGHT_BE_CONTENT)
    absorbed.add_labels(DefaultLabels.ARTICLE_METADATA)

    merged.merge_next(absorbed)

    # The two texts end up joined by a single newline.
    assert merged.text == "AA BB CC \nDD EE FF GG HH II JJ ."
    assert merged.num_words == 9
    assert merged.num_words_in_anchor_text == 3

    # Float comparison: equal to one third when rounded to 7 decimal places.
    expected_link_density = 1.0 / 3.0
    assert round(abs(merged.link_density - expected_link_density), 7) == 0
    assert merged.text_density == 3

    # Labels of both blocks are present after the merge.
    assert merged.labels == {
        DefaultLabels.MIGHT_BE_CONTENT,
        DefaultLabels.ARTICLE_METADATA,
    }
    assert merged.offset_blocks_start == 0
    assert merged.offset_blocks_end == 1
Beispiel #2
0
    def process(self, doc: TextDocument) -> bool:
        """Replace each multi-paragraph text block by one block per paragraph.

        Every block's text is split with ``self.NEWLINE_REGEX``. A block whose
        text yields fewer than two pieces is kept as-is. Otherwise one new
        ``TextBlock`` is created per piece; only the ``is_content`` flag and
        the labels of the source block are copied onto the new blocks.

        :param doc: document whose ``text_blocks`` are examined and, if any
            block was split, replaced by the new list.
        :return: ``True`` if at least one block was split, ``False`` otherwise
            (in which case ``doc.text_blocks`` is left untouched).
        """
        was_split = False
        rebuilt = []
        for block in doc.text_blocks:
            pieces = self.NEWLINE_REGEX.split(block.text)
            if len(pieces) >= 2:
                # At least two pieces means the loop below emits new blocks.
                was_split = True
                content_flag = block.is_content
                source_labels = block.labels
                for piece in pieces:
                    fragment = TextBlock(piece)
                    fragment.is_content = content_flag
                    fragment.add_labels(source_labels)
                    rebuilt.append(fragment)
            else:
                # Nothing to split: carry the original block over unchanged.
                rebuilt.append(block)

        # Only swap in the rebuilt list when something actually changed.
        if was_split:
            doc.text_blocks = rebuilt
        return was_split
Beispiel #3
0
 def add_labels_to(self, text_block: TextBlock):
     """Pass this object's ``labels`` to ``text_block.add_labels``.

     :param text_block: block that receives the labels.
     """
     attach = text_block.add_labels
     attach(self.labels)