def test_merge():
    """Check the state of a TextBlock after ``merge_next`` absorbs a second block.

    Both blocks receive a distinct label before merging; afterwards the first
    block is expected to hold the newline-joined text, the expected word and
    anchor-word counts, link/text densities, both labels, and the block
    offsets 0..1.
    """
    first = TextBlock("AA BB CC ", {0}, 3, 3, 3, 1, 0)
    second = TextBlock("DD EE FF GG HH II JJ .", {1}, 6, 0, 6, 2, 1)

    first.add_labels(DefaultLabels.MIGHT_BE_CONTENT)
    second.add_labels(DefaultLabels.ARTICLE_METADATA)

    first.merge_next(second)

    expected_labels = {
        DefaultLabels.MIGHT_BE_CONTENT,
        DefaultLabels.ARTICLE_METADATA,
    }

    assert first.text == "AA BB CC \nDD EE FF GG HH II JJ ."
    assert first.num_words == 9
    assert first.num_words_in_anchor_text == 3
    # Float comparison: equal to 1/3 up to 7 decimal places.
    assert round(abs(first.link_density - 1.0 / 3.0), 7) == 0
    assert first.text_density == 3
    assert first.labels == expected_labels
    assert first.offset_blocks_start == 0
    assert first.offset_blocks_end == 1
def process(self, doc: TextDocument) -> bool:
    """Replace multi-paragraph text blocks with one block per paragraph.

    Each block's text is split with ``self.NEWLINE_REGEX``. A block that
    yields fewer than two pieces is kept as-is; otherwise it is replaced by
    fresh ``TextBlock`` objects (one per piece) that copy the original's
    ``is_content`` flag and labels.

    :param doc: document whose ``text_blocks`` are examined and, if any
        block was split, reassigned to the rebuilt list.
    :return: ``True`` if at least one block was split, ``False`` otherwise
        (in which case ``doc.text_blocks`` is left untouched).
    """
    modified = False
    rebuilt = []

    for block in doc.text_blocks:
        pieces = self.NEWLINE_REGEX.split(block.text)

        if len(pieces) >= 2:
            content_flag = block.is_content
            inherited_labels = block.labels
            for piece in pieces:
                piece_block = TextBlock(piece)
                piece_block.is_content = content_flag
                piece_block.add_labels(inherited_labels)
                rebuilt.append(piece_block)
            modified = True
        else:
            # Nothing to split: carry the original block over unchanged.
            rebuilt.append(block)

    # Only swap in the new list when something actually changed.
    if modified:
        doc.text_blocks = rebuilt
    return modified
def add_labels_to(self, text_block: TextBlock):
    """Pass this object's ``labels`` to ``text_block.add_labels``.

    :param text_block: the block that receives the labels.
    """
    attach = text_block.add_labels
    attach(self.labels)