def classify(self, prev_block: TextBlock, curr_block: TextBlock, next_block: TextBlock) -> bool:
    """Label ``curr_block`` as content or boilerplate from density features.

    The decision uses the link density and text density of the current
    block and of its two neighbours, compared against hard-coded
    thresholds.

    :param prev_block: block preceding ``curr_block``; its ``link_density``
        and ``text_density`` are read.
    :param curr_block: block being classified; its ``is_content`` attribute
        is overwritten with the new label.
    :param next_block: block following ``curr_block``; its ``text_density``
        is read.
    :return: ``True`` if the label of ``curr_block`` was changed by this
        call, ``False`` if it already had the computed value.
    """
    if curr_block.link_density <= 0.333333:
        if prev_block.link_density <= 0.555556:
            if curr_block.text_density <= 9:
                # A sparse block is rejected only when both neighbours
                # are sparse as well.
                is_content = not (
                    next_block.text_density <= 10 and prev_block.text_density <= 4
                )
            else:
                # A dense block is kept unless the following block has
                # a text density of exactly zero.
                is_content = next_block.text_density != 0
        else:
            # Link-heavy predecessor: keep only if the successor is dense.
            is_content = not (next_block.text_density <= 11)
    else:
        # Blocks dominated by links are never content.
        is_content = False

    # Compare by value so the flag is True exactly when the label flips.
    changed = curr_block.is_content != is_content
    curr_block.is_content = is_content
    return changed
def classify(self, prev_block: TextBlock, curr_block: TextBlock, next_block: TextBlock) -> bool:
    """Label ``curr_block`` as content or boilerplate from word counts.

    The decision uses the link density of the current and previous block
    together with the word counts of all three blocks, compared against
    hard-coded thresholds.

    :param prev_block: block preceding ``curr_block``; its ``link_density``
        and ``num_words`` are read.
    :param curr_block: block being classified; its ``is_content`` attribute
        is overwritten with the new label.
    :param next_block: block following ``curr_block``; its ``num_words``
        is read.
    :return: ``True`` if the label of ``curr_block`` was changed by this
        call, ``False`` if it already had the computed value.
    """
    if curr_block.link_density <= 0.333333:
        if prev_block.link_density <= 0.555556:
            # Rejected only when the block and both neighbours are short.
            is_content = not (
                curr_block.num_words <= 16
                and next_block.num_words <= 15
                and prev_block.num_words <= 4
            )
        else:
            # Link-heavy predecessor: rejected when neither the block
            # nor its successor is long.
            is_content = not (
                curr_block.num_words <= 40 and next_block.num_words <= 17
            )
    else:
        # Blocks dominated by links are never content.
        is_content = False

    # Compare by value so the flag is True exactly when the label flips.
    changed = curr_block.is_content != is_content
    curr_block.is_content = is_content
    return changed
def classify(self, prev_block: TextBlock, curr_block: TextBlock, next_block: TextBlock) -> bool:
    """Label ``curr_block`` as content when any of three rules matches.

    The rules combine the word counts and link densities of the current
    block and its two neighbours against hard-coded thresholds.

    :param prev_block: block preceding ``curr_block``; its ``link_density``
        and ``num_words`` are read.
    :param curr_block: block being classified; its ``is_content`` attribute
        is overwritten with the new label.
    :param next_block: block following ``curr_block``; its ``link_density``
        and ``num_words`` are read.
    :return: ``True`` if the label of ``curr_block`` was changed by this
        call, ``False`` if it already had the computed value.
    """
    # Rule 1: the block contains links and is followed by a longer block.
    linked_before_long = curr_block.link_density > 0 and next_block.num_words > 11
    # Rule 2: the block itself is long.
    long_block = curr_block.num_words > 19
    # Rule 3: both neighbours are link-free, the successor is not tiny,
    # and at least one of the three blocks has some length.
    link_free_context = (
        next_block.num_words > 6
        and next_block.link_density == 0
        and prev_block.link_density == 0
        and (
            curr_block.num_words > 6
            or prev_block.num_words > 7
            or next_block.num_words > 19
        )
    )
    is_content = linked_before_long or long_block or link_free_context

    # Compare by value so the flag is True exactly when the label flips.
    changed = curr_block.is_content != is_content
    curr_block.is_content = is_content
    return changed
def process(self, doc: TextDocument) -> bool:
    """Break up text blocks whose text is divided by ``self.NEWLINE_REGEX``.

    Each block's text is split with ``self.NEWLINE_REGEX``. A block that
    yields fewer than two pieces is kept as is; otherwise it is replaced
    by one new ``TextBlock`` per piece, each inheriting the ``is_content``
    flag and the labels of the block it came from.

    :param doc: document whose ``text_blocks`` are inspected; the attribute
        is reassigned only when at least one block was split.
    :return: ``True`` if at least one block was split, ``False`` otherwise.
    """
    rebuilt = []
    any_split = False

    for block in doc.text_blocks:
        pieces = self.NEWLINE_REGEX.split(block.text)
        if len(pieces) >= 2:
            # Read the inherited state once, then stamp it on every piece.
            inherited_flag = block.is_content
            inherited_labels = block.labels
            for piece in pieces:
                fragment = TextBlock(piece)
                fragment.is_content = inherited_flag
                fragment.add_labels(inherited_labels)
                rebuilt.append(fragment)
            any_split = True
        else:
            rebuilt.append(block)

    if any_split:
        doc.text_blocks = rebuilt
    return any_split
def make_doc(words_arr, num_anchor_words_arr=None, is_content_arr=None, label_arr=None):
    """Build a ``TextDocument`` with one ``TextBlock`` per entry of ``words_arr``.

    :param words_arr: one entry per block. An ``int`` requests that many
        words taken from the module-level ``default_words``, joined by
        single spaces. Any other value is used as the block text directly,
        and its word count is taken to be the number of space characters
        in it.
        NOTE(review): for text with single-space separators that is one
        less than the number of words -- confirm this is intended.
    :param num_anchor_words_arr: optional per-block anchor word counts;
        a missing entry (or ``None``) means ``0``.
    :param is_content_arr: optional per-block ``is_content`` values;
        a missing entry leaves the block's flag untouched.
    :param label_arr: optional per-block labels. An entry may be ``None``
        (no label), a list (each item is added) or a single label.
    :return: a ``TextDocument`` wrapping the created blocks in input order.
    """
    created = []
    for position, spec in enumerate(words_arr):
        if isinstance(spec, int):
            word_count = spec
            content = ' '.join(default_words[:word_count])
        else:
            content = spec
            word_count = content.count(' ')

        # The optional arrays may be None (TypeError) or too short
        # (IndexError); both simply mean "no value for this block".
        try:
            anchor_count = num_anchor_words_arr[position]
        except (TypeError, IndexError):
            anchor_count = 0

        block = TextBlock(content, set(), word_count, anchor_count, 0, 0, position)

        try:
            block.is_content = is_content_arr[position]
        except (TypeError, IndexError):
            pass

        try:
            tags = label_arr[position]
            if isinstance(tags, list):
                for tag in tags:
                    block.add_label(tag)
            elif tags is not None:
                block.add_label(tags)
        except (TypeError, IndexError):
            pass

        created.append(block)

    return TextDocument(created)