def process(self, doc: TextDocument) -> bool:
    """
    Fuse blocks into a preceding content block, repeating until a full pass merges nothing.

    A block is merged into its predecessor when the predecessor is content, the block's link density
    is below 0.56 and the block does not carry the ``STRICTLY_NOT_CONTENT`` label.

    :param doc: The document to process; its ``text_blocks`` are reassigned only if something merged.
    :return: True if at least one block was merged, False otherwise.
    """
    remaining = doc.text_blocks
    if len(remaining) < 2:
        return False

    merged_any = False
    while True:
        # The anchor is re-seeded from the first surviving block at the start of every pass.
        anchor = remaining[0]
        absorbed = []
        for candidate in remaining[1:]:
            # The test order matters: the label lookup only runs once the cheaper checks passed.
            fusable = (
                anchor.is_content
                and candidate.link_density < 0.56
                and not candidate.has_label(DefaultLabels.STRICTLY_NOT_CONTENT)
            )
            if fusable:
                anchor.merge_next(candidate)
                absorbed.append(candidate)
            else:
                anchor = candidate

        # NOTE(review): subtract_blocks is assumed to return the blocks without the absorbed ones.
        remaining = self.subtract_blocks(remaining, absorbed)
        if not absorbed:
            break
        merged_any = True

    if merged_any:
        doc.text_blocks = remaining
    return merged_any
def process(self, doc: TextDocument) -> bool:
    """
    Drop every block that is not flagged as content.

    ``doc.text_blocks`` is always replaced by the filtered list, even when nothing was removed.

    :param doc: The document whose blocks are filtered.
    :return: True if at least one block was removed, False otherwise.
    """
    original = doc.text_blocks
    kept = []
    for candidate in original:
        if candidate.is_content:
            kept.append(candidate)

    # The filtered list can only be shorter or equal, so a length mismatch means something was dropped.
    removed_any = len(original) != len(kept)
    doc.text_blocks = kept
    return removed_any
def to_text_document(self) -> TextDocument:
    """
    Build a TextDocument from the blocks and title held by this object.

    NOTE: Only call this after parsing.

    :return: A TextDocument constructed from ``self.text_blocks`` and ``self.title``.
    """
    # Flush before reading the block list; presumably this finalises a block still being built.
    self.flush_block()
    blocks, title = self.text_blocks, self.title
    return TextDocument(blocks, title)
def process(self, doc: TextDocument) -> bool:
    """
    Merge content blocks into the preceding block when the two lie close enough together.

    A content block is merged into the current anchor block when the number of blocks between them
    (``offset_blocks_start`` of the block minus ``offset_blocks_end`` of the anchor, minus one) does not
    exceed ``self.max_blocks_distance``. With ``self.content_only`` set, the anchor must be content as
    well and the scan starts at the first content block. With ``self.same_tag_level_only`` set, both
    blocks must share the same ``tag_level``.

    :param doc: The document to process; its ``text_blocks`` are reassigned only if something merged.
    :return: True if at least one block was merged, False otherwise.
    """
    blocks = doc.text_blocks
    if len(blocks) < 2:
        return False

    first = 0
    if self.content_only:
        first = next((pos for pos, tb in enumerate(blocks) if tb.is_content), None)
        if first is None:
            # Nothing is flagged as content, so there is nothing to merge.
            return False

    anchor = blocks[first]
    merged_away = []
    for current in blocks[first + 1:]:
        mergeable = False
        if current.is_content:
            gap = current.offset_blocks_start - anchor.offset_blocks_end - 1
            mergeable = (
                gap <= self.max_blocks_distance
                and not (self.content_only and not anchor.is_content)
                and not (self.same_tag_level_only and anchor.tag_level != current.tag_level)
            )

        if mergeable:
            anchor.merge_next(current)
            merged_away.append(current)
        else:
            # Any block that is not merged becomes the anchor for the next comparison.
            anchor = current

    if not merged_away:
        return False
    doc.text_blocks = self.subtract_blocks(blocks, merged_away)
    return True
def process(self, doc: TextDocument) -> bool:
    """
    Merge each block into its predecessor when ``self.equal_labels`` accepts both label sets.

    :param doc: The document to process; its ``text_blocks`` are reassigned only if something merged.
    :return: True if at least one block was merged, False otherwise.
    """
    blocks = doc.text_blocks
    if len(blocks) < 2:
        return False

    leader = blocks[0]
    absorbed = []
    for follower in blocks[1:]:
        if not self.equal_labels(leader.labels, follower.labels):
            # Labels differ: this block starts a new run.
            leader = follower
            continue
        leader.merge_next(follower)
        absorbed.append(follower)

    fused = bool(absorbed)
    if fused:
        doc.text_blocks = self.subtract_blocks(blocks, absorbed)
    return fused
def parse_doc(self, input_str: str) -> Union[TextDocument, None]:
    """
    Parse an HTML string into a TextDocument.

    If the first attempt fails, a fresh parser is fed the input again with every match of
    ``self.SCRIPT_REGEX`` replaced by an empty ``<script></script>`` element.

    :param input_str: The HTML markup to parse.
    :return: The parsed document, or an empty TextDocument when both attempts fail and
        ``self.raise_on_failure`` is falsy.
    :raises HTMLExtractionError: When both attempts fail and ``self.raise_on_failure`` is truthy.
    """
    bp_parser = parser.BoilerpipeHTMLParser(raise_on_failure=self.raise_on_failure)
    try:
        bp_parser.feed(input_str)
    except Exception:
        # Catch Exception rather than using a bare except, so that KeyboardInterrupt and SystemExit
        # propagate instead of triggering a second parse.
        # In case of error, try again, first removing script tag content.
        bp_parser = parser.BoilerpipeHTMLParser(raise_on_failure=self.raise_on_failure)
        input_str = self.SCRIPT_REGEX.sub('<script></script>', input_str)
        try:
            bp_parser.feed(input_str)
        except Exception as ex:
            logger.exception('Error parsing HTML')
            if self.raise_on_failure:
                raise HTMLExtractionError from ex
            return TextDocument([])

    return bp_parser.to_text_document()
def process(self, doc: TextDocument) -> bool:
    """
    Split every block whose text contains a ``self.NEWLINE_REGEX`` match into one block per piece.

    Each new block is created from the piece's text only and inherits ``is_content`` and the labels of
    the block it came from. Blocks that do not split are kept as they are.

    :param doc: The document to process; its ``text_blocks`` are reassigned only if a block was split.
    :return: True if at least one block was split, False otherwise.
    """
    rebuilt = []
    was_split = False
    for source_block in doc.text_blocks:
        pieces = self.NEWLINE_REGEX.split(source_block.text)
        if len(pieces) >= 2:
            was_split = True
            inherited_content = source_block.is_content
            inherited_labels = source_block.labels
            for piece in pieces:
                fragment = TextBlock(piece)
                fragment.is_content = inherited_content
                fragment.add_labels(inherited_labels)
                rebuilt.append(fragment)
        else:
            rebuilt.append(source_block)

    if was_split:
        doc.text_blocks = rebuilt
    return was_split
def make_doc(words_arr, num_anchor_words_arr=None, is_content_arr=None, label_arr=None):
    """
    Build a TextDocument with one TextBlock per entry of ``words_arr``.

    The optional arrays are matched to ``words_arr`` by index; a missing array (``None``) or an array
    that is too short simply leaves the corresponding default in place.

    :param words_arr: Per block, either an int (the block text is that many entries of
        ``default_words`` joined by spaces) or the block text itself.
    :param num_anchor_words_arr: Per block, the number of anchor words; defaults to 0.
    :param is_content_arr: Per block, the value assigned to ``is_content``.
    :param label_arr: Per block, ``None`` for no label, a single label, or a list of labels.
    :return: The TextDocument wrapping the created blocks.
    """
    created = []
    for position, spec in enumerate(words_arr):
        if isinstance(spec, int):
            word_count = spec
            content = ' '.join(default_words[:word_count])
        else:
            content = spec
            # NOTE(review): this counts spaces, which is one less than the number of single-spaced
            # words; confirm whether callers rely on that.
            word_count = content.count(' ')

        anchor_count = 0
        try:
            anchor_count = num_anchor_words_arr[position]
        except (TypeError, IndexError):
            pass

        block = TextBlock(content, set(), word_count, anchor_count, 0, 0, position)

        try:
            block.is_content = is_content_arr[position]
        except (TypeError, IndexError):
            pass

        # The label calls stay inside the try: TypeError/IndexError raised while labelling are ignored
        # in the same way as a missing or too-short label array.
        try:
            raw_label = label_arr[position]
            if raw_label is not None:
                pending = raw_label if isinstance(raw_label, list) else [raw_label]
                for item in pending:
                    block.add_label(item)
        except (TypeError, IndexError):
            pass

        created.append(block)

    return TextDocument(created)