def test_spacy_split_sentences(caplog):
    caplog.set_level(logging.INFO)

    lingual_parser = Spacy("en")
    tokenize_and_split_sentences = lingual_parser.split_sentences
    text = "This is a text. This is another text."

    # Consuming the generator before the language model is loaded raises.
    iterator = tokenize_and_split_sentences(text)
    with pytest.raises(AttributeError):
        next(iterator)

    lingual_parser.load_lang_model()
    iterator = tokenize_and_split_sentences(text)
    assert len(list(iterator)) == 2
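# --- Illustrative sketch (not from the source) ---
# The test above shows that split_sentences is lazy: the AttributeError only
# surfaces once the generator is consumed, and it disappears after
# load_lang_model(). A guarded helper capturing that ordering might look like
# this; `safe_split` is a hypothetical name, and the single-argument call
# form follows the test.
def safe_split(text, language="en"):
    lingual_parser = Spacy(language)
    lingual_parser.load_lang_model()  # must run before consuming the generator
    return list(lingual_parser.split_sentences(text))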
def __init__(
    self,
    structural=True,  # structural information
    blacklist=["style"],  # ignore tag types, default: style
    flatten=["span", "br"],  # flatten tag types, default: span, br
    flatten_delim="",
    lingual=True,  # lingual information
    strip=True,
    replacements=[(u"[\u2010\u2011\u2012\u2013\u2014\u2212\uf02d]", "-")],
    tabular=True,  # tabular information
    visual=False,  # visual information
    pdf_path=None,
):
    # Use spaCy as our lingual parser
    self.lingual_parser = Spacy()

    super(Parser, self).__init__(
        ParserUDF,
        structural=structural,
        blacklist=blacklist,
        flatten=flatten,
        flatten_delim=flatten_delim,
        lingual=lingual,
        strip=strip,
        replacements=replacements,
        tabular=tabular,
        visual=visual,
        pdf_path=pdf_path,
        lingual_parser=self.lingual_parser,
    )
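# --- Illustrative sketch (not from the source) ---
# The keyword defaults above can be overridden at construction time, assuming
# the enclosing Parser class from this module; for example, blacklisting
# <script> in addition to <style> and narrowing the hyphen normalization.
# The argument values here are assumptions chosen for illustration.
parser = Parser(
    blacklist=["style", "script"],
    replacements=[(u"[\u2010\u2011]", "-")],
    visual=False,
)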
def __init__(
    self,
    structural=True,  # structural information
    blacklist=["style"],  # ignore tag types, default: style
    flatten=["span", "br"],  # flatten tag types, default: span, br
    flatten_delim="",
    lingual=True,  # lingual information
    strip=True,
    replacements=[(u"[\u2010\u2011\u2012\u2013\u2014\u2212\uf02d]", "-")],
    tabular=True,  # tabular information
    visual=False,  # visual information
    pdf_path=None,
):
    self.delim = "<NB>"  # NB = New Block

    # Use spaCy as our lingual parser
    self.lingual_parser = Spacy()

    super(OmniParser, self).__init__(
        OmniParserUDF,
        structural=structural,
        blacklist=blacklist,
        flatten=flatten,
        flatten_delim=flatten_delim,
        lingual=lingual,
        strip=strip,
        replacements=replacements,
        tabular=tabular,
        visual=visual,
        pdf_path=pdf_path,
        lingual_parser=self.lingual_parser,
    )
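# --- Illustrative sketch (not from the source) ---
# Both constructors pass a `flatten` list whose tags are later removed with
# lxml's strip_tags, which drops the tags but keeps their text. A minimal
# standalone demonstration:
import lxml.etree

root = lxml.etree.fromstring("<p>flat<span>ten</span>ed</p>")
lxml.etree.strip_tags(root, "span")
assert lxml.etree.tostring(root) == b"<p>flattened</p>"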
def test_split_sentences_by_char_limit(caplog):
    caplog.set_level(logging.INFO)

    lingual_parser = Spacy("en")
    lingual_parser.load_lang_model()

    text = "This is a text. This is another text."
    all_sentences = [
        Sentence(**parts) for parts in lingual_parser.split_sentences(text)
    ]
    assert len(all_sentences) == 2
    assert [len(sentence.text) for sentence in all_sentences] == [15, 21]

    lingual_parser.model.remove_pipe("sentencizer")
    lingual_parser.model.add_pipe(
        set_custom_boundary, before="parser", name="sentence_boundary_detector"
    )

    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 20
    )
    assert len(sentence_batches) == 2
    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 100
    )
    assert len(sentence_batches) == 1

    sentence_batch = sentence_batches[0]
    custom_tokenizer = TokenPreservingTokenizer(lingual_parser.model.vocab)
    doc = custom_tokenizer(sentence_batch)
    doc.user_data = sentence_batch
    # iterate over pipeline components in order
    for name, proc in lingual_parser.model.pipeline:
        doc = proc(doc)
    assert doc.is_parsed

    # See if the number of parsed spaCy sentences matches that of input sentences
    assert len(list(doc.sents)) == len(sentence_batch)
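# --- Illustrative sketch (not from the source) ---
# A re-implementation of the greedy batching behavior that
# _split_sentences_by_char_limit is tested for above: sentences are grouped
# so each batch's total text length stays within the limit. With sentence
# lengths [15, 21], a limit of 20 yields two batches and a limit of 100
# yields one, matching the assertions.
def batch_by_char_limit(sentences, char_limit):
    batches, current, size = [], [], 0
    for sentence in sentences:
        if current and size + len(sentence.text) > char_limit:
            batches.append(current)
            current, size = [], 0
        current.append(sentence)
        size += len(sentence.text)
    if current:
        batches.append(current)
    return batches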
def test_spacy_support(caplog):
    caplog.set_level(logging.INFO)

    # Supported language
    lingual_parser = Spacy("en")
    assert lingual_parser.has_tokenizer_support()
    assert lingual_parser.has_NLP_support()

    # Alpha-supported language
    lingual_parser = Spacy("ja")
    assert lingual_parser.has_tokenizer_support()
    assert not lingual_parser.has_NLP_support()

    # Non-supported language
    lingual_parser = Spacy("non-supported-lang")
    assert not lingual_parser.has_tokenizer_support()
    assert not lingual_parser.has_NLP_support()

    # Language not specified
    with pytest.raises(TypeError):
        lingual_parser = Spacy()
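# --- Illustrative sketch (not from the source) ---
# A summary of the three support tiers exercised above. The helper name is
# hypothetical; the predicates are the ones used in the tests.
def support_tier(language):
    lingual_parser = Spacy(language)
    if lingual_parser.has_NLP_support():
        return "full NLP"        # e.g. "en": sentence splitting + NLP enrichment
    if lingual_parser.has_tokenizer_support():
        return "tokenizer only"  # e.g. "ja" (alpha support): splitting only
    return "unsupported"         # e.g. "non-supported-lang"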
class Parser_xmlUDF(UDF):
    def __init__(
        self,
        structural,
        blacklist,
        flatten,
        lingual,
        strip,
        replacements,
        tabular,
        visual,
        pdf_path,
        language,
        **kwargs,
    ):
        """
        :param visual: boolean, if True visual features are used in the model
        :param pdf_path: directory where PDFs are saved; if a PDF file is not
            found, it will be created from the HTML document and saved in
            that directory
        :param replacements: a list of (_pattern_, _replace_) tuples where
            _pattern_ is a regex and _replace_ is a character string. All
            occurrences of _pattern_ in the text will be replaced by
            _replace_.
        """
        super(Parser_xmlUDF, self).__init__(**kwargs)

        # structural (html) setup
        self.structural = structural
        self.blacklist = blacklist if isinstance(blacklist, list) else [blacklist]
        self.flatten = flatten if isinstance(flatten, list) else [flatten]

        # lingual setup
        self.language = language
        self.strip = strip
        self.replacements = []
        for (pattern, replace) in replacements:
            self.replacements.append((re.compile(pattern, flags=re.UNICODE), replace))

        self.lingual = lingual
        self.lingual_parser = Spacy(self.language)
        if self.lingual_parser.has_tokenizer_support():
            self.tokenize_and_split_sentences = self.lingual_parser.split_sentences
            self.lingual_parser.load_lang_model()
        else:
            self.tokenize_and_split_sentences = SimpleTokenizer().parse

        if self.lingual:
            if self.lingual_parser.has_NLP_support():
                self.enrich_tokenized_sentences_with_nlp = (
                    self.lingual_parser.enrich_sentences_with_NLP
                )
            else:
                logger.warning(
                    f"Lingual mode will be turned off, "
                    f"as spaCy doesn't provide support for this "
                    f"language ({self.language})"
                )
                self.lingual = False

        # tabular setup
        self.tabular = tabular

        # visual setup
        self.visual = visual
        if self.visual:
            self.pdf_path = pdf_path

    def apply(self, document, pdf_path=None, **kwargs):
        # The document is the Document model
        text = document.text
        # Only return the document if no exceptions occur during parsing
        try:
            [y for y in self.parse(document, text)]
            return document
        except Exception as e:
            warnings.warn(
                f"Document {document.name} not added to database, "
                f"because of parse error: \n{e}"
            )

    def _valid_pdf(self, path, filename):
        """Verify that the file exists and has a PDF extension."""
        # If path itself is a file, check that it is a PDF.
        if os.path.isfile(path) and path.lower().endswith(".pdf"):
            return True
        else:
            full_path = os.path.join(path, filename)
            if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
                return True
            elif os.path.isfile(os.path.join(path, filename + ".pdf")):
                return True
            elif os.path.isfile(os.path.join(path, filename + ".PDF")):
                return True
        return False

    def _parse_table(self, node, state):
        """Parse a table node.

        :param node: The lxml table node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
""" if not self.tabular: logger.error("Called _parse_table without tabular activated.") return state if node.tag == "table": table_idx = state["table"]["idx"] stable_id = f"{state['document'].name}::{'table'}:{state['table']['idx']}" # Set name for Table name = node.attrib["name"] if "name" in node.attrib else None # Create the Table in the DB parts = {} parts["document"] = state["document"] parts["stable_id"] = stable_id parts["name"] = name parts["position"] = table_idx parent = state["parent"][node] if isinstance(parent, Cell): parts["section"] = parent.table.section elif isinstance(parent, Section): parts["section"] = parent else: raise NotImplementedError( "Table is not within a Section or Cell") state["context"][node] = Table(**parts) # Local state for each table. This is required to support nested # tables state["table"][table_idx] = { "grid": defaultdict(int), "cell_pos": 0, "row_idx": -1, "col_idx": 0, } # Increment table counter state["table"]["idx"] += 1 elif node.tag == "tr": if not isinstance(state["parent"][node], Table): raise NotImplementedError("Table row parent must be a Table.") state["table"][state["parent"][node].position]["col_idx"] = 0 state["table"][state["parent"][node].position]["row_idx"] += 1 elif node.tag in ["td", "th"]: if not isinstance(state["parent"][node], Table): raise NotImplementedError("Cell parent must be a Table.") if not state["table"][state["parent"] [node].position]["row_idx"] >= 0: raise NotImplementedError( "Table cell encountered before a table row.") # calculate row_start/col_start while state["table"][state["parent"][node].position]["grid"][( state["table"][state["parent"][node].position]["row_idx"], state["table"][state["parent"][node].position]["col_idx"], )]: # while a cell on the grid is occupied, keep moving state["table"][state["parent"][node].position]["col_idx"] += 1 col_start = state["table"][state["parent"] [node].position]["col_idx"] row_start = state["table"][state["parent"] [node].position]["row_idx"] # calculate row_end/col_end row_end = row_start if "rowspan" in node.attrib: try: row_end += int(node.get("rowspan")) - 1 except ValueError: logger.error( f"Rowspan has invalid value: '{node.get('rowspan')}'") col_end = col_start if "colspan" in node.attrib: try: col_end += int(node.get("colspan")) - 1 except ValueError: logger.error( f"Colspan has invalid value: '{node.get('colspan')}'") # update grid with occupied cells for r, c in itertools.product(list(range(row_start, row_end + 1)), list(range(col_start, col_end + 1))): state["table"][state["parent"][node].position]["grid"][(r, c)] = 1 # Set name for Cell name = node.attrib["name"] if "name" in node.attrib else None # construct cell parts = defaultdict(list) parts["document"] = state["document"] parts["name"] = name parts["table"] = state["parent"][node] parts["row_start"] = row_start parts["row_end"] = row_end parts["col_start"] = col_start parts["col_end"] = col_end parts["position"] = state["table"][state["parent"] [node].position]["cell_pos"] stable_id = (f"{parts['document'].name}" f"::" f"{'cell'}" f":" f"{parts['table'].position}" f":" f"{row_start}" f":" f"{col_start}") parts["stable_id"] = stable_id # Create the Cell in the DB state["context"][node] = Cell(**parts) # Update position state["table"][state["parent"][node].position]["col_idx"] += 1 state["table"][state["parent"][node].position]["cell_pos"] += 1 return state def _parse_figure(self, node, state): """Parse the figure node. 
        :param node: The lxml img node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["img", "figure"]:
            return state

        # Process the Figure
        stable_id = f"{state['document'].name}::figure:{state['figure']['idx']}"

        # Set name for Figure
        name = node.attrib["name"] if "name" in node.attrib else None

        # An img within a Figure gets processed in the parent Figure
        if node.tag == "img" and isinstance(state["parent"][node], Figure):
            return state

        # NOTE: We currently do NOT support nested figures.
        parts = {}
        parent = state["parent"][node]
        if isinstance(parent, Section):
            parts["section"] = parent
        elif isinstance(parent, Cell):
            parts["section"] = parent.table.section
            parts["cell"] = parent
        else:
            logger.warning(f"Figure is nested within {state['parent'][node]}")
            return state

        parts["document"] = state["document"]
        parts["stable_id"] = stable_id
        parts["name"] = name
        parts["position"] = state["figure"]["idx"]

        # If processing a raw img
        if node.tag == "img":
            # Create the Figure entry in the DB
            parts["url"] = node.get("src")
            state["context"][node] = Figure(**parts)
        elif node.tag == "figure":
            # Pull the image from a child img node, if one exists
            imgs = [child for child in node if child.tag == "img"]

            if len(imgs) > 1:
                logger.warning("Figure contains multiple images.")
                # Right now we don't support multiple URLs in the Figure
                # context. As a workaround, just ignore the outer Figure and
                # allow processing of the individual images. We ignore the
                # accompanying figcaption by marking it as visited.
                captions = [child for child in node if child.tag == "figcaption"]
                state["visited"].update(captions)
                return state

            img = imgs[0]
            state["visited"].add(img)

            # Create the Figure entry in the DB
            parts["url"] = img.get("src")
            state["context"][node] = Figure(**parts)

        state["figure"]["idx"] += 1
        return state

    def _parse_sentence(self, paragraph, node, state):
        """Parse the Sentences of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
""" # Set name for Sentence name = node.attrib["name"] if "name" in node.attrib else None # Lingual Parse document = state["document"] sens_parts = [] sens_words_id = [] for sentence in node: parts = defaultdict(list) parts["document"] = document flag = 0 text = "" words = [] char_abs_offsets = [] start = 0 for i, word in enumerate(sentence): w = "" for char in word: if "bbox" in char.attrib.keys(): flag = 1 w += char.text words.append(w) char_abs_offsets.append(start) start += (1 + len(word)) text += re.sub("\s+", " ", w) text += " " if not flag: continue if text.isspace(): continue if not any(p and p[0].isalnum() for p in words): continue if not text: continue for i, word in enumerate(sentence): parts["words"].append(words[i].replace(" ", "_")) parts["lemmas"].append(words[i].replace(" ", "_")) parts["ner_tags"].append( "") # placeholder for later NLP parsing parts["char_offsets"].append(char_abs_offsets[i]) # parts["abs_char_offsets"].append(char_abs_offsets[i]) parts["dep_parents"].append( 0) # placeholder for later NLP parsing parts["dep_labels"].append( "") # placeholder for later NLP parsing parts["text"], parts["pos_tags"] = self.lingual_parser.tagger( text[:-1]) abs_offset = state["sentence"]["abs_offset"] parts["abs_char_offsets"] = [ char_offset + abs_offset for char_offset in parts["char_offsets"] ] parts["position"] = state["sentence"]["idx"] if self.tabular: parts["position"] = state["sentence"]["idx"] # If tabular, consider own Context first in case a Cell # was just created. Otherwise, defer to the parent. parent = paragraph if isinstance(parent, Paragraph): parts["section"] = parent.section parts["paragraph"] = parent else: raise NotImplementedError( "Sentence parent must be Paragraph.") if self.structural: context_node = sentence tree = lxml.etree.ElementTree(state["root"]) parts["xpath"] = tree.getpath(context_node) parts["html_tag"] = context_node.tag parts["html_attrs"] = [] temp_attrs = [] for word in sentence: if len(word) == 0: continue t = "" for k, v in word[0].attrib.items(): if k != "bbox": v = v.replace(" ", "") t = t + k + "=" + v + " " t = t[:-1] temp_attrs.append(t) for temp_attr in temp_attrs: parts["html_attrs"].append(temp_attr) if self.visual: page = [] top = [] left = [] right = [] bottom = [] p = int(node.getparent().get("id")) bbox = node.getparent().get("bbox") bbox = bbox.split(",") height = int(round(float(bbox[3]))) # hack for handle error coordinate in sentence flag = False try: for word in sentence: if len(word) == 0: continue coord_f = word[0].attrib[ "bbox"] # coordinate first character of word coord_l = word[-1].attrib["bbox"] coord_f = coord_f.split(",") coord_l = coord_l.split(",") page.append(p) left.append(int(round(float(coord_f[0])))) bottom.append(height - int(round(float(coord_f[1])))) right.append(int(round(float(coord_l[2])))) if height > int(round(float(coord_f[3]))): top.append(height - int(round(float(coord_f[3])))) else: top.append(0) parts["page"] = page parts["left"] = left parts["top"] = top parts["right"] = right parts["bottom"] = bottom except Exception as e: print(e) print(document, "\n", text) continue abs_sentence_offset_end = (state["sentence"]["abs_offset"] + parts["char_offsets"][-1] + len(parts["words"][-1])) parts["stable_id"] = construct_stable_id( document, "sentence", state["sentence"]["abs_offset"], abs_sentence_offset_end, ) state["sentence"]["idx"] += 1 state["sentence"]["abs_offset"] = abs_sentence_offset_end parts["name"] = name yield Sentence(**parts) def _parse_paragraph(self, node, state): """Parse a 
        """Parse a Paragraph of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        # Both Paragraphs will share the same parent
        parent = (
            state["context"][node]
            if node in state["context"]
            else state["parent"][node]
        )

        # Set name for Paragraph
        name = node.attrib["name"] if "name" in node.attrib else None

        for field in ["text"]:
            if node.tag != "paragraph":
                continue

            # Process the Paragraph
            stable_id = f"{state['document'].name}::paragraph:{state['paragraph']['idx']}"
            parts = {}
            parts["stable_id"] = stable_id
            parts["name"] = name
            parts["document"] = state["document"]
            parts["position"] = state["paragraph"]["idx"]
            if isinstance(parent, Caption):
                if parent.table:
                    parts["section"] = parent.table.section
                elif parent.figure:
                    parts["section"] = parent.figure.section
                parts["caption"] = parent
            elif isinstance(parent, Cell):
                parts["section"] = parent.table.section
                parts["cell"] = parent
            elif isinstance(parent, Section):
                parts["section"] = parent
            elif isinstance(parent, Figure):
                # occurs with text in the tail of an img
                parts["section"] = parent.section
            elif isinstance(parent, Table):
                # occurs with text in the tail of a table
                parts["section"] = parent.section
            else:
                raise NotImplementedError(
                    f"Paragraph parent must be Section, Caption, or Cell, "
                    f"not {parent}"
                )

            # Create the Paragraph entry in the DB
            paragraph = Paragraph(**parts)

            state["paragraph"]["idx"] += 1

            try:
                yield from self._parse_sentence(paragraph, node, state)
            except Exception as e:
                print(e.__doc__)
                print(e)

    def _parse_section(self, node, state):
        """Parse a Section of the node.

        Note that this implementation currently creates a Section at the
        beginning of the document and creates Sections based on the tag of
        the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["pages", "section"]:
            return state

        # Add a Section
        stable_id = f"{state['document'].name}::section:{state['section']['idx']}"

        # Set name for Section
        name = node.attrib["name"] if "name" in node.attrib else None

        state["context"][node] = Section(
            document=state["document"],
            name=name,
            stable_id=stable_id,
            position=state["section"]["idx"],
        )
        state["section"]["idx"] += 1
        return state

    def _parse_caption(self, node, state):
        """Parse a Caption of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["caption", "figcaption"]:  # captions used in Tables
            return state

        # Add a Caption
        parent = state["parent"][node]
        stable_id = f"{state['document'].name}::caption:{state['caption']['idx']}"

        # Set name for Caption
        name = node.attrib["name"] if "name" in node.attrib else None

        if isinstance(parent, Table):
            state["context"][node] = Caption(
                document=state["document"],
                table=parent,
                figure=None,
                stable_id=stable_id,
                name=name,
                position=state["caption"]["idx"],
            )
        elif isinstance(parent, Figure):
            state["context"][node] = Caption(
                document=state["document"],
                table=None,
                figure=parent,
                stable_id=stable_id,
                name=name,
                position=state["caption"]["idx"],
            )
        else:
            raise NotImplementedError("Caption must be a child of Table or Figure.")
        state["caption"]["idx"] += 1
        return state

    def _parse_node(self, node, state):
        """Entry point for parsing all node types.
        :param node: The lxml HTML node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :rtype: a *generator* of Sentences
        """
        # Processing on entry of node
        state = self._parse_section(node, state)

        state = self._parse_figure(node, state)

        if self.tabular:
            state = self._parse_table(node, state)

        state = self._parse_caption(node, state)

        yield from self._parse_paragraph(node, state)

    def parse(self, document, text):
        """Depth-first search over the provided tree.

        Implemented as an iterative procedure. The structure of the state
        needed to parse each node is also defined in this function.

        :param document: the Document context
        :param text: the structured text of the document (e.g. HTML)
        :rtype: a *generator* of Sentences.
        """
        stack = []

        root = lxml.etree.fromstring(text)

        # flatten children of the node that are in the 'flatten' list
        if self.flatten:
            lxml.etree.strip_tags(root, self.flatten)
        # Strip comments
        lxml.etree.strip_tags(root, lxml.etree.Comment)

        # Assign the text, stripped of the 'flatten' tags, to the document
        document.text = lxml.etree.tostring(root, encoding="unicode")

        # This dictionary contains the global state necessary to parse a
        # document and each context element. This reflects the relationships
        # defined in parser/models. This contains the state necessary to
        # create the respective Contexts within the document.
        state = {
            "visited": set(),
            "parent": {},  # map of parent[child] = node used to discover child
            "context": {},  # track the Context of each node (context['td'] = Cell)
            "root": root,
            "document": document,
            "section": {"idx": 0},
            "paragraph": {"idx": 0},
            "figure": {"idx": 0},
            "caption": {"idx": 0},
            "table": {"idx": 0},
            "sentence": {"idx": 0, "abs_offset": 0},
        }
        # NOTE: Currently the helper functions directly manipulate the state
        # rather than returning a modified copy.

        # Iterative Depth-First Search
        stack.append(root)
        state["parent"][root] = document
        state["context"][root] = document
        tokenized_sentences = []
        while len(stack) != 0:
            node = stack.pop()
            if node not in state["visited"]:
                state["visited"].add(node)  # mark as visited
                # Process
                if self.lingual:
                    tokenized_sentences += [y for y in self._parse_node(node, state)]
                else:
                    yield from self._parse_node(node, state)

            # NOTE: This reversed() order is to ensure that the iterative
            # DFS matches the order that would be produced by a recursive
            # DFS implementation.
            # Sentences of a paragraph node are parsed above, so do not
            # descend into its children.
            if node.tag == "paragraph":
                continue
            for child in reversed(node):
                # Skip nodes that are blacklisted
                if self.blacklist and child.tag in self.blacklist:
                    continue

                stack.append(child)

                # store the parent of the node, which is either the parent
                # Context, or if the parent did not create a Context, then
                # use the node's parent Context.
                state["parent"][child] = (
                    state["context"][node]
                    if node in state["context"]
                    else state["parent"][node]
                )

        if self.lingual:
            # Enrich the collected sentences with NLP and yield them.
            yield from self.enrich_tokenized_sentences_with_nlp(tokenized_sentences)
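# --- Illustrative sketch (not from the source) ---
# The parse() methods above use an iterative depth-first search; pushing
# children in reversed() order reproduces the visit order of a recursive
# pre-order traversal, as this standalone example shows.
import lxml.etree

root = lxml.etree.fromstring("<a><b/><c><d/></c></a>")
stack, order = [root], []
while stack:
    node = stack.pop()
    order.append(node.tag)
    for child in reversed(node):
        stack.append(child)
assert order == ["a", "b", "c", "d"]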
class ParserUDF(UDF):
    def __init__(
        self,
        structural,
        blacklist,
        flatten,
        lingual,
        strip,
        replacements,
        tabular,
        visual,
        pdf_path,
        language,
        **kwargs,
    ):
        """
        :param visual: boolean, if True visual features are used in the model
        :param pdf_path: directory where PDFs are saved; if a PDF file is not
            found, it will be created from the HTML document and saved in
            that directory
        :param replacements: a list of (_pattern_, _replace_) tuples where
            _pattern_ is a regex and _replace_ is a character string. All
            occurrences of _pattern_ in the text will be replaced by
            _replace_.
        """
        super(ParserUDF, self).__init__(**kwargs)

        # structural (html) setup
        self.structural = structural
        self.blacklist = blacklist if isinstance(blacklist, list) else [blacklist]
        self.flatten = flatten if isinstance(flatten, list) else [flatten]

        # lingual setup
        self.language = language
        self.strip = strip
        self.replacements = []
        for (pattern, replace) in replacements:
            self.replacements.append((re.compile(pattern, flags=re.UNICODE), replace))

        self.lingual = lingual
        self.lingual_parser = Spacy(self.language)
        if self.lingual_parser.has_tokenizer_support():
            self.tokenize_and_split_sentences = self.lingual_parser.split_sentences
            self.lingual_parser.load_lang_model()
        else:
            self.tokenize_and_split_sentences = SimpleTokenizer().parse

        if self.lingual:
            if self.lingual_parser.has_NLP_support():
                self.enrich_tokenized_sentences_with_nlp = (
                    self.lingual_parser.enrich_sentences_with_NLP
                )
            else:
                logger.warning(
                    "Lingual mode will be turned off, "
                    "as spaCy doesn't provide support for this "
                    "language ({})".format(self.language)
                )
                self.lingual = False

        # tabular setup
        self.tabular = tabular

        # visual setup
        self.visual = visual
        if self.visual:
            self.pdf_path = pdf_path
            self.vizlink = VisualLinker()

    def apply(self, document, **kwargs):
        # The document is the Document model
        text = document.text
        if self.visual:
            if not self.pdf_path:
                warnings.warn(
                    "Visual parsing failed: pdf_path is required. "
                    "Proceeding without visual parsing.",
                    RuntimeWarning,
                )
                self.visual = False
                yield from self.parse(document, text)
            elif not self._valid_pdf(self.pdf_path, document.name):
                warnings.warn(
                    "Visual parse failed. {} not a PDF. {}".format(
                        self.pdf_path + document.name,
                        "Proceeding without visual parsing.",
                    ),
                    RuntimeWarning,
                )
                self.visual = False
                yield from self.parse(document, text)
            else:
                # Populate document.sentences
                for _ in self.parse(document, text):
                    pass
                # Add visual attributes
                yield from self.vizlink.parse_visual(
                    document.name, document.sentences, self.pdf_path
                )
        else:
            yield from self.parse(document, text)

    def _valid_pdf(self, path, filename):
        """Verify that the file exists and has a PDF extension."""
        # If path itself is a file, check that it is a PDF.
        if os.path.isfile(path) and path.lower().endswith(".pdf"):
            return True
        else:
            full_path = os.path.join(path, filename)
            if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
                return True
            elif os.path.isfile(os.path.join(path, filename + ".pdf")):
                return True
            elif os.path.isfile(os.path.join(path, filename + ".PDF")):
                return True
        return False

    def _parse_table(self, node, state):
        """Parse a table node.

        :param node: The lxml table node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
""" if not self.tabular: logger.error("Called _parse_table without tabular activated.") return state if node.tag == "table": table_idx = state["table"]["idx"] stable_id = "{}::{}:{}".format(state["document"].name, "table", state["table"]["idx"]) # Create the Table in the DB parts = {} parts["document"] = state["document"] parts["stable_id"] = stable_id parts["position"] = table_idx parent = state["parent"][node] if isinstance(parent, Cell): parts["section"] = parent.table.section elif isinstance(parent, Section): parts["section"] = parent else: raise NotImplementedError( "Table is not within a Section or Cell") state["context"][node] = Table(**parts) # Local state for each table. This is required to support nested # tables state["table"][table_idx] = { "grid": defaultdict(int), "cell_pos": 0, "row_idx": -1, "col_idx": 0, } # Increment table counter state["table"]["idx"] += 1 elif node.tag == "tr": if not isinstance(state["parent"][node], Table): raise NotImplementedError("Table row parent must be a Table.") state["table"][state["parent"][node].position]["col_idx"] = 0 state["table"][state["parent"][node].position]["row_idx"] += 1 elif node.tag in ["td", "th"]: if not isinstance(state["parent"][node], Table): raise NotImplementedError("Cell parent must be a Table.") if not state["table"][state["parent"] [node].position]["row_idx"] >= 0: raise NotImplementedError( "Table cell encountered before a table row.") # calculate row_start/col_start while state["table"][state["parent"][node].position]["grid"][( state["table"][state["parent"][node].position]["row_idx"], state["table"][state["parent"][node].position]["col_idx"], )]: # while a cell on the grid is occupied, keep moving state["table"][state["parent"][node].position]["col_idx"] += 1 col_start = state["table"][state["parent"] [node].position]["col_idx"] row_start = state["table"][state["parent"] [node].position]["row_idx"] # calculate row_end/col_end row_end = row_start if "rowspan" in node.attrib: row_end += int(node.get("rowspan")) - 1 col_end = col_start if "colspan" in node.attrib: col_end += int(node.get("colspan")) - 1 # update grid with occupied cells for r, c in itertools.product(list(range(row_start, row_end + 1)), list(range(col_start, col_end + 1))): state["table"][state["parent"][node].position]["grid"][(r, c)] = 1 # construct cell parts = defaultdict(list) parts["document"] = state["document"] parts["table"] = state["parent"][node] parts["row_start"] = row_start parts["row_end"] = row_end parts["col_start"] = col_start parts["col_end"] = col_end parts["position"] = state["table"][state["parent"] [node].position]["cell_pos"] stable_id = "{}::{}:{}:{}:{}".format( parts["document"].name, "cell", parts["table"].position, row_start, col_start, ) parts["stable_id"] = stable_id # Create the Cell in the DB state["context"][node] = Cell(**parts) # Update position state["table"][state["parent"][node].position]["col_idx"] += 1 state["table"][state["parent"][node].position]["cell_pos"] += 1 return state def _parse_figure(self, node, state): """Parse the figure node. :param node: The lxml img node to parse :param state: The global state necessary to place the node in context of the document as a whole. 
""" if node.tag not in ["img", "figure"]: return state # Process the figure stable_id = "{}::{}:{}".format(state["document"].name, "figure", state["figure"]["idx"]) # img within a Figure get's processed in the parent Figure if node.tag == "img" and isinstance(state["parent"][node], Figure): return state # NOTE: We currently do NOT support nested figures. parts = {} parent = state["parent"][node] if isinstance(parent, Section): parts["section"] = parent elif isinstance(parent, Cell): parts["section"] = parent.table.section parts["cell"] = parent else: logger.warning("Figure is nested within {}".format( state["parent"][node])) return state parts["document"] = state["document"] parts["stable_id"] = stable_id parts["position"] = state["figure"]["idx"] # If processing a raw img if node.tag == "img": # Create the Figure entry in the DB parts["url"] = node.get("src") state["context"][node] = Figure(**parts) elif node.tag == "figure": # Pull the image from a child img node, if one exists imgs = [child for child in node if child.tag == "img"] if len(imgs) > 1: logger.warning("Figure contains multiple images.") # Right now we don't support multiple URLs in the Figure context # As a workaround, just ignore the outer Figure and allow processing # of the individual images. We ignore the accompanying figcaption # by marking it as visited. captions = [ child for child in node if child.tag == "figcaption" ] state["visited"].update(captions) return state img = imgs[0] state["visited"].add(img) # Create the Figure entry in the DB parts["url"] = img.get("src") state["context"][node] = Figure(**parts) state["figure"]["idx"] += 1 return state def _parse_sentence(self, paragraph, node, state): """Parse the Sentences of the node. :param node: The lxml node to parse :param state: The global state necessary to place the node in context of the document as a whole. """ text = state["paragraph"]["text"] field = state["paragraph"]["field"] # Lingual Parse document = state["document"] for parts in self.tokenize_and_split_sentences(document, text): parts["document"] = document # NOTE: Why do we overwrite this from the spacy parse? parts["position"] = state["sentence"]["idx"] abs_sentence_offset_end = (state["sentence"]["abs_offset"] + parts["char_offsets"][-1] + len(parts["words"][-1])) parts["stable_id"] = construct_stable_id( document, "sentence", state["sentence"]["abs_offset"], abs_sentence_offset_end, ) state["sentence"]["abs_offset"] = abs_sentence_offset_end if self.structural: context_node = node.getparent() if field == "tail" else node tree = lxml.etree.ElementTree(state["root"]) parts["xpath"] = tree.getpath(context_node) parts["html_tag"] = context_node.tag parts["html_attrs"] = [ "=".join(x) for x in list(context_node.attrib.items()) ] # Extending html style attribute with the styles # from inline style class for the element. cur_style_index = None for index, attr in enumerate(parts["html_attrs"]): if attr.find("style") >= 0: cur_style_index = index break head = state["root"].find("head") styles = None if head is not None: styles = head.find("style") if styles is not None: for x in list(context_node.attrib.items()): if x[0] == "class": exp = r"(." 
                            exp = r"(." + x[1] + r")([\n\s\r]*)\{(.*?)\}"
                            r = re.compile(exp, re.DOTALL)
                            if r.search(styles.text) is not None:
                                if cur_style_index is not None:
                                    parts["html_attrs"][cur_style_index] += (
                                        r.search(styles.text)
                                        .group(3)
                                        .replace("\r", "")
                                        .replace("\n", "")
                                        .replace("\t", "")
                                    )
                                else:
                                    parts["html_attrs"].extend(
                                        [
                                            "style="
                                            + re.sub(
                                                r"\s{1,}",
                                                " ",
                                                r.search(styles.text)
                                                .group(3)
                                                .replace("\r", "")
                                                .replace("\n", "")
                                                .replace("\t", "")
                                                .strip(),
                                            )
                                        ]
                                    )
                            break
            if self.tabular:
                parts["position"] = state["sentence"]["idx"]

                # If tabular, consider own Context first in case a Cell
                # was just created. Otherwise, defer to the parent.
                parent = paragraph
                if isinstance(parent, Paragraph):
                    parts["section"] = parent.section
                    parts["paragraph"] = parent
                    if parent.cell:
                        parts["table"] = parent.cell.table
                        parts["cell"] = parent.cell
                        parts["row_start"] = parent.cell.row_start
                        parts["row_end"] = parent.cell.row_end
                        parts["col_start"] = parent.cell.col_start
                        parts["col_end"] = parent.cell.col_end
                else:
                    raise NotImplementedError("Sentence parent must be Paragraph.")

            yield Sentence(**parts)
            state["sentence"]["idx"] += 1

    def _parse_paragraph(self, node, state):
        """Parse a Paragraph of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        # Both Paragraphs will share the same parent
        parent = (
            state["context"][node]
            if node in state["context"]
            else state["parent"][node]
        )
        for field in ["text", "tail"]:
            text = getattr(node, field)
            text = text.strip() if text and self.strip else text

            # Skip if "" or None
            if not text:
                continue

            # Run RegEx replacements
            for (rgx, replace) in self.replacements:
                text = rgx.sub(replace, text)

            # Process the Paragraph
            stable_id = "{}::{}:{}".format(
                state["document"].name, "paragraph", state["paragraph"]["idx"]
            )
            parts = {}
            parts["stable_id"] = stable_id
            parts["document"] = state["document"]
            parts["position"] = state["paragraph"]["idx"]
            if isinstance(parent, Caption):
                if parent.table:
                    parts["section"] = parent.table.section
                elif parent.figure:
                    parts["section"] = parent.figure.section
                parts["caption"] = parent
            elif isinstance(parent, Cell):
                parts["section"] = parent.table.section
                parts["cell"] = parent
            elif isinstance(parent, Section):
                parts["section"] = parent
            elif isinstance(parent, Figure):
                # occurs with text in the tail of an img
                parts["section"] = parent.section
            else:
                raise NotImplementedError(
                    'Paragraph "{}" parent must be Section, Caption, or Cell, '
                    "not {}".format(text, parent)
                )

            # Create the Paragraph entry in the DB
            paragraph = Paragraph(**parts)

            state["paragraph"]["idx"] += 1
            state["paragraph"]["text"] = text
            state["paragraph"]["field"] = field

            yield from self._parse_sentence(paragraph, node, state)

    def _parse_section(self, node, state):
        """Parse a Section of the node.

        Note that this implementation currently just creates a single Section
        for a document.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag != "html":
            return state

        # Add a Section
        stable_id = "{}::{}:{}".format(
            state["document"].name, "section", state["section"]["idx"]
        )
        state["context"][node] = Section(
            document=state["document"],
            stable_id=stable_id,
            position=state["section"]["idx"],
        )
        state["section"]["idx"] += 1
        return state

    def _parse_caption(self, node, state):
        """Parse a Caption of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
""" if node.tag not in ["caption", "figcaption"]: # captions used in Tables return state # Add a Caption parent = state["parent"][node] stable_id = "{}::{}:{}".format(state["document"].name, "caption", state["caption"]["idx"]) if isinstance(parent, Table): state["context"][node] = Caption( document=state["document"], table=parent, figure=None, stable_id=stable_id, position=state["caption"]["idx"], ) elif isinstance(parent, Figure): state["context"][node] = Caption( document=state["document"], table=None, figure=parent, stable_id=stable_id, position=state["caption"]["idx"], ) else: raise NotImplementedError( "Caption must be a child of Table or Figure.") state["caption"]["idx"] += 1 return state def _parse_node(self, node, state): """Entry point for parsing all node types. :param node: The lxml HTML node to parse :param state: The global state necessary to place the node in context of the document as a whole. :rtype: a *generator* of Sentences """ # Processing on entry of node state = self._parse_section(node, state) state = self._parse_figure(node, state) if self.tabular: state = self._parse_table(node, state) state = self._parse_caption(node, state) yield from self._parse_paragraph(node, state) def parse(self, document, text): """Depth-first search over the provided tree. Implemented as an iterative procedure. The structure of the state needed to parse each node is also defined in this function. :param document: the Document context :param text: the structured text of the document (e.g. HTML) :rtype: a *generator* of Sentences. """ stack = [] root = lxml.html.fromstring(text) document.text = text # flattens children of node that are in the 'flatten' list if self.flatten: lxml.etree.strip_tags(root, self.flatten) # This dictionary contain the global state necessary to parse a # document and each context element. This reflects the relationships # defined in parser/models. This contains the state necessary to create # the respective Contexts within the document. state = { "visited": set(), "parent": {}, # map of parent[child] = node used to discover child "context": {}, # track the Context of each node (context['td'] = Cell) "root": root, "document": document, "section": { "idx": 0 }, "paragraph": { "idx": 0 }, "figure": { "idx": 0 }, "caption": { "idx": 0 }, "table": { "idx": 0 }, "sentence": { "idx": 0, "abs_offset": 0 }, } # NOTE: Currently the helper functions directly manipulate the state # rather than returning a modified copy. # Iterative Depth-First Search stack.append(root) state["parent"][root] = document state["context"][root] = document tokenized_sentences = [] while stack: node = stack.pop() if node not in state["visited"]: state["visited"].add(node) # mark as visited # Process if self.lingual: tokenized_sentences += [ y for y in self._parse_node(node, state) ] else: yield from self._parse_node(node, state) # NOTE: This reversed() order is to ensure that the iterative # DFS matches the order that would be produced by a recursive # DFS implementation. for child in reversed(node): # Skip nodes that are comments or blacklisted if child.tag is lxml.etree.Comment or ( self.blacklist and child.tag in self.blacklist): continue stack.append(child) # store the parent of the node, which is either the parent # Context, or if the parent did not create a Context, then # use the node's parent Context. state["parent"][child] = (state["context"][node] if node in state["context"] else state["parent"][node]) if self.lingual: yield from self.enrich_tokenized_sentences_with_nlp( tokenized_sentences)