def _parse_paragraph(self, node, state): """Parse a Paragraph of the node. A Paragraph is defined as :param node: The lxml node to parse :param state: The global state necessary to place the node in context of the document as a whole. """ # Both Paragraphs will share the same parent parent = ( state["context"][node] if node in state["context"] else state["parent"][node] ) for field in ["text", "tail"]: text = getattr(node, field) text = text.strip() if text and self.strip else text # Skip if "" or None if not text: continue # Run RegEx replacements for (rgx, replace) in self.replacements: text = rgx.sub(replace, text) # Process the Paragraph stable_id = "{}::{}:{}".format( state["document"].name, "paragraph", state["paragraph"]["idx"] ) parts = {} parts["stable_id"] = stable_id parts["document"] = state["document"] parts["position"] = state["paragraph"]["idx"] if isinstance(parent, Caption): if parent.table: parts["section"] = parent.table.section elif parent.figure: parts["section"] = parent.figure.section parts["caption"] = parent elif isinstance(parent, Cell): parts["section"] = parent.table.section parts["cell"] = parent elif isinstance(parent, Section): parts["section"] = parent else: raise NotImplementedError( "Paragraph parent must be Section, Caption, or Cell" ) # Create the Figure entry in the DB paragraph = Paragraph(**parts) state["paragraph"]["idx"] += 1 state["paragraph"]["text"] = text state["paragraph"]["field"] = field # Parse the Sentences in the Paragraph yield from self._parse_sentence(paragraph, node, state) return state
def _parse_paragraph(self, node, state): """Parse a Paragraph of the node. :param node: The lxml node to parse :param state: The global state necessary to place the node in context of the document as a whole. """ # Both Paragraphs will share the same parent parent = (state["context"][node] if node in state["context"] else state["parent"][node]) # Set name for Paragraph name = node.attrib["name"] if "name" in node.attrib else None for field in ["text"]: if node.tag != "paragraph": continue # Process the Paragraph stable_id = (f"{state['document'].name}" f"::" f"{'paragraph'}" f":" f"{state['paragraph']['idx']}") parts = {} parts["stable_id"] = stable_id parts["name"] = name parts["document"] = state["document"] parts["position"] = state["paragraph"]["idx"] if isinstance(parent, Caption): if parent.table: parts["section"] = parent.table.section elif parent.figure: parts["section"] = parent.figure.section parts["caption"] = parent elif isinstance(parent, Cell): parts["section"] = parent.table.section parts["cell"] = parent elif isinstance(parent, Section): parts["section"] = parent elif isinstance(parent, Figure): # occurs with text in the tail of an img parts["section"] = parent.section elif isinstance(parent, Table): # occurs with text in the tail of a table parts["section"] = parent.section else: raise NotImplementedError( f"Para '{text}' parent must be Section, Caption, or Cell, " f"not {parent}") # Create the entry in the DB paragraph = Paragraph(**parts) state["paragraph"]["idx"] += 1 try: yield from self._parse_sentence(paragraph, node, state) except Exception as e: print(e.__doc__) print(e.message)
def _parse_paragraph(self, node: HtmlElement, state: Dict[str, Any]) -> Iterator[Sentence]:
    """Parse the Paragraphs of the node.

    Up to two Paragraphs are created per node: one for the node's ``text``
    and one for its ``tail``. A field that is ``None`` or empty (after
    stripping, when ``self.strip`` is set) produces no Paragraph.

    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole. ``state["paragraph"]`` is updated in
        place (``idx``, ``text`` and ``field``) for every Paragraph created.
    :return: A generator over whatever ``self._parse_sentence`` yields for
        each Paragraph.
    :raises NotImplementedError: If the parent of the node is not a
        Section, Caption, Cell, Figure, or Table.
    """
    # Both Paragraphs (text and tail) will share the same parent
    if node in state["context"]:
        parent = state["context"][node]
    else:
        parent = state["parent"][node]

    # Set name for Paragraph (None when the attribute is absent)
    name = node.attrib.get("name")

    for field in ["text", "tail"]:
        text = getattr(node, field)
        if text and self.strip:
            text = text.strip()

        # Skip if "" or None
        if not text:
            continue

        # Run RegEx replacements
        for rgx, replace in self.replacements:
            text = rgx.sub(replace, text)

        # Process the Paragraph
        position = state["paragraph"]["idx"]
        parts = {
            "stable_id": f"{state['document'].name}::paragraph:{position}",
            "name": name,
            "document": state["document"],
            "position": position,
        }

        if isinstance(parent, Caption):
            # A Caption belongs to a Table or a Figure; the Paragraph
            # inherits the Section of whichever one is set.
            if parent.table:
                parts["section"] = parent.table.section
            elif parent.figure:
                parts["section"] = parent.figure.section
            parts["caption"] = parent
        elif isinstance(parent, Cell):
            parts["section"] = parent.table.section
            parts["cell"] = parent
        elif isinstance(parent, Section):
            parts["section"] = parent
        elif isinstance(parent, (Figure, Table)):
            # occurs with text in the tail of an img or of a table
            parts["section"] = parent.section
        else:
            # List every accepted parent type so the message matches the
            # branches above.
            raise NotImplementedError(
                f"Para '{text}' parent must be Section, Caption, Cell, "
                f"Figure, or Table, not {parent}"
            )

        # Create the entry in the DB
        paragraph = Paragraph(**parts)

        # Record the Paragraph being processed in the shared state before
        # handing over to the sentence parser, which receives that state.
        state["paragraph"]["idx"] += 1
        state["paragraph"]["text"] = text
        state["paragraph"]["field"] = field

        # Parse the Sentences in the Paragraph
        yield from self._parse_sentence(paragraph, node, state)