def get_stable_id(self) -> str:
    """Return a stable id.

    :rtype: string
    """
    # A Section spans its whole parent context, so both char offsets are 0.
    identity = self._get_polymorphic_identity()
    return construct_stable_id(self.section, identity, 0, 0)
def parse(self, document, contents):
    """Parse the document.

    :param document: The Document context of the data model.
    :param contents: The text contents of the document.
    :rtype: a *generator* of tokenized text.
    """
    position = 0
    for chunk in contents.split(self.delim):
        # Skip delimiter-separated chunks that are empty or whitespace-only.
        if not chunk.strip():
            continue
        tokens = chunk.split()
        # Character offsets of each token within the single-space-joined
        # text produced below: a running sum of len(token) + 1.
        offsets = []
        cursor = 0
        for token in tokens:
            offsets.append(cursor)
            cursor += len(token) + 1
        normalized = " ".join(tokens)
        stable_id = construct_stable_id(document, "sentence", position, position)
        yield {
            "text": normalized,
            "words": tokens,
            "pos_tags": [""] * len(tokens),
            "ner_tags": [""] * len(tokens),
            "lemmas": [""] * len(tokens),
            "dep_parents": [0] * len(tokens),
            "dep_labels": [""] * len(tokens),
            "char_offsets": offsets,
            "abs_char_offsets": offsets,
            "stable_id": stable_id,
        }
        position += 1
def get_stable_id(self) -> str:
    """Return a stable id."""
    # The id is anchored on the parent sentence plus this mention's
    # character span within it.
    identity = self._get_polymorphic_identity()
    return construct_stable_id(
        self.sentence, identity, self.char_start, self.char_end
    )
def _parse_sentence(self, paragraph, node, state):
    """Parse the Sentences of the node.

    Each direct child of ``node`` is treated as one sentence whose
    children are word elements, which in turn contain character
    elements (PDF-style markup carrying ``bbox`` coordinates).

    :param paragraph: The parent Paragraph context of the Sentences.
    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    # Set name for Sentence
    name = node.attrib["name"] if "name" in node.attrib else None
    # Lingual Parse
    document = state["document"]
    for sentence in node:
        parts = defaultdict(list)
        parts["document"] = document
        has_bbox = 0
        text = ""
        words = []
        char_abs_offsets = []
        start = 0
        for word in sentence:
            w = ""
            for char in word:
                # Only character elements carrying layout coordinates
                # mark this sentence as visually grounded.
                if "bbox" in char.attrib:
                    has_bbox = 1
                w += char.text
            words.append(w)
            char_abs_offsets.append(start)
            # NOTE(review): assumes each child of ``word`` holds exactly
            # one character, i.e. len(word) == len(w) — confirm upstream.
            start += 1 + len(word)
            # Raw string r"\s+" avoids the invalid "\s" escape warning.
            text += re.sub(r"\s+", " ", w)
            text += " "
        # Skip sentences with no coordinates, whitespace-only text, or
        # no word starting with an alphanumeric character.
        if not has_bbox:
            continue
        if text.isspace():
            continue
        if not any(p and p[0].isalnum() for p in words):
            continue
        if not text:
            continue
        for i in range(len(words)):
            parts["words"].append(words[i].replace(" ", "_"))
            parts["lemmas"].append(words[i].replace(" ", "_"))
            parts["ner_tags"].append(
                "")  # placeholder for later NLP parsing
            parts["char_offsets"].append(char_abs_offsets[i])
            parts["dep_parents"].append(
                0)  # placeholder for later NLP parsing
            parts["dep_labels"].append(
                "")  # placeholder for later NLP parsing
        # text[:-1] drops the trailing space added after the last word.
        parts["text"], parts["pos_tags"] = self.lingual_parser.tagger(
            text[:-1])
        abs_offset = state["sentence"]["abs_offset"]
        parts["abs_char_offsets"] = [
            char_offset + abs_offset for char_offset in parts["char_offsets"]
        ]
        # Previously assigned twice (again under ``if self.tabular``);
        # a single assignment is equivalent.
        parts["position"] = state["sentence"]["idx"]
        # If tabular, consider own Context first in case a Cell
        # was just created. Otherwise, defer to the parent.
        parent = paragraph
        if isinstance(parent, Paragraph):
            parts["section"] = parent.section
            parts["paragraph"] = parent
        else:
            raise NotImplementedError(
                "Sentence parent must be Paragraph.")
        if self.structural:
            context_node = sentence
            tree = lxml.etree.ElementTree(state["root"])
            parts["xpath"] = tree.getpath(context_node)
            parts["html_tag"] = context_node.tag
            parts["html_attrs"] = []
            temp_attrs = []
            for word in sentence:
                if len(word) == 0:
                    continue
                # Serialize the first character's attributes (minus the
                # layout bbox) as space-separated "key=value" pairs.
                t = ""
                for k, v in word[0].attrib.items():
                    if k != "bbox":
                        v = v.replace(" ", "")
                        t = t + k + "=" + v + " "
                t = t[:-1]
                temp_attrs.append(t)
            for temp_attr in temp_attrs:
                parts["html_attrs"].append(temp_attr)
        if self.visual:
            page = []
            top = []
            left = []
            right = []
            bottom = []
            # Page number and page bbox come from the parent page node.
            p = int(node.getparent().get("id"))
            bbox = node.getparent().get("bbox")
            bbox = bbox.split(",")
            height = int(round(float(bbox[3])))
            # hack to handle erroneous coordinates in a sentence: skip
            # the whole sentence if any word's bbox fails to parse.
            try:
                for word in sentence:
                    if len(word) == 0:
                        continue
                    coord_f = word[0].attrib[
                        "bbox"]  # coordinate first character of word
                    coord_l = word[-1].attrib["bbox"]
                    coord_f = coord_f.split(",")
                    coord_l = coord_l.split(",")
                    page.append(p)
                    left.append(int(round(float(coord_f[0]))))
                    # PDF y-coordinates grow upward; flip to top-down by
                    # subtracting from the page height.
                    bottom.append(height - int(round(float(coord_f[1]))))
                    right.append(int(round(float(coord_l[2]))))
                    if height > int(round(float(coord_f[3]))):
                        top.append(height - int(round(float(coord_f[3]))))
                    else:
                        top.append(0)
                parts["page"] = page
                parts["left"] = left
                parts["top"] = top
                parts["right"] = right
                parts["bottom"] = bottom
            except Exception as e:
                # Best-effort: report and skip this sentence rather than
                # aborting the whole document parse.
                print(e)
                print(document, "\n", text)
                continue
        abs_sentence_offset_end = (state["sentence"]["abs_offset"] +
                                   parts["char_offsets"][-1] +
                                   len(parts["words"][-1]))
        parts["stable_id"] = construct_stable_id(
            document,
            "sentence",
            state["sentence"]["abs_offset"],
            abs_sentence_offset_end,
        )
        state["sentence"]["idx"] += 1
        state["sentence"]["abs_offset"] = abs_sentence_offset_end
        parts["name"] = name
        yield Sentence(**parts)
def get_stable_id(self) -> str:
    """Return a stable id."""
    # This context covers its entire parent paragraph, hence the fixed
    # (0, 0) character offsets.
    return construct_stable_id(
        self.paragraph,
        self._get_polymorphic_identity(),
        0,
        0,
    )
def _parse_sentence(self, paragraph: Paragraph, node: HtmlElement, state: Dict[str, Any]) -> Iterator[Sentence]: """Parse the Sentences of the node. :param node: The lxml node to parse :param state: The global state necessary to place the node in context of the document as a whole. """ text = state["paragraph"]["text"] field = state["paragraph"]["field"] # Set name for Sentence name = node.attrib["name"] if "name" in node.attrib else None # Lingual Parse document = state["document"] for parts in self.lingual_parser.split_sentences(text): abs_offset = state["sentence"]["abs_offset"] parts["abs_char_offsets"] = [ char_offset + abs_offset for char_offset in parts["char_offsets"] ] parts["document"] = document # NOTE: Why do we overwrite this from the spacy parse? parts["position"] = state["sentence"]["idx"] abs_sentence_offset_end = (state["sentence"]["abs_offset"] + parts["char_offsets"][-1] + len(parts["words"][-1])) parts["stable_id"] = construct_stable_id( document, "sentence", state["sentence"]["abs_offset"], abs_sentence_offset_end, ) parts["name"] = name state["sentence"]["abs_offset"] = abs_sentence_offset_end if self.structural: context_node = node.getparent() if field == "tail" else node tree = lxml.etree.ElementTree(state["root"]) parts["xpath"] = tree.getpath(context_node) parts["html_tag"] = context_node.tag parts["html_attrs"] = [ "=".join(x) for x in list(context_node.attrib.items()) ] # Extending html style attribute with the styles # from inline style class for the element. cur_style_index = None for index, attr in enumerate(parts["html_attrs"]): if attr.find("style") >= 0: cur_style_index = index break head = state["root"].find("head") styles = None if head is not None: styles = head.find("style") if styles is not None: for x in list(context_node.attrib.items()): if x[0] == "class": exp = r"(." 
+ x[1] + r")([\n\s\r]*)\{(.*?)\}" r = re.compile(exp, re.DOTALL) if r.search(styles.text) is not None: if cur_style_index is not None: parts["html_attrs"][cur_style_index] += ( r.search(styles.text).group(3).replace( "\r", "").replace("\n", "").replace("\t", "")) else: parts["html_attrs"].extend([ "style=" + re.sub( r"\s{1,}", " ", r.search( styles.text).group(3).replace( "\r", "").replace( "\n", "").replace( "\t", "").strip(), ) ]) break parts["position"] = state["sentence"]["idx"] # If tabular, consider own Context first in case a Cell # was just created. Otherwise, defer to the parent. parent = paragraph if isinstance(parent, Paragraph): parts["section"] = parent.section parts["paragraph"] = parent if parent.cell: # if True self.tabular is also always True parts["table"] = parent.cell.table parts["cell"] = parent.cell parts["row_start"] = parent.cell.row_start parts["row_end"] = parent.cell.row_end parts["col_start"] = parent.cell.col_start parts["col_end"] = parent.cell.col_end else: raise NotImplementedError("Sentence parent must be Paragraph.") yield Sentence(**parts) state["sentence"]["idx"] += 1