def doc_setup(): """Set up a document.""" parser_udf = get_parser_udf() doc = Document(id=1, name="test", stable_id="1::document:0:0") doc.text = """<html> <body> <h1>test1</h1> <h2>test2</h2> <div> <h3>test3</h3> <table> <tr> <td>test4</td> <td>test5</td> </tr> </table> <table> <tr> <td>test6</td> <td>test7</td> </tr> </table> </div> <p>test8 test9</p> </body> </html>""" doc = parser_udf.apply(doc) return doc
def doc_setup():
    """Set up a document with a single short sentence."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
    with codecs.open(fp, encoding=self.encoding) as f:
        reader = csv.reader(f)

        # Load CSV header
        header_names = None
        if self.header:
            header_names = next(reader)

        # Load document per row
        for i, row in enumerate(reader):
            sections = []
            for j, content in enumerate(row):
                rule = (
                    self.parser_rule[j]
                    if self.parser_rule is not None and j in self.parser_rule
                    else column_constructor
                )
                content_header = header_names[j] if header_names is not None else None
                context = [build_node(t, n, c) for t, n, c in rule(content)]
                sections.append(
                    build_node("section", content_header, "".join(context))
                )
            text = build_node("doc", None, "".join(sections))
            doc_name = name + ":" + str(i)
            stable_id = self._get_stable_id(doc_name)
            yield Document(
                name=doc_name,
                stable_id=stable_id,
                text=text,
                meta={"file_name": file_name},
            )
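# A minimal, self-contained sketch of the per-column rule dispatch above:
# each column index may map to a custom rule, and any column without one
# falls back to a default constructor. The names below (default_rule,
# upper_rule) are hypothetical and only illustrate the pattern.
def default_rule(content):
    return [("paragraph", None, content)]


def upper_rule(content):
    return [("paragraph", None, content.upper())]


parser_rule = {1: upper_rule}  # only column 1 gets a custom rule
row = ["alpha", "beta", "gamma"]
for j, content in enumerate(row):
    rule = parser_rule[j] if parser_rule is not None and j in parser_rule else default_rule
    print(rule(content))
# [('paragraph', None, 'alpha')]
# [('paragraph', None, 'BETA')]
# [('paragraph', None, 'gamma')]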
def doc_setup(): """Set up document.""" doc = Document(id=1, name="test", stable_id="1::document:0:0") doc.text = "This is apple. That is orange. Where is banaba? I like Apple." lingual_parser = SpacyParser("en") # Split sentences for parts in lingual_parser.split_sentences(doc.text): parts["document"] = doc Sentence(**parts) # Enrich sentences for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences): pass # Pick one sentence and add visual information # so that all the words get aligned horizontally. sentence: Sentence = doc.sentences[0] sentence.page = [1, 1, 1, 1] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] # Assume the 2nd sentence is horizontally aligned with 1st. sentence: Sentence = doc.sentences[1] sentence.page = [1, 1, 1, 1] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [40, 50, 60, 70] sentence.right = [50, 60, 70, 80] # Assume the 3rd sentence is vertically aligned with 1st. sentence: Sentence = doc.sentences[2] sentence.page = [1, 1, 1, 1] sentence.top = [10, 10, 10, 10] sentence.bottom = [20, 20, 20, 20] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] # Assume the 4th sentence is in 2nd page. sentence: Sentence = doc.sentences[3] sentence.page = [2, 2, 2, 2] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] return doc
def parse_file(self, fp, file_name):
    with codecs.open(fp, encoding=self.encoding) as f:
        name = os.path.basename(fp).rsplit(".", 1)[0]
        stable_id = self.get_stable_id(name)
        doc = Document(
            name=name, stable_id=stable_id, meta={"file_name": file_name}
        )
        yield doc, f.read()
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    with codecs.open(fp, encoding=self.encoding) as f:
        name = os.path.basename(fp).rsplit(".", 1)[0]
        stable_id = self._get_stable_id(name)
        text = build_node("doc", None, build_node("text", None, f.read().strip()))
        yield Document(
            name=name, stable_id=stable_id, text=text, meta={"file_name": file_name}
        )
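# The two stem-extraction idioms used across these preprocessors agree for
# ordinary file names but differ when there is no extension (standalone
# stdlib-only demo):
import os

base = os.path.basename("/data/report.final.txt")
print(base.rsplit(".", 1)[0])   # 'report.final'
print(base[: base.rfind(".")])  # 'report.final'

base = "README"  # no extension
print(base.rsplit(".", 1)[0])   # 'README'
print(base[: base.rfind(".")])  # 'READM' -- rfind() returns -1 and trims a char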
def parse_file(self, fp, file_name):
    with codecs.open(fp, encoding=self.encoding) as tsv:
        for line in tsv:
            (doc_name, doc_text) = line.split("\t")
            stable_id = self.get_stable_id(doc_name)
            doc = Document(
                name=doc_name, stable_id=stable_id, meta={"file_name": file_name}
            )
            yield doc, doc_text
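# Caveat for the tab-split above: unpacking `line.split("\t")` raises a
# ValueError if the document text itself contains a tab. A sketch of a more
# defensive split (an assumption, not the library's actual behavior):
line = "doc1\tsome text\twith a tab\n"
doc_name, doc_text = line.rstrip("\n").split("\t", 1)
print(doc_name)  # doc1
print(doc_text)  # some text\twith a tab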
def parse_file(self, f, file_name):
    for i, doc in enumerate(et.parse(f).xpath(self.doc)):
        doc_id = str(doc.xpath(self.id)[0])
        text = "\n".join([t for t in doc.xpath(self.text) if t is not None])
        meta = {"file_name": str(file_name)}
        if self.keep_xml_tree:
            meta["root"] = et.tostring(doc)
        stable_id = self.get_stable_id(doc_id)
        yield Document(name=doc_id, stable_id=stable_id, meta=meta), text
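# Sketch of the configurable-XPath extraction above with toy expressions in
# place of self.doc / self.id / self.text (assumes lxml is installed):
from io import BytesIO
from lxml import etree as et

xml = b"<corpus><doc><id>d1</id><t>hello</t><t>world</t></doc></corpus>"
tree = et.parse(BytesIO(xml))
for doc in tree.xpath("//doc"):
    doc_id = str(doc.xpath("./id/text()")[0])
    text = "\n".join(t for t in doc.xpath("./t/text()") if t is not None)
    print(doc_id, text)
# d1 hello
# world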
def parse_file(self, fp, file_name):
    with codecs.open(fp, encoding=self.encoding) as f:
        soup = BeautifulSoup(f, "lxml")
        for text in soup.find_all("html"):
            name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
            stable_id = self.get_stable_id(name)
            yield Document(
                name=name,
                stable_id=stable_id,
                text=str(text),
                meta={"file_name": file_name},
            ), str(text)
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    with codecs.open(fp, encoding=self.encoding) as tsv:
        if self.header:
            tsv.readline()
        for line in tsv:
            (doc_name, doc_text) = line.split("\t")
            stable_id = self._get_stable_id(doc_name)
            text = build_node("doc", None, build_node("text", None, doc_text))
            yield Document(
                name=doc_name,
                stable_id=stable_id,
                text=text,
                meta={"file_name": file_name},
            )
def parse_file(self, fp, file_name):
    with codecs.open(fp, encoding=self.encoding) as f:
        soup = BeautifulSoup(f, "lxml")
        all_html_elements = soup.find_all("html")
        if len(all_html_elements) != 1:
            raise NotImplementedError("Expecting one html element per html file")
        text = all_html_elements[0]
        name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
        stable_id = self.get_stable_id(name)
        yield Document(
            name=name,
            stable_id=stable_id,
            text=str(text),
            meta={"file_name": file_name},
        )
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    with codecs.open(fp, encoding=self.encoding) as f:
        soup = BeautifulSoup(f, "lxml")
        all_xml_elements = soup.find_all("pages")
        if len(all_xml_elements) != 1:
            raise NotImplementedError(f"unsupported format file: {file_name}")
        text = all_xml_elements[0]
        name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
        stable_id = self._get_stable_id(name)
        yield Document(
            name=name,
            stable_id=stable_id,
            text=str(text),
            meta={"file_name": file_name},
        )
def _preprocess_visual_features(doc: Document) -> None:
    if hasattr(doc, "_visual_features"):  # cache flag
        return
    doc._visual_features = True

    sentence_by_page: DefaultDict[int, List[Sentence]] = defaultdict(list)
    for sentence in doc.sentences:
        sentence_by_page[sentence.page[0]].append(sentence)
        sentence._aligned_lemmas = set()

    for page, sentences in sentence_by_page.items():
        # process per page alignments
        yc_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        x0_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        xc_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        x1_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        for sentence in sentences:
            sentence.bbox = bbox_from_sentence(sentence)
            sentence.yc = (sentence.bbox.top + sentence.bbox.bottom) / 2
            sentence.x0 = sentence.bbox.left
            sentence.x1 = sentence.bbox.right
            sentence.xc = (sentence.x0 + sentence.x1) / 2
            # index current sentence by different alignment keys
            yc_aligned[sentence.yc].append(sentence)
            x0_aligned[sentence.x0].append(sentence)
            x1_aligned[sentence.x1].append(sentence)
            xc_aligned[sentence.xc].append(sentence)
        for l in yc_aligned.values():
            l.sort(key=lambda p: p.xc)
        for l in x0_aligned.values():
            l.sort(key=lambda p: p.yc)
        for l in x1_aligned.values():
            l.sort(key=lambda p: p.yc)
        for l in xc_aligned.values():
            l.sort(key=lambda p: p.yc)
        _assign_alignment_features(yc_aligned, "Y_")
        _assign_alignment_features(x0_aligned, "LEFT_")
        _assign_alignment_features(x1_aligned, "RIGHT_")
        _assign_alignment_features(xc_aligned, "CENTER_")
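# The alignment pass above buckets sentences by shared coordinates. A minimal
# stdlib-only illustration of the same bucketing, with (x0, yc) tuples standing
# in for sentences:
from collections import defaultdict

spans = [(0, 5), (0, 15), (10, 5)]
x0_aligned = defaultdict(list)
yc_aligned = defaultdict(list)
for x0, yc in spans:
    x0_aligned[x0].append((x0, yc))
    yc_aligned[yc].append((x0, yc))

print(x0_aligned[0])  # [(0, 5), (0, 15)] -- share a left edge
print(yc_aligned[5])  # [(0, 5), (10, 5)] -- share a vertical center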
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    xml_content = subprocess.check_output(
        f"pdf2txt.py -t xml -M 3 -m 5 -A '{fp}'", shell=True
    )
    soup = BeautifulSoup(xml_content, "lxml")
    all_xml_elements = soup.find_all("pages")
    if len(all_xml_elements) != 1:
        raise NotImplementedError(f"unsupported format file: {file_name}")
    text = all_xml_elements[0]
    tree = etree.fromstring(str(text))
    try:
        tree = analysis(tree)
    except Exception:
        # Fall back to the unanalyzed tree if the analysis step fails.
        pass
    name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
    stable_id = self._get_stable_id(name)
    yield Document(
        name=name,
        stable_id=stable_id,
        text=etree.tostring(tree),
        meta={"file_name": file_name},
    )
def test_parser_skips_and_flattens():
    """Test if ``Parser`` skips/flattens elements."""
    parser_udf = get_parser_udf()

    # Test if a parser skips comments
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "<html><body>Hello!<!-- comment --></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser skips blacklisted elements
    doc = Document(id=2, name="test2", stable_id="2::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser flattens elements
    doc = Document(id=3, name="test3", stable_id="3::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello, world!"

    # Now with different blacklist and flatten
    parser_udf = get_parser_udf(blacklist=["meta"], flatten=["word"])

    # Test if a parser does not skip a non-blacklisted element
    doc = Document(id=4, name="test4", stable_id="4::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "alert('Hello');"
    assert doc.sentences[1].text == "Hello!"

    # Test if a parser skips blacklisted elements
    doc = Document(id=5, name="test5", stable_id="5::document:0:0")
    doc.text = "<html><head><meta name='keywords'></head><body>Hello!</body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser does not flatten elements
    doc = Document(id=6, name="test6", stable_id="6::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello,"
    assert doc.sentences[1].text == "world!"

    # Test if a parser flattens elements
    doc = Document(id=7, name="test7", stable_id="7::document:0:0")
    doc.text = "<html><body><word>Hello, </word><word>world!</word></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello, world!"
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    # Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check
    def get_prop(node: Tag, name: str) -> Optional[str]:
        title = node["title"]
        if not title:
            return None
        props = title.split(";")
        for prop in props:
            (key, args) = prop.split(None, 1)
            if key == name:
                return args
        return None

    # Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check
    def get_bbox(node: Tag) -> Optional[Tuple[str, ...]]:
        bbox = get_prop(node, "bbox")
        if not bbox:
            return None
        return tuple([x for x in bbox.split()])

    with codecs.open(fp, encoding=self.encoding) as f:
        soup = BeautifulSoup(f, "lxml")
        all_html_elements = soup.find_all("html")
        if len(all_html_elements) != 1:
            raise NotImplementedError(
                f"Expecting exactly one html element per html file: {file_name}"
            )
        root = all_html_elements[0]
        capabilities = root.find("meta", attrs={"name": "ocr-capabilities"})
        if capabilities is None:
            raise RuntimeError(
                "The input hOCR does not contain ocr-capabilities metadata."
            )

        # Unwrap ocr_line/ocrx_line as Fonduer has no data model for lines.
        if "ocr_line" in capabilities["content"]:
            for line in root.find_all(class_="ocr_line"):
                line.unwrap()
        if "ocrx_line" in capabilities["content"]:
            for line in root.find_all(class_="ocrx_line"):
                line.unwrap()

        if "ocrx_word" in capabilities["content"]:
            for p, page in enumerate(root.find_all(class_="ocr_page")):
                ppageno = str(p)  # 0-based
                for word in page.find_all(class_="ocrx_word"):
                    parent = word.parent
                    (left, top, right, bottom) = get_bbox(word)
                    # ocrx_word could have multiple words with one or more
                    # spaces in-between. This actually happens on Tesseract
                    # 4.00. This is normalized by splitting and concatenating
                    # later.
                    tokens = word.text.split()
                    if "left" not in parent.attrs:
                        parent["left"] = []
                        parent["top"] = []
                        parent["right"] = []
                        parent["bottom"] = []
                        parent["ppageno"] = []
                        parent["tokens"] = []
                    parent["left"] += [left] * len(tokens)
                    parent["top"] += [top] * len(tokens)
                    parent["right"] += [right] * len(tokens)
                    parent["bottom"] += [bottom] * len(tokens)
                    parent["ppageno"] += [ppageno] * len(tokens)
                    parent["tokens"] += tokens

                    if "ocrp_wconf" in capabilities["content"]:
                        x_wconf = get_prop(word, "x_wconf")
                        if "x_wconf" not in parent.attrs:
                            parent["x_wconf"] = []
                        parent["x_wconf"].append(x_wconf)

                    # Mark the parent element
                    if "fonduer" not in parent.attrs:
                        parent["fonduer"] = ["1"]

                    # Concat words again with " " or "".
                    if len(tokens) > 1:
                        if self.space:
                            word.string.replace_with(" ".join(tokens))
                        else:
                            word.string.replace_with("".join(tokens))
                    word.unwrap()

        # Clean-up
        for parent in root.find_all(attrs={"fonduer": "1"}):
            # Concat consecutive NavigableString
            parent.smooth()  # beautifulsoup4 >= 4.8.0
            # Remove linebreaks and excess spaces,
            # in reverse order b/c removing element from list in loop
            for child in reversed(parent.contents):
                if isinstance(child, Comment):  # remove comments
                    child.extract()
                elif isinstance(child, NavigableString):
                    if child.strip() == "":  # remove if space or linebreak
                        child.extract()
                    else:
                        tmp = re.sub(r"[\n\s]+", " " if self.space else "", child)
                        n = NavigableString(tmp.strip())
                        child.replace_with(n)
            del parent["fonduer"]

    name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
    stable_id = self._get_stable_id(name)
    yield Document(
        name=name,
        stable_id=stable_id,
        text=str(root),
        meta={"file_name": file_name},
    )
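# The get_prop/get_bbox helpers above parse hOCR "title" attributes of the
# form 'bbox 100 200 300 400; x_wconf 95'. A standalone sketch of that parsing
# on a toy fragment (assumes beautifulsoup4 is installed):
from bs4 import BeautifulSoup

fragment = '<span class="ocrx_word" title="bbox 100 200 300 400; x_wconf 95">word</span>'
node = BeautifulSoup(fragment, "html.parser").find(class_="ocrx_word")
for prop in node["title"].split(";"):
    key, args = prop.split(None, 1)
    print(key, "->", args)
# bbox -> 100 200 300 400
# x_wconf -> 95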
def parse(self, document: Document, text: str) -> Iterator[Sentence]:
    """Depth-first search over the provided tree.

    Implemented as an iterative procedure. The structure of the state
    needed to parse each node is also defined in this function.

    :param document: the Document context
    :param text: the structured text of the document (e.g. HTML)
    :return: a *generator* of Sentences.
    """
    stack = []
    root = lxml.html.fromstring(text)

    # flattens children of node that are in the 'flatten' list
    if self.flatten:
        lxml.etree.strip_tags(root, self.flatten)

    # Strip comments
    lxml.etree.strip_tags(root, lxml.etree.Comment)

    # Assign the text, which was stripped of the 'flatten'-tags, to the document
    document.text = lxml.etree.tostring(root, encoding="unicode")

    # This dictionary contains the global state necessary to parse a
    # document and each context element. This reflects the relationships
    # defined in parser/models. This contains the state necessary to create
    # the respective Contexts within the document.
    state = {
        "visited": set(),
        "parent": {},  # map of parent[child] = node used to discover child
        "context": {},  # track the Context of each node (context['td'] = Cell)
        "root": root,
        "document": document,
        "section": {"idx": 0},
        "paragraph": {"idx": 0},
        "figure": {"idx": 0},
        "caption": {"idx": 0},
        "table": {"idx": 0},
        "sentence": {"idx": 0, "abs_offset": 0},
    }
    # NOTE: Currently the helper functions directly manipulate the state
    # rather than returning a modified copy.

    # Iterative Depth-First Search
    stack.append(root)
    state["parent"][root] = document
    state["context"][root] = document

    tokenized_sentences: List[Sentence] = []
    while stack:
        node = stack.pop()
        if node not in state["visited"]:
            state["visited"].add(node)  # mark as visited

            # Process
            if self.lingual:
                tokenized_sentences += [y for y in self._parse_node(node, state)]
            else:
                yield from self._parse_node(node, state)

            # NOTE: This reversed() order is to ensure that the iterative
            # DFS matches the order that would be produced by a recursive
            # DFS implementation.
            for child in reversed(node):
                # Skip nodes that are blacklisted
                if self.blacklist and child.tag in self.blacklist:
                    continue

                stack.append(child)

                # store the parent of the node, which is either the parent
                # Context, or if the parent did not create a Context, then
                # use the node's parent Context.
                state["parent"][child] = (
                    state["context"][node]
                    if node in state["context"]
                    else state["parent"][node]
                )

    if self.lingual:
        yield from self.lingual_parser.enrich_sentences_with_NLP(
            tokenized_sentences
        )
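# The reversed() push order above makes the stack-based DFS visit nodes in the
# same order as a recursive pre-order DFS. A standalone demonstration on a
# tiny tree (assumes lxml is installed):
import lxml.html

root = lxml.html.fromstring("<div><a>1</a><b>2</b><c>3</c></div>")
stack, order = [root], []
while stack:
    node = stack.pop()
    order.append(node.tag)
    for child in reversed(node):  # push right-to-left so the left child pops first
        stack.append(child)

print(order)  # ['div', 'a', 'b', 'c'] -- matches recursive pre-order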
def test_ner_matchers():
    """Test different ner type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join(
        [
            "Tim Cook was born in USA in 1960.",
            "He is the CEO of Apple.",
            "He sold 100 million of iPhone.",
        ]
    )
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)

    # Manually attach ner_tags as the result from spacy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON", "PERSON", "O", "O", "O", "GPE", "O", "DATE", "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = ["O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"]

    # The length of words and that of ner_tags should match.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"iPhone"}