def test_split_sentences_by_char_limit(caplog):
    """Test splitting sentences into batches under a character limit."""
    caplog.set_level(logging.INFO)

    lingual_parser = Spacy("en")
    lingual_parser.load_lang_model()

    text = "This is a text. This is another text."
    all_sentences = [
        Sentence(**parts) for parts in lingual_parser.split_sentences(text)
    ]
    assert len(all_sentences) == 2
    assert [len(sentence.text) for sentence in all_sentences] == [15, 21]

    lingual_parser.model.remove_pipe("sentencizer")
    lingual_parser.model.add_pipe(
        set_custom_boundary, before="parser", name="sentence_boundary_detector"
    )

    sentence_batches = lingual_parser._split_sentences_by_char_limit(all_sentences, 20)
    assert len(sentence_batches) == 2
    sentence_batches = lingual_parser._split_sentences_by_char_limit(all_sentences, 100)
    assert len(sentence_batches) == 1

    sentence_batch = sentence_batches[0]
    custom_tokenizer = TokenPreservingTokenizer(lingual_parser.model.vocab)
    doc = custom_tokenizer(sentence_batch)
    doc.user_data = sentence_batch
    for name, proc in lingual_parser.model.pipeline:  # iterate over components in order
        doc = proc(doc)
    assert doc.is_parsed
    # See if the number of parsed spaCy sentences matches that of input sentences
    assert len(list(doc.sents)) == len(sentence_batch)
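
# A hedged, pure-Python illustration of the batching behavior the two
# _split_sentences_by_char_limit assertions above encode: sentences are
# grouped greedily so each batch's total text length stays within the
# character limit. batch_by_char_limit is a hypothetical stand-in, not
# Fonduer's implementation.
def batch_by_char_limit(sentence_lengths, char_limit):
    batches, current, used = [], [], 0
    for n in sentence_lengths:
        # Start a new batch when adding this sentence would exceed the limit.
        if current and used + n > char_limit:
            batches.append(current)
            current, used = [], 0
        current.append(n)
        used += n
    if current:
        batches.append(current)
    return batches


# Mirrors the assertions above for sentence lengths [15, 21].
assert len(batch_by_char_limit([15, 21], 20)) == 2
assert len(batch_by_char_limit([15, 21], 100)) == 1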
def doc_setup():
    """Set up a document with a single sentence."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
def bbox_from_sentence(sentence: Sentence) -> Bbox:
    # TODO: this may have issues where a sentence is linked to words on different pages
    if isinstance(sentence, Sentence) and sentence.is_visual():
        return Bbox(
            sentence.page[0],
            min(sentence.top),
            max(sentence.bottom),
            min(sentence.left),
            max(sentence.right),
        )
    else:
        return None
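
# A minimal usage sketch for bbox_from_sentence (not from the original source).
# It assumes Sentence.is_visual() reports True once the coordinate lists are
# populated, mirroring the attribute setup used in doc_setup() below, and that
# Bbox exposes page/top/bottom/left/right fields in the order used above.
def demo_bbox_from_sentence():
    sent = Sentence()
    sent.page = [1, 1]
    sent.top = [0, 5]
    sent.bottom = [10, 12]
    sent.left = [0, 20]
    sent.right = [15, 40]
    bbox = bbox_from_sentence(sent)
    # The box is the union of the word boxes, on the first word's page.
    assert (bbox.page, bbox.top, bbox.bottom, bbox.left, bbox.right) == (1, 0, 12, 0, 40)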
def doc_setup():
    """Set up document."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple. That is orange. Where is banana? I like Apple."
    lingual_parser = SpacyParser("en")
    # Split sentences
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Enrich sentences
    for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences):
        pass

    # Pick one sentence and add visual information
    # so that all the words get aligned horizontally.
    sentence: Sentence = doc.sentences[0]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 2nd sentence is horizontally aligned with the 1st.
    sentence: Sentence = doc.sentences[1]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [40, 50, 60, 70]
    sentence.right = [50, 60, 70, 80]

    # Assume the 3rd sentence is vertically aligned with the 1st.
    sentence: Sentence = doc.sentences[2]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [10, 10, 10, 10]
    sentence.bottom = [20, 20, 20, 20]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 4th sentence is on the 2nd page.
    sentence: Sentence = doc.sentences[3]
    sentence.page = [2, 2, 2, 2]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]
    return doc
def test_span_char_start_and_char_end():
    """Test char_start and char_end of a TemporarySpan that comes from Ngrams.apply."""
    ngrams = Ngrams()
    sent = Sentence()
    sent.text = "BC548BG"
    sent.words = ["BC548BG"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 1
    assert result[0].get_span() == "BC548BG"
    assert result[0].char_start == 0
    # char_end is inclusive: char_start + len("BC548BG") - 1
    assert result[0].char_end == 6
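
# A minimal sketch (not from the original suite) of the same offset convention
# on a two-word sentence, assuming the manual Sentence setup above also works
# for multi-word inputs. Only the offset arithmetic is checked, not the order
# in which Ngrams yields spans.
def demo_char_offsets_two_words():
    ngrams = Ngrams()
    sent = Sentence()
    sent.text = "BC548BG rev2"
    sent.words = ["BC548BG", "rev2"]
    sent.char_offsets = [0, 8]
    sent.abs_char_offsets = [0, 8]
    offsets = {(r.char_start, r.char_end) for r in ngrams.apply(sent)}
    assert (0, 6) in offsets   # "BC548BG": char_end = 0 + 7 - 1
    assert (8, 11) in offsets  # "rev2":    char_end = 8 + 4 - 1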
def _parse_sentence(self, paragraph, node, state):
    """Parse the Sentences of the node.

    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    # Set name for Sentence
    name = node.attrib["name"] if "name" in node.attrib else None
    # Lingual Parse
    document = state["document"]
    for sentence in node:
        parts = defaultdict(list)
        parts["document"] = document
        flag = 0
        text = ""
        words = []
        char_abs_offsets = []
        start = 0
        for i, word in enumerate(sentence):
            w = ""
            for char in word:
                if "bbox" in char.attrib.keys():
                    flag = 1
                w += char.text
            words.append(w)
            char_abs_offsets.append(start)
            start += 1 + len(word)
            text += re.sub(r"\s+", " ", w)
            text += " "
        if not flag:
            continue
        if text.isspace():
            continue
        if not any(p and p[0].isalnum() for p in words):
            continue
        if not text:
            continue
        for i, word in enumerate(sentence):
            parts["words"].append(words[i].replace(" ", "_"))
            parts["lemmas"].append(words[i].replace(" ", "_"))
            parts["ner_tags"].append("")  # placeholder for later NLP parsing
            parts["char_offsets"].append(char_abs_offsets[i])
            # parts["abs_char_offsets"].append(char_abs_offsets[i])
            parts["dep_parents"].append(0)  # placeholder for later NLP parsing
            parts["dep_labels"].append("")  # placeholder for later NLP parsing
        parts["text"], parts["pos_tags"] = self.lingual_parser.tagger(text[:-1])
        abs_offset = state["sentence"]["abs_offset"]
        parts["abs_char_offsets"] = [
            char_offset + abs_offset for char_offset in parts["char_offsets"]
        ]
        parts["position"] = state["sentence"]["idx"]

        if self.tabular:
            parts["position"] = state["sentence"]["idx"]
            # If tabular, consider own Context first in case a Cell
            # was just created. Otherwise, defer to the parent.
            parent = paragraph
            if isinstance(parent, Paragraph):
                parts["section"] = parent.section
                parts["paragraph"] = parent
            else:
                raise NotImplementedError("Sentence parent must be Paragraph.")

        if self.structural:
            context_node = sentence
            tree = lxml.etree.ElementTree(state["root"])
            parts["xpath"] = tree.getpath(context_node)
            parts["html_tag"] = context_node.tag
            parts["html_attrs"] = []
            temp_attrs = []
            for word in sentence:
                if len(word) == 0:
                    continue
                t = ""
                for k, v in word[0].attrib.items():
                    if k != "bbox":
                        v = v.replace(" ", "")
                        t = t + k + "=" + v + " "
                t = t[:-1]
                temp_attrs.append(t)
            for temp_attr in temp_attrs:
                parts["html_attrs"].append(temp_attr)

        if self.visual:
            page = []
            top = []
            left = []
            right = []
            bottom = []
            p = int(node.getparent().get("id"))
            bbox = node.getparent().get("bbox")
            bbox = bbox.split(",")
            height = int(round(float(bbox[3])))
            # Hack to skip sentences with erroneous coordinates.
            try:
                for word in sentence:
                    if len(word) == 0:
                        continue
                    coord_f = word[0].attrib["bbox"]  # bbox of the first character of the word
                    coord_l = word[-1].attrib["bbox"]  # bbox of the last character
                    coord_f = coord_f.split(",")
                    coord_l = coord_l.split(",")
                    page.append(p)
                    left.append(int(round(float(coord_f[0]))))
                    bottom.append(height - int(round(float(coord_f[1]))))
                    right.append(int(round(float(coord_l[2]))))
                    if height > int(round(float(coord_f[3]))):
                        top.append(height - int(round(float(coord_f[3]))))
                    else:
                        top.append(0)
                parts["page"] = page
                parts["left"] = left
                parts["top"] = top
                parts["right"] = right
                parts["bottom"] = bottom
            except Exception as e:
                print(e)
                print(document, "\n", text)
                continue

        abs_sentence_offset_end = (
            state["sentence"]["abs_offset"]
            + parts["char_offsets"][-1]
            + len(parts["words"][-1])
        )
        parts["stable_id"] = construct_stable_id(
            document,
            "sentence",
            state["sentence"]["abs_offset"],
            abs_sentence_offset_end,
        )
        state["sentence"]["idx"] += 1
        state["sentence"]["abs_offset"] = abs_sentence_offset_end
        parts["name"] = name
        yield Sentence(**parts)
def test_ngram_split(caplog):
    """Test ngram split."""
    caplog.set_level(logging.INFO)
    ngrams = Ngrams(split_tokens=["-", "/"])

    sent = Sentence()
    # When a split_token appears in the middle of the text.
    sent.text = "New-Text"
    sent.words = ["New-Text"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))
    assert len(result) == 3
    assert result[0].get_span() == "New-Text"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text"

    # When a text ends with a split_token.
    sent.text = "New-"
    sent.words = ["New-"]
    result = list(ngrams.apply(sent))
    assert len(result) == 2
    assert result[0].get_span() == "New-"
    assert result[1].get_span() == "New"

    # When a text starts with a split_token.
    sent.text = "-Text"
    sent.words = ["-Text"]
    result = list(ngrams.apply(sent))
    assert len(result) == 2
    assert result[0].get_span() == "-Text"
    assert result[1].get_span() == "Text"

    # When more than one split_token appears.
    sent.text = "New/Text-Word"
    sent.words = ["New/Text-Word"]
    result = list(ngrams.apply(sent))
    assert len(result) == 6
    spans = [r.get_span() for r in result]
    assert "New/Text-Word" in spans
    assert "New" in spans
    assert "New/Text" in spans
    assert "Text" in spans
    assert "Text-Word" in spans
    assert "Word" in spans

    sent.text = "A-B/C-D"
    sent.words = ["A-B/C-D"]
    result = list(ngrams.apply(sent))
    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "A-B/C-D" in spans
    assert "A-B/C" in spans
    assert "B/C-D" in spans
    assert "A-B" in spans
    assert "C-D" in spans
    assert "B/C" in spans
    assert "A" in spans
    assert "B" in spans
    assert "C" in spans
    assert "D" in spans

    ngrams = Ngrams(split_tokens=["~", "~~"])
    sent = Sentence()
    sent.text = "a~b~~c~d"
    sent.words = ["a~b~~c~d"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))
    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "a~b~~c~d" in spans
    assert "a" in spans
    assert "a~b" in spans
    assert "a~b~~c" in spans
    assert "b" in spans
    assert "b~~c" in spans
    assert "b~~c~d" in spans
    assert "c" in spans
    assert "c~d" in spans
    assert "d" in spans

    ngrams = Ngrams(split_tokens=["~a", "a~"])
    sent = Sentence()
    sent.text = "~a~b~~c~d"
    sent.words = ["~a~b~~c~d"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))
    assert len(result) == 2
    spans = [r.get_span() for r in result]
    assert "~a~b~~c~d" in spans
    assert "~b~~c~d" in spans

    ngrams = Ngrams(split_tokens=["-", "/", "*"])
    sent = Sentence()
    sent.text = "A-B/C*D"
    sent.words = ["A-B/C*D"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))
    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "A-B/C*D" in spans
    assert "A" in spans
    assert "A-B" in spans
    assert "A-B/C" in spans
    assert "B" in spans
    assert "B/C" in spans
    assert "B/C*D" in spans
    assert "C" in spans
    assert "C*D" in spans
    assert "D" in spans
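
# A hedged, pure-Python illustration (not Fonduer's Ngrams implementation) of
# the span inventory the split-token tests above expect: a word expands into
# every contiguous run of its pieces, joined by the original separators, with
# empty pieces dropped. expected_split_spans is a hypothetical reference
# function written only to make that expectation explicit.
import re


def expected_split_spans(word, split_tokens):
    # Longer split tokens first, so "~~" is not consumed as two "~".
    pattern = "|".join(
        re.escape(t) for t in sorted(split_tokens, key=len, reverse=True)
    )
    pieces = re.split("(" + pattern + ")", word)
    tokens, seps = pieces[0::2], pieces[1::2]
    spans = set()
    for i in range(len(tokens)):
        span = tokens[i]
        if span:
            spans.add(span)
        for k in range(i + 1, len(tokens)):
            span += seps[k - 1] + tokens[k]
            spans.add(span)
    return spans


assert expected_split_spans("A-B/C-D", ["-", "/"]) == {
    "A", "B", "C", "D", "A-B", "B/C", "C-D", "A-B/C", "B/C-D", "A-B/C-D"
}
assert expected_split_spans("a~b~~c~d", ["~", "~~"]) == {
    "a", "b", "c", "d", "a~b", "b~~c", "c~d", "a~b~~c", "b~~c~d", "a~b~~c~d"
}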
def _parse_sentence(self, paragraph, node, state):
    """Parse the Sentences of the node.

    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    text = state["paragraph"]["text"]
    field = state["paragraph"]["field"]

    # Lingual Parse
    document = state["document"]
    for parts in self.lingual_parse(document, text):
        (_, _, _, char_end) = split_stable_id(parts["stable_id"])
        parts["document"] = document
        parts["position"] = state["sentence"]["idx"]
        abs_sentence_offset_end = (
            state["sentence"]["abs_offset"]
            + parts["char_offsets"][-1]
            + len(parts["words"][-1])
        )
        parts["stable_id"] = construct_stable_id(
            document,
            "sentence",
            state["sentence"]["abs_offset"],
            abs_sentence_offset_end,
        )
        state["sentence"]["abs_offset"] = abs_sentence_offset_end

        if self.structural:
            context_node = node.getparent() if field == "tail" else node
            tree = lxml.etree.ElementTree(state["root"])
            parts["xpath"] = tree.getpath(context_node)
            parts["html_tag"] = context_node.tag
            parts["html_attrs"] = [
                "=".join(x) for x in list(context_node.attrib.items())
            ]

            # Extending the html style attribute with the styles
            # from the inline style class for the element.
            cur_style_index = None
            for index, attr in enumerate(parts["html_attrs"]):
                if attr.find("style") >= 0:
                    cur_style_index = index
                    break
            styles = state["root"].find("head").find("style")
            if styles is not None:
                for x in list(context_node.attrib.items()):
                    if x[0] == "class":
                        # Match a ".classname { ... }" block; the dot is
                        # escaped so it matches literally.
                        exp = r"(\." + x[1] + r")([\n\s\r]*)\{(.*?)\}"
                        r = re.compile(exp, re.DOTALL)
                        if r.search(styles.text) is not None:
                            if cur_style_index is not None:
                                parts["html_attrs"][cur_style_index] += (
                                    r.search(styles.text)
                                    .group(3)
                                    .replace("\r", "")
                                    .replace("\n", "")
                                    .replace("\t", "")
                                )
                            else:
                                parts["html_attrs"].extend(
                                    [
                                        "style="
                                        + re.sub(
                                            r"\s{1,}",
                                            " ",
                                            r.search(styles.text)
                                            .group(3)
                                            .replace("\r", "")
                                            .replace("\n", "")
                                            .replace("\t", "")
                                            .strip(),
                                        )
                                    ]
                                )
                        break

        if self.tabular:
            parts["position"] = state["sentence"]["idx"]

            # If tabular, consider own Context first in case a Cell
            # was just created. Otherwise, defer to the parent.
            parent = paragraph
            if isinstance(parent, Paragraph):
                parts["section"] = parent.section
                parts["paragraph"] = parent
                if parent.cell:
                    parts["table"] = parent.cell.table
                    parts["cell"] = parent.cell
                    parts["row_start"] = parent.cell.row_start
                    parts["row_end"] = parent.cell.row_end
                    parts["col_start"] = parent.cell.col_start
                    parts["col_end"] = parent.cell.col_end
            else:
                raise NotImplementedError("Sentence parent must be Paragraph.")

        yield Sentence(**parts)

        state["sentence"]["idx"] += 1
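
# A hedged sketch of the class-to-style lookup performed above. The helper
# lookup_class_style is hypothetical; it isolates the ".classname { ... }"
# regex so its behavior can be seen on a tiny <style> text.
import re


def lookup_class_style(styles_text, class_name):
    r = re.compile(
        r"(\." + re.escape(class_name) + r")([\n\s\r]*)\{(.*?)\}", re.DOTALL
    )
    m = r.search(styles_text)
    return m.group(3).strip() if m else None


assert lookup_class_style(".big { font-size: 20px; }", "big") == "font-size: 20px;"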
def test_ner_matchers():
    """Test different NER type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join(
        [
            "Tim Cook was born in USA in 1960.",
            "He is the CEO of Apple.",
            "He sold 100 million of iPhone.",
        ]
    )
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)

    # Manually attach ner_tags as the result from spaCy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON", "PERSON", "O", "O", "O", "GPE", "O", "DATE", "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = ["O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"]
    # The length of words and that of ner_tags should match.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"iPhone"}
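
# A hedged, pure-Python illustration of the selection rule the assertions
# above encode: an n-gram qualifies for a type matcher only if every one of
# its tokens carries the target NER tag. ngrams_with_tag is a hypothetical
# reference function, not Fonduer's matcher implementation; the assertions
# above further imply that only the longest match is kept, which is why
# "Tim Cook" survives but "Tim" alone does not.
def ngrams_with_tag(words, ner_tags, tag, n_max=2):
    out = set()
    for n in range(1, n_max + 1):
        for i in range(len(words) - n + 1):
            if all(t == tag for t in ner_tags[i : i + n]):
                out.add(" ".join(words[i : i + n]))
    return out


words = "Tim Cook was born in USA in 1960 .".split()
tags = ["PERSON", "PERSON", "O", "O", "O", "GPE", "O", "DATE", "O"]
assert "Tim Cook" in ngrams_with_tag(words, tags, "PERSON")
assert ngrams_with_tag(words, tags, "GPE") == {"USA"}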
def test_ngram_split(caplog):
    """Test ngram split."""
    caplog.set_level(logging.INFO)
    ngrams = Ngrams()

    sent = Sentence()
    # When a split_token appears in the middle of the text.
    sent.text = "New-Text"
    sent.words = ["New-Text"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))
    assert len(result) == 3
    assert result[0].get_span() == "New-Text"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text"

    # When a text ends with a split_token.
    sent.text = "New-"
    sent.words = ["New-"]
    result = list(ngrams.apply(sent))
    assert len(result) == 2
    assert result[0].get_span() == "New-"
    assert result[1].get_span() == "New"

    # When a text starts with a split_token.
    sent.text = "-Text"
    sent.words = ["-Text"]
    result = list(ngrams.apply(sent))
    assert len(result) == 2
    assert result[0].get_span() == "-Text"
    assert result[1].get_span() == "Text"

    # When more than one split_token appears.
    sent.text = "New/Text-Word"
    sent.words = ["New/Text-Word"]
    result = list(ngrams.apply(sent))
    assert len(result) == 3
    assert result[0].get_span() == "New/Text-Word"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text-Word"