def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower, from_sentence):
    """Yield n-grams from table sentences that are aligned with the spans of ``c``.

    For every span obtained from ``_to_spans(c)`` whose sentence is both
    tabular and visual, the sentences of that sentence's table are scanned.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: The object handed to ``_to_spans`` to obtain the spans.
    :param attrib: Name of the sentence attribute whose tokens are turned
        into n-grams (only used when ``from_sentence`` is true).
    :param n_min: Minimum n-gram length (only used when ``from_sentence`` is
        true).
    :param n_max: Maximum n-gram length.
    :param lower: If true, yielded n-grams are lowercased.
    :param from_sentence: If true, alignment is tested per sentence and the
        n-grams of each aligned *other* sentence are yielded. If false,
        alignment is tested per candidate n-gram span of each sentence.
    :return: A generator of n-gram strings.
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    word_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    for span in _to_spans(c):
        home = span.sentence
        # Only spans sitting in a table with visual information are usable.
        if not (home.is_tabular() and home.is_visual()):
            continue

        for sentence in home.table.sentences:
            if from_sentence:
                # Sentence-level alignment; the span's own sentence is excluded.
                if not is_aligned(bbox_from_sentence(sentence), bbox_from_span(span)):
                    continue
                if sentence is span.sentence:
                    continue
                yield from tokens_to_ngrams(
                    getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
                )
            else:
                # N-gram-level alignment; skip n-grams of the span's own
                # sentence whose text is contained in the span's text.
                for ts in word_ngrams.apply(sentence):
                    if not is_aligned(bbox_from_span(ts), bbox_from_span(span)):
                        continue
                    if sentence == span.sentence and ts.get_span() in span.get_span():
                        continue
                    text = ts.get_span()
                    yield text.lower() if lower else text
def test_ngram_split(caplog):
    """Check the spans produced by a default ``Ngrams`` for single-word sentences."""
    caplog.set_level(logging.INFO)
    ngrams = Ngrams()

    # One sentence object is reused; only its text/words change per case.
    sent = Sentence()
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]

    cases = [
        # When a split_token appears in the middle of the text.
        ("New-Text", ["New-Text", "New", "Text"]),
        # When a text ends with a split_token.
        ("New-", ["New-", "New"]),
        # When a text starts with a split_token.
        ("-Text", ["-Text", "Text"]),
        # When more than one split_token appears.
        ("New/Text-Word", ["New/Text-Word", "New", "Text-Word"]),
    ]

    for text, expected in cases:
        sent.text = text
        sent.words = [text]
        produced = [ts.get_span() for ts in ngrams.apply(sent)]
        # Exact count and exact order are both required.
        assert produced == expected
def test_span_char_start_and_char_end():
    """Test char_start and char_end of the span that comes from Ngrams.apply."""
    ngrams = Ngrams()

    sent = Sentence()
    sent.text = "BC548BG"
    sent.words = ["BC548BG"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]

    spans = list(ngrams.apply(sent))
    assert len(spans) == 1

    # The single span covers the whole 7-character word: offsets 0 through 6.
    only = spans[0]
    assert only.get_span() == "BC548BG"
    assert (only.char_start, only.char_end) == (0, 6)
def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    """Yield n-grams from same-page sentences aligned with the spans of ``c``.

    For every span obtained from ``_to_spans(c)`` whose sentence is visual,
    all sentences of that sentence's document are scanned; sentences whose
    bounding box is on a different page than the span's sentence are skipped.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: The object handed to ``_to_spans`` to obtain the spans.
    :param attrib: Name of the sentence attribute whose tokens are turned
        into n-grams (only used when ``from_sentence`` is true).
    :param n_min: Minimum n-gram length (only used when ``from_sentence`` is
        true).
    :param n_max: Maximum n-gram length.
    :param lower: If true, yielded n-grams are lowercased.
    :param from_sentence: If true, alignment is tested per sentence and the
        n-grams of each aligned *other* sentence are yielded. If false,
        alignment is tested per candidate n-gram span of each sentence.
    :return: A generator of n-gram strings.
    """
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    word_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    for span in _to_spans(c):
        home = span.sentence
        if not home.is_visual():
            continue

        for sentence in home.document.sentences:
            # Skip if not in the same page.
            if home.get_bbox().page != sentence.get_bbox().page:
                continue

            if from_sentence:
                if not is_aligned(sentence.get_bbox(), span.get_bbox()):
                    continue
                # Not from its own Sentence.
                if sentence is span.sentence:
                    continue
                yield from tokens_to_ngrams(
                    getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
                )
            else:
                for ts in word_ngrams.apply(sentence):
                    # Must be visually aligned ...
                    if not is_aligned(ts.get_bbox(), span.get_bbox()):
                        continue
                    # ... AND not from itself (neither contains the other).
                    if ts in span or span in ts:
                        continue
                    text = ts.get_span()
                    yield text.lower() if lower else text
def test_ngram_split(caplog):
    """Check the spans produced by ``Ngrams`` for various ``split_tokens`` settings."""
    caplog.set_level(logging.INFO)

    def new_sentence():
        # A sentence holding a single word that starts at offset 0.
        fresh = Sentence()
        fresh.char_offsets = [0]
        fresh.abs_char_offsets = [0]
        return fresh

    def spans_for(extractor, sentence, text):
        # Load `text` as the sentence's only word and collect the span strings.
        sentence.text = text
        sentence.words = [text]
        return [ts.get_span() for ts in extractor.apply(sentence)]

    def check_unordered(produced, expected):
        # Same number of spans, and every expected span is present.
        assert len(produced) == len(expected)
        for item in expected:
            assert item in produced

    ngrams = Ngrams(split_tokens=["-", "/"])
    sent = new_sentence()

    # Cases where both the count and the order of the spans are pinned.
    ordered_cases = [
        # When a split_token appears in the middle of the text.
        ("New-Text", ["New-Text", "New", "Text"]),
        # When a text ends with a split_token.
        ("New-", ["New-", "New"]),
        # When a text starts with a split_token.
        ("-Text", ["-Text", "Text"]),
    ]
    for text, expected in ordered_cases:
        assert spans_for(ngrams, sent, text) == expected

    # When more than one split_token appears (order is not pinned).
    check_unordered(
        spans_for(ngrams, sent, "New/Text-Word"),
        ["New/Text-Word", "New", "New/Text", "Text", "Text-Word", "Word"],
    )
    check_unordered(
        spans_for(ngrams, sent, "A-B/C-D"),
        ["A-B/C-D", "A-B/C", "B/C-D", "A-B", "C-D", "B/C", "A", "B", "C", "D"],
    )

    # Remaining cases each use their own Ngrams and their own Sentence.
    independent_cases = [
        (
            ["~", "~~"],
            "a~b~~c~d",
            [
                "a~b~~c~d",
                "a",
                "a~b",
                "a~b~~c",
                "b",
                "b~~c",
                "b~~c~d",
                "c",
                "c~d",
                "d",
            ],
        ),
        (
            ["~a", "a~"],
            "~a~b~~c~d",
            ["~a~b~~c~d", "~b~~c~d"],
        ),
        (
            ["-", "/", "*"],
            "A-B/C*D",
            ["A-B/C*D", "A", "A-B", "A-B/C", "B", "B/C", "B/C*D", "C", "C*D", "D"],
        ),
    ]
    for tokens, text, expected in independent_cases:
        ngrams = Ngrams(split_tokens=tokens)
        sent = new_sentence()
        check_unordered(spans_for(ngrams, sent, text), expected)