def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower, from_phrase):
    """Yield ngrams from table Phrases that are visually aligned with the given span(s).

    Only Phrases of the table containing each span are examined, and spans
    whose sentence is not both tabular and visual are skipped entirely.

    :param direction: 'vert' selects ``bbox_vert_aligned`` as the alignment
        test; any other value selects ``bbox_horz_aligned``.
    :param c: A TemporarySpan, or an object whose ``get_contexts()`` returns
        the spans to examine.
    :param attrib: The token attribute type (e.g. words, lemmas, poses); only
        read when ``from_phrase`` is truthy.
    :param n_min: The minimum n of the ngrams; only applied when
        ``from_phrase`` is truthy.
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :param from_phrase: If truthy, alignment is tested on the bounding box of
        each whole Phrase and all ngrams of every aligned Phrase (except the
        span's own sentence) are yielded. Otherwise alignment is tested for
        each ngram span produced by ``Ngrams.apply`` and the aligned ones are
        yielded, except those from the span's own sentence whose text is
        contained in the span's text.
    :rtype: a _generator_ of ngrams
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == 'vert':
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    ngram_matcher = Ngrams(n_max=n_max, split_tokens=[])

    if isinstance(c, TemporarySpan):
        targets = [c]
    else:
        targets = c.get_contexts()

    for target in targets:
        sentence = target.sentence
        # Bounding boxes only exist for sentences that are tabular and visual.
        if not (sentence.is_tabular() and sentence.is_visual()):
            continue
        for phrase in sentence.table.phrases:
            if from_phrase:
                if not is_aligned(bbox_from_phrase(phrase), bbox_from_span(target)):
                    continue
                if phrase is sentence:
                    # Never report the span's own sentence as its neighbor.
                    continue
                for ngram in tokens_to_ngrams(getattr(phrase, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram
            else:
                for ts in ngram_matcher.apply(phrase):
                    if not is_aligned(bbox_from_span(ts), bbox_from_span(target)):
                        continue
                    text = ts.get_span()
                    # Skip ngrams that are part of the span itself.
                    if phrase == sentence and text in target.get_span():
                        continue
                    yield text.lower() if lower else text
def get_right_ngrams(span, window=3, attrib='words', n_min=1, n_max=1, lower=True):
    """Get the ngrams within a window to the _right_ of the Candidate from its sentence Context.

    For higher-arity Candidates, defaults to the _last_ argument.

    :param span: The Span to evaluate. If a candidate is given, default to its last Span.
    :param window: The number of tokens to the right of the last argument to return
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    if not isinstance(span, TemporarySpan):
        span = span[-1]  # higher-arity input: fall back to its last Span

    # The window opens on the token immediately after the span's last word.
    first = span.get_word_end() + 1
    tokens = getattr(span.sentence, attrib)
    right_tokens = tokens[first:first + window]

    for ngram in tokens_to_ngrams(right_tokens, n_min=n_min, n_max=n_max, lower=lower):
        yield ngram
def get_cell_ngrams(span, attrib='words', n_min=1, n_max=1, lower=True):
    """Get the ngrams that are in the Cell of the given span, not including itself.

    For every span, the ngrams from ``get_phrase_ngrams`` are yielded first.
    If the span's sentence is a Phrase that belongs to a Cell, the ngrams of
    every other Phrase in that Cell follow.

    Note that if a candidate is passed in, all of its Spans will be searched.

    :param span: The span whose Cell is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    if isinstance(span, TemporarySpan):
        targets = [span]
    else:
        targets = span.get_contexts()

    for target in targets:
        for ngram in get_phrase_ngrams(target, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower):
            yield ngram

        sentence = target.sentence
        if not isinstance(sentence, Phrase) or sentence.cell is None:
            continue

        # One ngram iterable per sibling Phrase, built before anything is yielded.
        sibling_ngrams = [
            tokens_to_ngrams(getattr(phrase, attrib), n_min=n_min, n_max=n_max, lower=lower)
            for phrase in sentence.cell.phrases
            if phrase != sentence
        ]
        for ngrams in sibling_ngrams:
            for ngram in ngrams:
                yield ngram
def get_neighbor_phrase_ngrams(span, d=1, attrib='words', n_min=1, n_max=1, lower=True):
    """Get the ngrams that are in the neighboring Phrases of the given Span.

    A Phrase of the same document is a neighbor when its ``phrase_num``
    differs from that of the span's sentence by at most ``d``; the span's own
    sentence is excluded.

    Note that if a candidate is passed in, all of its Spans will be searched.

    :param span: The span whose neighbor Phrases are being searched
    :param d: The maximum difference in ``phrase_num`` for a Phrase to count as a neighbor
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    if isinstance(span, TemporarySpan):
        targets = [span]
    else:
        targets = span.get_contexts()

    for target in targets:
        sentence = target.sentence
        # One ngram iterable per neighboring Phrase, built before anything is yielded.
        neighbor_ngrams = [
            tokens_to_ngrams(getattr(phrase, attrib), n_min=n_min, n_max=n_max, lower=lower)
            for phrase in sentence.document.phrases
            if abs(phrase.phrase_num - sentence.phrase_num) <= d and phrase != sentence
        ]
        for ngrams in neighbor_ngrams:
            for ngram in ngrams:
                yield ngram
def _get_axis_ngrams(span, axis, attrib='words', n_min=1, n_max=1, spread=None, lower=True):
    """Yield the ngrams of the span's own Phrase, then those of Phrases aligned with it along an axis.

    The ngrams from ``get_phrase_ngrams`` are always yielded first. If the
    span's sentence belongs to a Cell, the ngrams of every Phrase returned by
    ``_get_aligned_phrases`` for the given axis follow.

    :param span: The span whose aligned Phrases are being searched
    :param axis: The axis handed to ``_get_aligned_phrases``
        (presumably 'row' or 'col' -- confirm against that helper)
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded unchanged to ``_get_aligned_phrases``; defaults
        to ``[0, 0]`` when omitted or None
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    # A fresh list per call: a list literal in the signature would be one
    # shared object that any callee could mutate for all later calls.
    if spread is None:
        spread = [0, 0]

    for ngram in get_phrase_ngrams(span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower):
        yield ngram

    if span.sentence.cell is not None:
        for phrase in _get_aligned_phrases(span.sentence, axis, spread=spread):
            for ngram in tokens_to_ngrams(getattr(phrase, attrib), n_min=n_min, n_max=n_max, lower=lower):
                yield ngram
def get_head_ngrams(span, axis=None, attrib='words', n_min=1, n_max=1, lower=True):
    """Get the ngrams from the cell in the head of the row or column.

    More specifically, this returns the ngrams in the leftmost cell in a row and/or the
    ngrams in the topmost cell in the column, depending on the axis parameter.

    Note that if a candidate is passed in, all of its Spans will be searched.
    A Span that is not inside a Cell contributes nothing, and an axis on which
    the Span's sentence already starts at index 0 of the other axis is skipped;
    in both cases the remaining axes and Spans are still searched.

    :param span: The span whose head Cells are being returned
    :param axis: Which axis {'row', 'col'} to search. If None, then both row and col are searched.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    spans = [span] if isinstance(span, TemporarySpan) else span.get_contexts()
    axes = [axis] if axis else ['row', 'col']
    for current in spans:
        sentence = current.sentence
        if not sentence.cell:
            # `continue`, not `return`: a bare return would end the generator
            # and drop every Span that follows this one.
            continue
        for head_axis in axes:
            if getattr(sentence, _other_axis(head_axis) + '_start') == 0:
                # The sentence starts at index 0 on the other axis, so there is
                # no separate head cell on this axis. Only this axis is skipped;
                # the other axis and the remaining Spans are still searched.
                continue
            head_cell = _get_head_cell(sentence.cell, head_axis)
            # Tolerate a head-cell result without a `phrases` attribute.
            for phrase in getattr(head_cell, 'phrases', []):
                for ngram in tokens_to_ngrams(getattr(phrase, attrib), n_min=n_min, n_max=n_max, lower=lower):
                    yield ngram
def get_neighbor_cell_ngrams(span, dist=1, directions=False, attrib='words', n_min=1, n_max=1, lower=True):
    """Get the ngrams from all Cells that are within a given Cell distance in one direction from the given Span.

    Note that if a candidate is passed in, all of its Spans will be searched.
    If ``directions=True``, each neighbor ngram will be returned with a direction in
    {'UP', 'DOWN', 'LEFT', 'RIGHT'}: a positive row difference is labelled 'UP', a
    negative one 'DOWN', a positive column difference 'RIGHT' and a negative one 'LEFT'.

    The ngrams of the span's own Phrase (from ``get_phrase_ngrams``) are yielded first
    and are always bare ngrams, even when ``directions=True``.

    :param span: The span whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be considered
    :param directions: A Boolean expressing whether or not to return the direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    if isinstance(span, TemporarySpan):
        targets = [span]
    else:
        targets = span.get_contexts()

    for target in targets:
        for ngram in get_phrase_ngrams(target, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower):
            yield ngram

        sentence = target.sentence
        if not isinstance(sentence, Phrase) or sentence.cell is None:
            continue

        root_cell = sentence.cell
        aligned_phrases = chain.from_iterable([
            _get_aligned_phrases(root_cell, 'row'),
            _get_aligned_phrases(root_cell, 'col'),
        ])
        for phrase in aligned_phrases:
            row_diff = min_row_diff(phrase, root_cell, absolute=False)
            col_diff = min_col_diff(phrase, root_cell, absolute=False)

            # Keep only phrases offset along exactly one axis ...
            on_single_axis = (row_diff or col_diff) and not (row_diff and col_diff)
            if not on_single_axis:
                continue
            # ... and no further away than `dist` cells.
            if abs(row_diff) + abs(col_diff) > dist:
                continue

            if not directions:
                for ngram in tokens_to_ngrams(getattr(phrase, attrib), n_min=n_min, n_max=n_max, lower=lower):
                    yield ngram
                continue

            direction = ''
            if col_diff == 0:
                if 0 < row_diff <= dist:
                    direction = "UP"
                elif -dist <= row_diff < 0:
                    direction = "DOWN"
            elif row_diff == 0:
                if 0 < col_diff <= dist:
                    direction = "RIGHT"
                elif -dist <= col_diff < 0:
                    direction = "LEFT"
            for ngram in tokens_to_ngrams(getattr(phrase, attrib), n_min=n_min, n_max=n_max, lower=lower):
                yield (ngram, direction)