Python tokens_to_ngrams Exemples, fonduer.snorkel.utils.tokens_to_ngrams Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : lf_helpers.py Projet : WeijieChen2017/fonduer_pottery

def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_phrase):
    """Yield ngrams from Phrases of the same table that are visually aligned with ``c``.

    Only spans whose sentence is both tabular and visual are considered; every
    Phrase in such a span's table is compared to the span by bounding box.

    :param direction: 'vert' selects ``bbox_vert_aligned``; any other value
        selects ``bbox_horz_aligned``
    :param c: A TemporarySpan, or an object exposing ``get_contexts()`` whose
        spans are all searched
    :param attrib: The token attribute read from aligned Phrases (only used
        when ``from_phrase`` is truthy)
    :param n_min: The minimum n of the ngrams (only used when ``from_phrase``
        is truthy)
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :param from_phrase: If truthy, whole Phrases are aligned against the span
        and their ngrams yielded; otherwise each span produced by ``Ngrams``
        is aligned individually and its text yielded
    :rtype: a _generator_ of ngrams
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == 'vert':
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    candidate_space = Ngrams(n_max=n_max, split_tokens=[])
    if isinstance(c, TemporarySpan):
        spans = [c]
    else:
        spans = c.get_contexts()
    for span in spans:
        if not (span.sentence.is_tabular() and span.sentence.is_visual()):
            continue
        for phrase in span.sentence.table.phrases:
            if from_phrase:
                # Phrase-level alignment: skip unaligned Phrases and the
                # span's own sentence.
                if not is_aligned(bbox_from_phrase(phrase),
                                  bbox_from_span(span)):
                    continue
                if phrase is span.sentence:
                    continue
                for ngram in tokens_to_ngrams(getattr(phrase, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram
                continue
            # Span-level alignment: test every candidate ngram span of the
            # Phrase on its own bounding box.
            for ts in candidate_space.apply(phrase):
                if not is_aligned(bbox_from_span(ts), bbox_from_span(span)):
                    continue
                # Drop text that is contained in the span itself.
                if (phrase == span.sentence
                        and ts.get_span() in span.get_span()):
                    continue
                text = ts.get_span()
                yield text.lower() if lower else text

Exemple #2

0

Afficher le fichier

Fichier : lf_helpers.py Projet : WeijieChen2017/fonduer_pottery

def get_right_ngrams(span,
                     window=3,
                     attrib='words',
                     n_min=1,
                     n_max=1,
                     lower=True):
    """Get the ngrams within a window to the _right_ of the Candidate from its sentence Context.

    For higher-arity Candidates, defaults to the _last_ argument.

    :param span: The Span to evaluate. If a candidate is given, default to its last Span.
    :param window: The number of tokens to the right of the last argument to return
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    if not isinstance(span, TemporarySpan):
        span = span[-1]  # a Candidate was given: use its last Span
    # The window starts at the token right after the span's last word.
    start = span.get_word_end() + 1
    window_tokens = getattr(span.sentence, attrib)[start:start + window]
    for ngram in tokens_to_ngrams(window_tokens,
                                  n_min=n_min,
                                  n_max=n_max,
                                  lower=lower):
        yield ngram

Exemple #3

0

Afficher le fichier

Fichier : lf_helpers.py Projet : WeijieChen2017/fonduer_pottery

def get_cell_ngrams(span, attrib='words', n_min=1, n_max=1, lower=True):
    """Get the ngrams that are in the Cell of the given span, not including itself.

    For each span, the ngrams produced by ``get_phrase_ngrams`` are yielded
    first, followed by the ngrams of every other Phrase in the same Cell.

    Note that if a candidate is passed in, all of its Spans will be searched.

    :param span: The span whose Cell is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    if isinstance(span, TemporarySpan):
        spans = [span]
    else:
        spans = span.get_contexts()
    ngram_options = dict(n_min=n_min, n_max=n_max, lower=lower)
    for current in spans:
        for ngram in get_phrase_ngrams(current, attrib=attrib,
                                       **ngram_options):
            yield ngram
        sentence = current.sentence
        if not isinstance(sentence, Phrase) or sentence.cell is None:
            continue
        # Collect one ngram stream per sibling Phrase up front, then drain
        # them in order.
        sibling_streams = []
        for phrase in sentence.cell.phrases:
            if phrase != sentence:
                sibling_streams.append(
                    tokens_to_ngrams(getattr(phrase, attrib),
                                     **ngram_options))
        for stream in sibling_streams:
            for ngram in stream:
                yield ngram

Exemple #4

0

Afficher le fichier

Fichier : lf_helpers.py Projet : WeijieChen2017/fonduer_pottery

def get_neighbor_phrase_ngrams(span,
                               d=1,
                               attrib='words',
                               n_min=1,
                               n_max=1,
                               lower=True):
    """Get the ngrams that are in the neighboring Phrases of the given Span.

    A Phrase of the same document is a neighbor when its ``phrase_num``
    differs from that of the span's Phrase by at most ``d``; the span's own
    Phrase is excluded.

    Note that if a candidate is passed in, all of its Spans will be searched.

    :param span: The span whose neighbor Phrases are being searched
    :param d: The maximum absolute difference in ``phrase_num`` for a Phrase
        to count as a neighbor
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    if isinstance(span, TemporarySpan):
        spans = [span]
    else:
        spans = span.get_contexts()
    for current in spans:
        sentence = current.sentence
        # Collect one ngram stream per neighboring Phrase up front, then
        # drain them in document order.
        neighbor_streams = []
        for phrase in sentence.document.phrases:
            offset = abs(phrase.phrase_num - sentence.phrase_num)
            if offset <= d and phrase != sentence:
                neighbor_streams.append(
                    tokens_to_ngrams(getattr(phrase, attrib),
                                     n_min=n_min,
                                     n_max=n_max,
                                     lower=lower))
        for stream in neighbor_streams:
            for ngram in stream:
                yield ngram

Exemple #5

0

Afficher le fichier

Fichier : lf_helpers.py Projet : WeijieChen2017/fonduer_pottery

def _get_axis_ngrams(span,
                     axis,
                     attrib='words',
                     n_min=1,
                     n_max=1,
                     spread=None,
                     lower=True):
    """Yield the span's phrase ngrams, then ngrams of Phrases aligned along ``axis``.

    The ngrams produced by ``get_phrase_ngrams`` for the span are yielded
    first. If the span's sentence has a cell, the ngrams of every Phrase
    returned by ``_get_aligned_phrases`` for that sentence follow.

    :param span: The span whose aligned Phrases are being searched
    :param axis: The axis forwarded to ``_get_aligned_phrases``
        (presumably 'row' or 'col' -- confirm against that helper)
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded unchanged to ``_get_aligned_phrases``; defaults
        to a fresh ``[0, 0]`` on every call when omitted or None
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    # A list literal as the default would be one object shared by every call;
    # build a new one per call so the helper can never leak state across calls.
    if spread is None:
        spread = [0, 0]
    for ngram in get_phrase_ngrams(span,
                                   attrib=attrib,
                                   n_min=n_min,
                                   n_max=n_max,
                                   lower=lower):
        yield ngram
    if span.sentence.cell is not None:
        for phrase in _get_aligned_phrases(span.sentence, axis, spread=spread):
            for ngram in tokens_to_ngrams(getattr(phrase, attrib),
                                          n_min=n_min,
                                          n_max=n_max,
                                          lower=lower):
                yield ngram

Exemple #6

0

Afficher le fichier

Fichier : lf_helpers.py Projet : WeijieChen2017/fonduer_pottery

def get_head_ngrams(span,
                    axis=None,
                    attrib='words',
                    n_min=1,
                    n_max=1,
                    lower=True):
    """Get the ngrams from the cell in the head of the row or column.

    More specifically, this returns the ngrams in the leftmost cell in a row and/or the
    ngrams in the topmost cell in the column, depending on the axis parameter.

    Note that if a candidate is passed in, all of its Spans will be searched.
    A span that is not in a cell, or that already sits at position 0 of the
    other axis, is skipped for that lookup only; remaining axes and spans are
    still processed.

    :param span: The span whose head Cells are being returned
    :param axis: Which axis {'row', 'col'} to search. If None, then both row and col are searched.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams
    """
    spans = [span] if isinstance(span, TemporarySpan) else span.get_contexts()
    axes = [axis] if axis else ['row', 'col']
    for current in spans:
        if not current.sentence.cell:
            # No cell for this span. Use `continue`, not `return`: a return
            # would end the generator and silently drop every later span.
            continue
        for head_axis in axes:
            # A start index of 0 on the other axis presumably means the span
            # is itself in the head cell for this axis, so there is nothing
            # to report. Only this axis is skipped; the other axis and the
            # remaining spans must still be searched.
            if getattr(current.sentence,
                       _other_axis(head_axis) + '_start') == 0:
                continue
            head_cell = _get_head_cell(current.sentence.cell, head_axis)
            # The default [] covers a head cell object without `phrases`.
            for phrase in getattr(head_cell, 'phrases', []):
                for ngram in tokens_to_ngrams(getattr(phrase, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram

Exemple #7

0

Afficher le fichier

Fichier : lf_helpers.py Projet : WeijieChen2017/fonduer_pottery

def get_neighbor_cell_ngrams(span,
                             dist=1,
                             directions=False,
                             attrib='words',
                             n_min=1,
                             n_max=1,
                             lower=True):
    """Get the ngrams from all Cells that are within a given Cell distance in one direction from the given Span.

    Note that if a candidate is passed in, all of its Spans will be searched.
    If ``directions=True``, each neighbor ngram will be returned with a
    direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}. The ngrams produced by
    ``get_phrase_ngrams`` for the span itself are yielded first and are always
    bare ngrams, without a direction.

    :param span: The span whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be considered
    :param directions: A Boolean expressing whether or not to return the direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    if isinstance(span, TemporarySpan):
        spans = [span]
    else:
        spans = span.get_contexts()
    ngram_options = dict(n_min=n_min, n_max=n_max, lower=lower)
    for current in spans:
        for ngram in get_phrase_ngrams(current, attrib=attrib,
                                       **ngram_options):
            yield ngram
        if not isinstance(current.sentence, Phrase):
            continue
        if current.sentence.cell is None:
            continue
        root_cell = current.sentence.cell
        row_aligned = _get_aligned_phrases(root_cell, 'row')
        col_aligned = _get_aligned_phrases(root_cell, 'col')
        for phrase in chain(row_aligned, col_aligned):
            row_diff = min_row_diff(phrase, root_cell, absolute=False)
            col_diff = min_col_diff(phrase, root_cell, absolute=False)
            # Keep only phrases displaced along exactly one axis ...
            if not (row_diff or col_diff):
                continue
            if row_diff and col_diff:
                continue
            # ... and no further away than `dist`.
            if not (abs(row_diff) + abs(col_diff) <= dist):
                continue
            if directions:
                # The sign of the single non-zero diff picks the label.
                direction = ''
                if col_diff == 0:
                    if 0 < row_diff <= dist:
                        direction = "UP"
                    elif 0 > row_diff >= -dist:
                        direction = "DOWN"
                elif row_diff == 0:
                    if 0 < col_diff <= dist:
                        direction = "RIGHT"
                    elif 0 > col_diff >= -dist:
                        direction = "LEFT"
            for ngram in tokens_to_ngrams(getattr(phrase, attrib),
                                          **ngram_options):
                if directions:
                    yield (ngram, direction)
                else:
                    yield ngram