Ejemplo n.º 1
0
def _get_axis_ngrams(mention,
                     axis,
                     attrib="words",
                     n_min=1,
                     n_max=1,
                     spread=[0, 0],
                     lower=True):
    """Yield ngrams for a mention's Sentence and for its axis-aligned Sentences.

    The mention is first resolved to a single span with ``_to_span``. If the
    span's Sentence is not tabular, a single ``None`` is yielded and the
    generator ends. Otherwise the output of ``get_sentence_ngrams`` for the
    span is yielded, followed by the ngrams that ``tokens_to_ngrams`` builds
    from each Sentence returned by ``_get_aligned_sentences``.

    :param mention: The object to resolve to a span via ``_to_span``
    :param axis: Passed through to ``_get_aligned_sentences``
    :param attrib: Name of the Sentence attribute holding the tokens
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Passed through to ``_get_aligned_sentences``
    :param lower: Passed through to the ngram helpers
    :rtype: a *generator* of ngrams (or a single ``None`` if not tabular)
    """
    span = _to_span(mention)

    # Non-tabular mentions produce exactly one ``None`` and nothing else.
    if not span.sentence.is_tabular():
        yield None
        return

    ngram_kwargs = dict(n_min=n_min, n_max=n_max, lower=lower)

    yield from get_sentence_ngrams(span, attrib=attrib, **ngram_kwargs)

    # Past the guard above the Sentence is known to be tabular.
    for aligned in _get_aligned_sentences(span.sentence, axis, spread=spread):
        yield from tokens_to_ngrams(getattr(aligned, attrib), **ngram_kwargs)
Ejemplo n.º 2
0
def get_cell_ngrams(mention, attrib="words", n_min=1, n_max=1, lower=True):
    """Yield the ngrams found in the Cell that holds the given mention.

    For every span obtained from ``_to_spans(mention)`` the output of
    ``get_sentence_ngrams`` is yielded first. When the span's Sentence is
    tabular, the ngrams of every *other* Sentence stored under the same cell
    (looked up through ``_get_table_cells``) are yielded afterwards.

    :param mention: The Mention whose Cell is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    ngram_kwargs = dict(n_min=n_min, n_max=n_max, lower=lower)

    for span in _to_spans(mention):
        yield from get_sentence_ngrams(span, attrib=attrib, **ngram_kwargs)

        if not span.sentence.is_tabular():
            continue

        # One ngram generator per sibling Sentence of the same cell.
        siblings = _get_table_cells(span.sentence.table)[span.sentence.cell]
        per_sentence = [
            tokens_to_ngrams(getattr(sentence, attrib), **ngram_kwargs)
            for sentence in siblings
            if sentence != span.sentence
        ]
        for ngrams in per_sentence:
            yield from ngrams
Ejemplo n.º 3
0
def get_word_feats(span):
    """Yield the word-based unary features of a span, caching them per span.

    Features are computed once per ``span.stable_id`` and stored in the
    module-level ``unary_word_feats`` cache; later calls replay the cached
    set. Three feature families are produced, all on the ``words`` attribute:

    * ``CONTAINS_WORDS_[ngram]`` for the 1- and 2-grams of the span's tokens,
    * ``LEFT_WORDS_[ngram]`` for the ngrams returned by ``get_left_ngrams``,
    * ``RIGHT_WORDS_[ngram]`` for the ngrams returned by ``get_right_ngrams``.

    The left/right window size is read from
    ``settings["featurization"]["content"]["word_feature"]["window"]``.

    :param span: The span to featurize; must expose ``stable_id`` and
        ``get_attrib_tokens``
    :rtype: a *generator* of feature strings
    """
    attrib = "words"
    key = span.stable_id

    if key not in unary_word_feats:
        label = attrib.upper()
        window = settings["featurization"]["content"]["word_feature"]["window"]
        feats = set()

        for ngram in tokens_to_ngrams(span.get_attrib_tokens(attrib),
                                      n_min=1,
                                      n_max=2):
            feats.add(f"CONTAINS_{label}_[{ngram}]")

        for ngram in get_left_ngrams(span,
                                     window=window,
                                     n_max=2,
                                     attrib=attrib):
            feats.add(f"LEFT_{label}_[{ngram}]")

        for ngram in get_right_ngrams(span,
                                      window=window,
                                      n_max=2,
                                      attrib=attrib):
            feats.add(f"RIGHT_{label}_[{ngram}]")

        # Features belong in the per-span set, not on the cache object itself.
        # Store the set only once it is complete, so a failure part-way
        # through cannot leave a truncated feature set cached for this span.
        unary_word_feats[key] = feats

    yield from unary_word_feats[key]
Ejemplo n.º 4
0
def get_neighbor_sentence_ngrams(mention,
                                 d=1,
                                 attrib="words",
                                 n_min=1,
                                 n_max=1,
                                 lower=True):
    """Get the ngrams that are in the neighboring Sentences of the given Mention.

    A Sentence of the same document counts as a neighbor when its ``position``
    differs from the position of the mention's own Sentence by at most ``d``
    and it is not that Sentence itself.

    Note that if a candidate is passed in, all of its Mentions will be searched.

    :param mention: The Mention whose neighbor Sentences are being searched
    :param d: The maximum difference in Sentence position for a neighbor
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    for span in _to_spans(mention):
        # One ngram generator per neighboring Sentence, in document order.
        per_neighbor = [
            tokens_to_ngrams(getattr(sentence, attrib),
                             n_min=n_min,
                             n_max=n_max,
                             lower=lower)
            for sentence in span.sentence.document.sentences
            if abs(sentence.position - span.sentence.position) <= d
            and sentence != span.sentence
        ]
        for ngrams in per_neighbor:
            yield from ngrams
Ejemplo n.º 5
0
def _get_axis_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    axis: str,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    spread: List[int] = [0, 0],
    lower: bool = True,
) -> Iterator[str]:
    """Yield ngrams for a mention's Sentence and for its axis-aligned Sentences.

    The mention is resolved to a single span with ``_to_span``. Nothing is
    yielded when the span's Sentence is not tabular. Otherwise the output of
    ``get_sentence_ngrams`` for the span comes first, followed by the ngrams
    built from each Sentence returned by ``_get_aligned_sentences``.

    :param mention: The object to resolve to a span via ``_to_span``
    :param axis: Passed through to ``_get_aligned_sentences``
    :param attrib: Name of the Sentence attribute holding the tokens
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Passed through to ``_get_aligned_sentences``
    :param lower: Passed through to the ngram helpers
    """
    span = _to_span(mention)

    # The yields below already make this a generator, so a bare return
    # is enough to produce an empty result for non-tabular mentions.
    if not span.sentence.is_tabular():
        return

    ngram_kwargs = dict(n_min=n_min, n_max=n_max, lower=lower)

    yield from get_sentence_ngrams(span, attrib=attrib, **ngram_kwargs)

    for aligned in _get_aligned_sentences(span.sentence, axis, spread=spread):
        yield from tokens_to_ngrams(getattr(aligned, attrib), **ngram_kwargs)
Ejemplo n.º 6
0
def get_left_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    window: int = 3,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Get the ngrams within a window to the *left* from the sentence Context.

    The ngrams are built from at most ``window`` tokens of the Sentence that
    end right before the span's word start index.

    For higher-arity Candidates, defaults to the *first* argument.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    :param window: The number of tokens to the left of the first argument to
        return.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    span = _to_span(mention)
    start = span.get_word_start_index()
    tokens = getattr(span.sentence, attrib)
    # Clamp at 0 so a window larger than the prefix does not wrap around.
    left_context = tokens[max(0, start - window):start]
    yield from tokens_to_ngrams(left_context,
                                n_min=n_min,
                                n_max=n_max,
                                lower=lower)
0
def get_head_ngrams(mention, axis=None, attrib="words", n_min=1, n_max=1, lower=True):
    """Get the ngrams from the cell in the head of the row or column.

    More specifically, this returns the ngrams in the leftmost cell in a row and/or the
    ngrams in the topmost cell in the column, depending on the axis parameter.

    Note that if a candidate is passed in, all of its Mentions will be searched.
    Mentions whose Sentence is not tabular contribute nothing. An axis is
    skipped for a mention whose start index on the *other* axis is 0; the
    remaining axes and mentions are still processed.

    :param mention: The Mention whose head Cells are being returned
    :param axis: Which axis {'row', 'col'} to search. If None, then both row
        and col are searched.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    axes = (axis,) if axis else ("row", "col")
    for span in _to_spans(mention):
        if not span.sentence.is_tabular():
            continue
        # A distinct loop name keeps the ``axis`` parameter intact.
        for current_axis in axes:
            # Skip only this axis. A ``return`` here would end the whole
            # generator, silently dropping the other axis and every
            # remaining span of a candidate.
            if getattr(span.sentence, _other_axis(current_axis) + "_start") == 0:
                continue
            head_cell = _get_head_cell(span.sentence.cell, current_axis)
            # The default covers a head cell without a ``sentences`` attribute.
            for sentence in getattr(head_cell, "sentences", []):
                yield from tokens_to_ngrams(
                    getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
                )
Ejemplo n.º 8
0
def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_sentence):
    """Yield ngrams from table Sentences whose bounding box aligns with a span.

    Only spans whose Sentence is both tabular and visual are considered, and
    only Sentences of the same table are scanned.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``
    :param c: The object to resolve to spans via ``_to_spans``
    :param attrib: Name of the Sentence attribute holding the tokens (used
        only when ``from_sentence`` is true)
    :param n_min: The minimum n of the ngrams (used only when
        ``from_sentence`` is true)
    :param n_max: The maximum n of the ngrams
    :param lower: If True, all ngrams will be returned in lower case
    :param from_sentence: If true, alignment is tested per Sentence and all of
        an aligned Sentence's ngrams are yielded; otherwise alignment is
        tested per ngram span produced by ``Ngrams.apply``
    :rtype: a *generator* of ngrams
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    ngrams_space = Ngrams(n_max=n_max, split_tokens=[])

    def _case(text):
        return text.lower() if lower else text

    for span in _to_spans(c):
        if not (span.sentence.is_tabular() and span.sentence.is_visual()):
            continue
        for sentence in span.sentence.table.sentences:
            if from_sentence:
                # Whole-Sentence mode: aligned Sentences other than the
                # span's own contribute all of their ngrams.
                if (is_aligned(bbox_from_sentence(sentence),
                               bbox_from_span(span))
                        and sentence is not span.sentence):
                    yield from tokens_to_ngrams(getattr(sentence, attrib),
                                                n_min=n_min,
                                                n_max=n_max,
                                                lower=lower)
                continue

            # Per-ngram mode: test each candidate ngram span on its own.
            for ts in ngrams_space.apply(sentence):
                if not is_aligned(bbox_from_span(ts), bbox_from_span(span)):
                    continue
                # Drop ngrams of the span's own Sentence whose text occurs
                # inside the span's text.
                if (sentence == span.sentence
                        and ts.get_span() in span.get_span()):
                    continue
                yield _case(ts.get_span())
Ejemplo n.º 9
0
def get_right_ngrams(mention,
                     window=3,
                     attrib="words",
                     n_min=1,
                     n_max=1,
                     lower=True):
    """Get the ngrams within a window to the *right* from the sentence Context.

    The ngrams are built from at most ``window`` tokens of the Sentence that
    start right after the span's word end index.

    For higher-arity Candidates, defaults to the *last* argument.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    :param window: The number of tokens to the right of the last argument to
        return
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    span = _to_span(mention, idx=-1)
    first_after = span.get_word_end() + 1
    tokens = getattr(span.sentence, attrib)
    right_context = tokens[first_after:first_after + window]
    yield from tokens_to_ngrams(right_context,
                                n_min=n_min,
                                n_max=n_max,
                                lower=lower)
Ejemplo n.º 10
0
def _get_word_feats(span: SpanMention) -> Iterator[str]:
    """Yield the word-based unary features of a span, caching them per span.

    Features are computed once per ``span.stable_id`` and kept in the
    module-level ``unary_word_feats`` mapping; later calls replay the cached
    set. The set contains:

    * ``CONTAINS_WORDS_[ngram]`` for the 1- and 2-grams of the span's tokens,
    * ``LEFT_WORDS_[ngram]`` / ``RIGHT_WORDS_[ngram]`` for the ngrams returned
      by ``get_left_ngrams`` / ``get_right_ngrams``, with the window taken
      from ``settings["featurization"]["textual"]["word_feature"]["window"]``,
    * ``SPAN_TYPE_[IMPLICIT]`` or ``SPAN_TYPE_[EXPLICIT]``,
    * ``STARTS_WITH_CAPITAL`` when the first character of the span text is
      upper case,
    * ``LENGTH_<n>`` with the span's number of words.

    :param span: The span to featurize
    """
    attrib = "words"
    key = span.stable_id

    if key not in unary_word_feats:
        # Register the set first and fill it in place afterwards.
        feats = set()
        unary_word_feats[key] = feats
        label = attrib.upper()

        own_tokens = span.get_attrib_tokens(attrib)
        for ngram in tokens_to_ngrams(own_tokens, n_min=1, n_max=2):
            feats.add(f"CONTAINS_{label}_[{ngram}]")

        window = settings["featurization"]["textual"]["word_feature"]["window"]

        for ngram in get_left_ngrams(span,
                                     window=window,
                                     n_max=2,
                                     attrib=attrib):
            feats.add(f"LEFT_{label}_[{ngram}]")

        for ngram in get_right_ngrams(span,
                                      window=window,
                                      n_max=2,
                                      attrib=attrib):
            feats.add(f"RIGHT_{label}_[{ngram}]")

        if isinstance(span, ImplicitSpanMention):
            span_kind = "IMPLICIT"
        else:
            span_kind = "EXPLICIT"
        feats.add(f"SPAN_TYPE_[{span_kind}]")

        if span.get_span()[0].isupper():
            feats.add("STARTS_WITH_CAPITAL")

        feats.add(f"LENGTH_{span.get_num_words()}")

    yield from unary_word_feats[key]
Ejemplo n.º 11
0
def get_cell_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Get the ngrams that are in the Cell of the given mention, not including itself.

    For each tabular span, the output of ``get_sentence_ngrams`` is yielded
    first, followed once by the ngrams of every other Sentence stored under
    the same cell.

    Note that if a candidate is passed in, all of its Mentions will be searched.
    Also note that if the mention is not tabular, nothing will be yielded.

    :param mention: The Mention whose Cell is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    for span in _to_spans(mention):
        if not span.sentence.is_tabular():
            continue

        yield from get_sentence_ngrams(span,
                                       attrib=attrib,
                                       n_min=n_min,
                                       n_max=n_max,
                                       lower=lower)

        # This loop must be a sibling of the one above, not nested in it.
        # Nested, the cell ngrams were repeated once per sentence ngram and
        # were lost entirely when the sentence produced no ngrams.
        siblings = _get_table_cells(span.sentence.table)[span.sentence.cell]
        yield from chain.from_iterable([
            tokens_to_ngrams(getattr(sentence, attrib),
                             n_min=n_min,
                             n_max=n_max,
                             lower=lower)
            for sentence in siblings
            if sentence != span.sentence
        ])
Ejemplo n.º 12
0
def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    """Yield ngrams from same-page Sentences whose bounding box aligns with a span.

    Only spans whose Sentence is visual are considered. Every Sentence of the
    span's document is scanned, but Sentences on a different page than the
    span's Sentence are skipped.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``
    :param c: The object to resolve to spans via ``_to_spans``
    :param attrib: Name of the Sentence attribute holding the tokens (used
        only when ``from_sentence`` is true)
    :param n_min: The minimum n of the ngrams (used only when
        ``from_sentence`` is true)
    :param n_max: The maximum n of the ngrams
    :param lower: If True, all ngrams will be returned in lower case
    :param from_sentence: If true, alignment is tested per Sentence and all of
        an aligned Sentence's ngrams are yielded; otherwise alignment is
        tested per ngram span produced by ``Ngrams.apply``
    """
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    ngrams_space = Ngrams(n_max=n_max, split_tokens=[])

    def _case(text):
        return text.lower() if lower else text

    for span in _to_spans(c):
        if not span.sentence.is_visual():
            continue
        for sentence in span.sentence.document.sentences:
            # Skip if not in the same page.
            if span.sentence.get_bbox().page != sentence.get_bbox().page:
                continue

            if from_sentence:
                # Whole-Sentence mode: aligned, and not the span's own Sentence.
                if (is_aligned(sentence.get_bbox(), span.get_bbox())
                        and sentence is not span.sentence):
                    yield from tokens_to_ngrams(getattr(sentence, attrib),
                                                n_min=n_min,
                                                n_max=n_max,
                                                lower=lower)
                continue

            # Per-ngram mode: keep ngram spans that are visually aligned and
            # neither contain nor are contained in the span itself.
            for ts in ngrams_space.apply(sentence):
                if not is_aligned(ts.get_bbox(), span.get_bbox()):
                    continue
                if ts in span or span in ts:
                    continue
                yield _case(ts.get_span())
Ejemplo n.º 13
0
def get_neighbor_cell_ngrams(mention,
                             dist=1,
                             directions=False,
                             attrib="words",
                             n_min=1,
                             n_max=1,
                             lower=True):
    """
    Get the ngrams from all Cells that are within a given Cell distance in one
    direction from the given Mention.

    For every span the output of ``get_sentence_ngrams`` is yielded first (as
    plain ngrams, regardless of ``directions``). For tabular spans, the row-
    and column-aligned Sentences of the span's cell are then scanned; a
    Sentence is kept when exactly one of its row/column differences to the
    cell is non-zero and the sum of their absolute values is at most ``dist``.

    Note that if a candidate is passed in, all of its Mentions will be
    searched. If `directions=True``, each ngram will be returned with a
    direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    ngram_kwargs = dict(n_min=n_min, n_max=n_max, lower=lower)

    for span in _to_spans(mention):
        yield from get_sentence_ngrams(span, attrib=attrib, **ngram_kwargs)

        if not span.sentence.is_tabular():
            continue

        root_cell = span.sentence.cell
        aligned = chain(
            _get_aligned_sentences(root_cell, "row"),
            _get_aligned_sentences(root_cell, "col"),
        )
        for sentence in aligned:
            row_diff = min_row_diff(sentence, root_cell, absolute=False)
            col_diff = min_col_diff(sentence, root_cell, absolute=False)

            # Keep only straight-line neighbors: exactly one non-zero
            # difference, and within the allowed distance.
            on_one_axis = bool(row_diff) != bool(col_diff)
            if not (on_one_axis and abs(row_diff) + abs(col_diff) <= dist):
                continue

            if not directions:
                yield from tokens_to_ngrams(getattr(sentence, attrib),
                                            **ngram_kwargs)
                continue

            direction = ""
            if col_diff == 0:
                if 0 < row_diff <= dist:
                    direction = "UP"
                elif 0 > row_diff >= -dist:
                    direction = "DOWN"
            elif row_diff == 0:
                if 0 < col_diff <= dist:
                    direction = "RIGHT"
                elif 0 > col_diff >= -dist:
                    direction = "LEFT"

            for ngram in tokens_to_ngrams(getattr(sentence, attrib),
                                          **ngram_kwargs):
                yield (ngram, direction)
Ejemplo n.º 14
0
def get_neighbor_cell_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    dist: int = 1,
    directions: bool = False,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[Union[str, Tuple[str, str]]]:
    """Get ngrams from all neighbor Cells.

    Get the ngrams from all Cells that are within a given Cell distance in one
    direction from the given Mention.

    For every tabular span the output of ``get_sentence_ngrams`` is yielded
    first (as plain ngrams, regardless of ``directions``). The row- and
    column-aligned Sentences of the span's cell are then scanned once; a
    Sentence is kept when exactly one of its row/column differences to the
    cell is non-zero and the sum of their absolute values is at most ``dist``.

    Note that if a candidate is passed in, all of its Mentions will be
    searched. If `directions=True``, each ngram will be returned with a
    direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}.
    Also note that if the mention is not tabular, nothing will be yielded.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :return: a *generator* of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    for span in _to_spans(mention):
        if not span.sentence.is_tabular():
            continue

        yield from get_sentence_ngrams(span,
                                       attrib=attrib,
                                       n_min=n_min,
                                       n_max=n_max,
                                       lower=lower)

        # The neighbor scan runs once per span, after the sentence ngrams.
        # Nested inside the loop above it was repeated once per sentence
        # ngram and skipped entirely when there were none.
        root_cell = span.sentence.cell
        for sentence in chain.from_iterable([
                _get_aligned_sentences(root_cell, "row"),
                _get_aligned_sentences(root_cell, "col"),
        ]):
            row_diff = min_row_diff([sentence, root_cell], absolute=False)
            col_diff = min_col_diff([sentence, root_cell], absolute=False)

            # Logical exclusive OR: exactly one difference is non-zero.
            # Bitwise ``^`` is not equivalent, e.g. ``1 ^ -1 == -2`` is
            # truthy even though both differences are non-zero.
            on_one_axis = bool(row_diff) != bool(col_diff)
            if not (on_one_axis and abs(row_diff) + abs(col_diff) <= dist):
                continue

            ngrams = tokens_to_ngrams(getattr(sentence, attrib),
                                      n_min=n_min,
                                      n_max=n_max,
                                      lower=lower)
            if not directions:
                yield from ngrams
                continue

            if col_diff == 0:
                direction = "DOWN" if 0 < row_diff else "UP"
            else:
                direction = "RIGHT" if 0 < col_diff else "LEFT"
            for ngram in ngrams:
                yield (ngram, direction)