Ejemplo n.º 1
0
def _tablelib_multinary_features(
        spans: Tuple[SpanMention, ...]) -> Iterator[Tuple[str, int]]:
    """Table-/structure-related features for multiple spans.

    Nothing is yielded unless every span's sentence reports ``is_tabular()``.
    Otherwise the generator emits ``(feature_name, DEF_VALUE)`` pairs:

    * ``SAME_TABLE`` when all sentences share one table, followed (when every
      sentence has a cell) by row/column/Manhattan distance features and, if
      all sentences share one cell, ``SAME_CELL``, ``WORD_DIFF_[..]``,
      ``CHAR_DIFF_[..]`` and -- only when all sentences are equal --
      ``SAME_SENTENCE``.
    * ``DIFF_TABLE`` plus the ``DIFF_TABLE_*`` distance features when the
      tables differ and every sentence has a cell.

    Whether distances are absolute is read from
    ``settings["featurization"]["tabular"]["multinary_features"]``.

    :param spans: The spans to featurize jointly.
    :return: a *generator* of ``(feature_name, DEF_VALUE)`` tuples.
    """
    multinary_features = settings["featurization"]["tabular"][
        "multinary_features"]
    span_sentences = [span.sentence for span in spans]
    if all(sentence.is_tabular() for sentence in span_sentences):
        span_tables = [sentence.table for sentence in span_sentences]
        # Comparing the list shifted by one against itself is true exactly
        # when every adjacent pair is equal, i.e. all entries are equal.
        if span_tables[1:] == span_tables[:-1]:
            yield "SAME_TABLE", DEF_VALUE
            if all(span.sentence.cell is not None for span in spans):
                row_diff = min_row_diff(
                    span_sentences,
                    absolute=multinary_features["min_row_diff"]["absolute"],
                )
                col_diff = min_col_diff(
                    span_sentences,
                    absolute=multinary_features["min_col_diff"]["absolute"],
                )
                yield f"SAME_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
                yield f"SAME_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
                yield (
                    f"SAME_TABLE_MANHATTAN_DIST_[{abs(row_diff) + abs(col_diff)}]"
                ), DEF_VALUE
                span_cells = [sentence.cell for sentence in span_sentences]
                if span_cells[1:] == span_cells[:-1]:
                    yield "SAME_CELL", DEF_VALUE
                    # Sum of the pairwise differences between consecutive
                    # spans.
                    word_diff = sum(
                        s1.get_word_start_index() - s2.get_word_start_index()
                        for s1, s2 in zip(spans[:-1], spans[1:])
                    )
                    yield (f"WORD_DIFF_[{word_diff}]"), DEF_VALUE
                    char_diff = sum(
                        s1.char_start - s2.char_start
                        for s1, s2 in zip(spans[:-1], spans[1:])
                    )
                    yield (f"CHAR_DIFF_[{char_diff}]"), DEF_VALUE
                    # The comparison must be tested directly: wrapping it in
                    # a list literal would make the condition always truthy.
                    if span_sentences[1:] == span_sentences[:-1]:
                        yield "SAME_SENTENCE", DEF_VALUE
        else:
            if all(sentence.cell is not None for sentence in span_sentences):
                yield "DIFF_TABLE", DEF_VALUE
                row_diff = min_row_diff(
                    span_sentences,
                    absolute=multinary_features["min_row_diff"]["absolute"],
                )
                col_diff = min_col_diff(
                    span_sentences,
                    absolute=multinary_features["min_col_diff"]["absolute"],
                )
                yield f"DIFF_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
                yield f"DIFF_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
                yield (
                    f"DIFF_TABLE_MANHATTAN_DIST_[{abs(row_diff) + abs(col_diff)}]"
                ), DEF_VALUE
Ejemplo n.º 2
0
def tablelib_binary_features(span1, span2):
    """
    Table-/structure-related features for a pair of spans

    Yields ``(feature_name, DEF_VALUE)`` tuples. Nothing is yielded unless
    both spans' sentences report ``is_tabular()``. For sentences in the same
    table, ``SAME_TABLE`` is always emitted; distance features require both
    sentences to have a cell, and the same-cell features (``SAME_CELL``,
    ``WORD_DIFF_[..]``, ``CHAR_DIFF_[..]``, ``SAME_SENTENCE``) require the
    cells to be equal. For sentences in different tables, ``DIFF_TABLE`` and
    the ``DIFF_TABLE_*`` distances are emitted only when both have a cell.

    :param span1: The first span of the pair
    :param span2: The second span of the pair
    :rtype: a *generator* of (feature_name, DEF_VALUE) tuples
    """
    binary_features = settings["featurization"]["table"]["binary_features"]
    if not span1.sentence.is_tabular() or not span2.sentence.is_tabular():
        return

    first, second = span1.sentence, span2.sentence

    def both_in_cells():
        # True when each sentence is attached to a cell.
        return first.cell is not None and second.cell is not None

    def grid_offsets():
        # Row and column differences, using the configured "absolute" flags.
        rows = min_row_diff(
            first,
            second,
            absolute=binary_features["min_row_diff"]["absolute"],
        )
        cols = min_col_diff(
            first,
            second,
            absolute=binary_features["min_col_diff"]["absolute"],
        )
        return rows, cols

    if not (first.table == second.table):
        # Different tables: only the distance features apply.
        if both_in_cells():
            yield "DIFF_TABLE", DEF_VALUE
            row_diff, col_diff = grid_offsets()
            yield f"DIFF_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
            yield f"DIFF_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
            yield (
                f"DIFF_TABLE_MANHATTAN_DIST_[{abs(row_diff) + abs(col_diff)}]"
            ), DEF_VALUE
        return

    yield "SAME_TABLE", DEF_VALUE
    if not both_in_cells():
        return

    row_diff, col_diff = grid_offsets()
    yield f"SAME_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
    yield f"SAME_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
    yield (
        f"SAME_TABLE_MANHATTAN_DIST_[{abs(row_diff) + abs(col_diff)}]"
    ), DEF_VALUE

    if not (first.cell == second.cell):
        return

    yield "SAME_CELL", DEF_VALUE
    yield (
        f"WORD_DIFF_["
        f"{span1.get_word_start_index() - span2.get_word_start_index()}"
        f"]"
    ), DEF_VALUE
    yield (
        f"CHAR_DIFF_[{span1.char_start - span2.char_start}]"
    ), DEF_VALUE
    if first == second:
        yield "SAME_SENTENCE", DEF_VALUE
Ejemplo n.º 3
0
def tablelib_binary_features(span1, span2):
    """
    Table-/structure-related features for a pair of spans

    Yields ``(feature_name, DEF_VALUE)`` tuples. Nothing is yielded unless
    both spans' sentences report ``is_tabular()``. ``SAME_TABLE`` is emitted
    whenever the two sentences share a table; every other feature also
    requires both sentences to have a cell. The ``absolute`` flags passed to
    ``min_row_diff`` / ``min_col_diff`` are read from
    ``settings.featurization.table.binary_features``.

    :param span1: The first span of the pair
    :param span2: The second span of the pair
    :rtype: a *generator* of (feature_name, DEF_VALUE) tuples
    """
    if not span1.sentence.is_tabular() or not span2.sentence.is_tabular():
        return

    left, right = span1.sentence, span2.sentence
    in_same_table = left.table == right.table

    # SAME_TABLE is reported even if a sentence has no cell; DIFF_TABLE is
    # reported only once both cells are known to exist.
    if in_same_table:
        yield u"SAME_TABLE", DEF_VALUE
    if left.cell is None or right.cell is None:
        return
    if not in_same_table:
        yield u"DIFF_TABLE", DEF_VALUE

    table_cfg = settings.featurization.table.binary_features
    row_diff = min_row_diff(
        left,
        right,
        absolute=table_cfg.min_row_diff.absolute,
    )
    col_diff = min_col_diff(
        left,
        right,
        absolute=table_cfg.min_col_diff.absolute,
    )

    if not in_same_table:
        yield u"DIFF_TABLE_ROW_DIFF_[%s]" % row_diff, DEF_VALUE
        yield u"DIFF_TABLE_COL_DIFF_[%s]" % col_diff, DEF_VALUE
        yield u"DIFF_TABLE_MANHATTAN_DIST_[%s]" % str(
            abs(row_diff) + abs(col_diff)), DEF_VALUE
        return

    yield u"SAME_TABLE_ROW_DIFF_[%s]" % row_diff, DEF_VALUE
    yield u"SAME_TABLE_COL_DIFF_[%s]" % col_diff, DEF_VALUE
    yield u"SAME_TABLE_MANHATTAN_DIST_[%s]" % str(
        abs(row_diff) + abs(col_diff)), DEF_VALUE

    if not (left.cell == right.cell):
        return

    # Same cell: add the finer-grained positional features.
    yield u"SAME_CELL", DEF_VALUE
    yield u"WORD_DIFF_[%s]" % (
        span1.get_word_start() - span2.get_word_start()), DEF_VALUE
    yield u"CHAR_DIFF_[%s]" % (
        span1.char_start - span2.char_start), DEF_VALUE
    if left == right:
        yield u"SAME_SENTENCE", DEF_VALUE
Ejemplo n.º 4
0
def get_neighbor_cell_ngrams(mention,
                             dist=1,
                             directions=False,
                             attrib="words",
                             n_min=1,
                             n_max=1,
                             lower=True):
    """
    Get the ngrams from all Cells that are within a given Cell distance in one
    direction from the given Mention.

    For every span of the mention, the ngrams of the span's own sentence are
    yielded first (always as plain ngrams, even when ``directions=True``).
    If the sentence is tabular, the row- and column-aligned sentences of its
    cell are then scanned, and those offset along exactly one axis by at most
    ``dist`` contribute their ngrams.

    Note that if a candidate is passed in, all of its Mentions will be
    searched. If ``directions=True``, each neighbor ngram will be returned
    with a direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    for span in _to_spans(mention):
        for own_ngram in get_sentence_ngrams(span,
                                             attrib=attrib,
                                             n_min=n_min,
                                             n_max=n_max,
                                             lower=lower):
            yield own_ngram

        if not span.sentence.is_tabular():
            continue

        root_cell = span.sentence.cell
        aligned_sentences = chain(
            _get_aligned_sentences(root_cell, "row"),
            _get_aligned_sentences(root_cell, "col"),
        )
        for sentence in aligned_sentences:
            row_diff = min_row_diff(sentence, root_cell, absolute=False)
            col_diff = min_col_diff(sentence, root_cell, absolute=False)

            # Keep only neighbors displaced along exactly one axis ...
            if not (row_diff or col_diff):
                continue
            if row_diff and col_diff:
                continue
            # ... and no farther away than `dist` cells.
            if abs(row_diff) + abs(col_diff) > dist:
                continue

            direction = ""
            if directions:
                # NOTE(review): a positive row_diff is labelled "UP" here; the
                # sign convention of min_row_diff is not visible in this
                # block -- confirm the labels match it.
                if col_diff == 0:
                    if 0 < row_diff <= dist:
                        direction = "UP"
                    elif -dist <= row_diff < 0:
                        direction = "DOWN"
                elif row_diff == 0:
                    if 0 < col_diff <= dist:
                        direction = "RIGHT"
                    elif -dist <= col_diff < 0:
                        direction = "LEFT"

            for ngram in tokens_to_ngrams(
                    getattr(sentence, attrib),
                    n_min=n_min,
                    n_max=n_max,
                    lower=lower,
            ):
                if directions:
                    yield (ngram, direction)
                else:
                    yield ngram
Ejemplo n.º 5
0
def get_neighbor_cell_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    dist: int = 1,
    directions: bool = False,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[Union[str, Tuple[str, str]]]:
    """Get ngrams from all neighbor Cells.

    Get the ngrams from all Cells that are within a given Cell distance in one
    direction from the given Mention.

    For each tabular span, the ngrams of the span's own sentence are yielded
    first (always as plain ngrams), followed -- exactly once per span -- by the
    ngrams of every row-/column-aligned sentence that is offset along exactly
    one axis by at most ``dist``.

    Note that if a candidate is passed in, all of its Mentions will be
    searched. If ``directions=True``, each neighbor ngram will be returned
    with a direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}.
    Also note that if the mention is not tabular, nothing will be yielded.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :return: a *generator* of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    spans = _to_spans(mention)
    for span in spans:
        if not span.sentence.is_tabular():
            continue

        for ngram in get_sentence_ngrams(span,
                                         attrib=attrib,
                                         n_min=n_min,
                                         n_max=n_max,
                                         lower=lower):
            yield ngram

        # The neighbor scan runs once per span. Nesting it inside the loop
        # above would repeat every neighbor ngram for each sentence ngram.
        root_cell = span.sentence.cell
        for sentence in chain.from_iterable([
                _get_aligned_sentences(root_cell, "row"),
                _get_aligned_sentences(root_cell, "col"),
        ]):
            row_diff = min_row_diff([sentence, root_cell], absolute=False)
            col_diff = min_col_diff([sentence, root_cell], absolute=False)

            # Logical exclusive OR: exactly one offset may be non-zero. A
            # bitwise `^` is not equivalent (e.g. 1 ^ -1 == -2 is truthy).
            if bool(row_diff) == bool(col_diff):
                continue
            if abs(row_diff) + abs(col_diff) > dist:
                continue

            neighbor_ngrams = tokens_to_ngrams(
                getattr(sentence, attrib),
                n_min=n_min,
                n_max=n_max,
                lower=lower,
            )
            if directions:
                # Exactly one offset is non-zero here, so its sign alone
                # determines the direction label.
                if col_diff == 0:
                    direction = "DOWN" if 0 < row_diff else "UP"
                else:
                    direction = "RIGHT" if 0 < col_diff else "LEFT"
                for neighbor_ngram in neighbor_ngrams:
                    yield (neighbor_ngram, direction)
            else:
                for neighbor_ngram in neighbor_ngrams:
                    yield neighbor_ngram