def get_box(span: SpanMention) -> Tuple[int, int, int, int, int]:
    """Return the box enclosing all tokens of ``span``.

    The result is the 5-tuple ``(page, top, left, bottom, right)``: the
    smallest "page", "top" and "left" token values followed by the largest
    "bottom" and "right" token values, each read through
    ``span.get_attrib_tokens``.

    :param span: the mention whose per-token attributes are aggregated.
    :return: ``(page, top, left, bottom, right)``.
    """
    # Lower bounds first, then upper bounds; the attribute lookups happen in
    # the same order as the fields of the returned tuple.
    page, top, left = (
        min(span.get_attrib_tokens(name)) for name in ("page", "top", "left")
    )
    bottom, right = (
        max(span.get_attrib_tokens(name)) for name in ("bottom", "right")
    )
    return page, top, left, bottom, right
Example #2
0
def get_box(span: SpanMention) -> Bbox:
    """Get the bounding box.

    Deprecated: emits a ``DeprecationWarning`` on every call; use
    ``span.get_bbox()`` instead.

    :param span: the mention whose per-token attributes are aggregated.
    :return: a ``Bbox`` built from, in this order, the smallest "page" and
        "top" values, the largest "bottom" value, the smallest "left" value
        and the largest "right" value of the span's tokens.
    """
    warnings.warn(
        "get_box(span) is deprecated. Use span.get_bbox() instead.",
        DeprecationWarning,
        # Attribute the warning to the caller of get_box() -- the code that
        # actually needs to migrate -- rather than to this line.
        stacklevel=2,
    )
    # NOTE(review): positional order presumably mirrors the Bbox field order
    # (page, top, bottom, left, right) -- confirm against the Bbox definition.
    return Bbox(
        min(span.get_attrib_tokens("page")),
        min(span.get_attrib_tokens("top")),
        max(span.get_attrib_tokens("bottom")),
        min(span.get_attrib_tokens("left")),
        max(span.get_attrib_tokens("right")),
    )
Example #3
0
def _get_word_feats(span: SpanMention) -> Iterator[str]:
    """Yield the unary word-level features of ``span``.

    Features are computed once per ``span.stable_id`` and memoized in the
    module-level ``unary_word_feats`` mapping; subsequent calls for the same
    id replay the cached set. The generated features are:

    * ``CONTAINS_WORDS_[...]`` for each 1- and 2-gram of the span's words,
    * ``LEFT_WORDS_[...]`` / ``RIGHT_WORDS_[...]`` for n-grams (up to 2)
      returned by ``get_left_ngrams`` / ``get_right_ngrams`` with the window
      configured under ``settings["featurization"]["textual"]
      ["word_feature"]["window"]``,
    * ``SPAN_TYPE_[IMPLICIT]`` or ``SPAN_TYPE_[EXPLICIT]``,
    * ``STARTS_WITH_CAPITAL`` when the first character of the span text is
      upper case,
    * ``LENGTH_<n>`` with ``n = span.get_num_words()``.

    :param span: the mention to featurize.
    :return: an iterator over feature strings, in set-iteration order.
    """
    attrib = "words"
    cache_key = span.stable_id

    if cache_key not in unary_word_feats:
        # Build into a local set and publish it only once it is complete, so
        # an exception part-way through cannot leave a partial entry in the
        # cache that later calls would silently replay.
        feats = set()
        label = attrib.upper()
        window = settings["featurization"]["textual"]["word_feature"]["window"]

        for ngram in tokens_to_ngrams(
            span.get_attrib_tokens(attrib), n_min=1, n_max=2
        ):
            feats.add(f"CONTAINS_{label}_[{ngram}]")

        for ngram in get_left_ngrams(
            span, window=window, n_max=2, attrib=attrib
        ):
            feats.add(f"LEFT_{label}_[{ngram}]")

        for ngram in get_right_ngrams(
            span, window=window, n_max=2, attrib=attrib
        ):
            feats.add(f"RIGHT_{label}_[{ngram}]")

        span_type = (
            "IMPLICIT" if isinstance(span, ImplicitSpanMention) else "EXPLICIT"
        )
        feats.add(f"SPAN_TYPE_[{span_type}]")

        # Guard against empty span text: indexing [0] would raise IndexError.
        text = span.get_span()
        if text and text[0].isupper():
            feats.add("STARTS_WITH_CAPITAL")

        feats.add(f"LENGTH_{span.get_num_words()}")

        unary_word_feats[cache_key] = feats

    yield from unary_word_feats[cache_key]