def LF_part_mismatch_header(c):
    """Labeling function: vote FALSE when the part does not match its header.

    Collects the unigram (``n_max=1``) head ngrams of ``c[1]`` along the
    ``"col"`` axis via ``get_head_ngrams``, passes them through
    ``_filter_non_parts`` and compares what is left against the text returned
    by ``c.part.context.get_span()``.

    Args:
        c: Candidate exposing ``c[1]`` and ``c.part.context``.

    Returns:
        ``ABSTAIN`` when no head ngram survives the filter, or when the
        lower-cased part span starts with at least one surviving ngram
        (comparison is case-insensitive); ``FALSE`` otherwise.
    """
    # A set can be built straight from the iterable; the intermediate list
    # the original created was a redundant copy.
    header_parts = _filter_non_parts(
        set(get_head_ngrams(c[1], n_max=1, axis="col"))
    )
    if len(header_parts) == 0:
        return ABSTAIN

    # The span text is identical for every comparison, so fetch and
    # lower-case it once instead of once per header ngram.
    part_text = c.part.context.get_span().lower()

    # Generator form lets any() stop at the first matching header ngram.
    if any(part_text.startswith(ngram.lower()) for ngram in header_parts):
        return ABSTAIN
    return FALSE
def tablelib_unary_features(span):
    """Yield table-/structure-related features for a single span.

    Nothing is yielded unless ``span.sentence.is_tabular()`` is truthy.
    Otherwise, for every attribute listed under
    ``settings["featurization"]["table"]["unary_features"]["attrib"]`` the
    generator emits ``(feature_name, DEF_VALUE)`` pairs in this order:

    1. ``CELL_<ATTRIB>_[ngram]`` for each ngram from ``get_cell_ngrams``.
    2. ``ROW_NUM_[i]`` for each ``i`` in ``row_start..row_end`` (inclusive),
       then ``COL_NUM_[j]`` for each ``j`` in ``col_start..col_end``
       (inclusive).
    3. ``ROW_SPAN_[n]`` and ``COL_SPAN_[m]`` using ``num_rows`` / ``num_cols``.
    4. ``ROW_HEAD_<ATTRIB>_[ngram]`` then ``COL_HEAD_<ATTRIB>_[ngram]`` for
       each ngram from ``get_head_ngrams`` on the respective axis.
    5. ``ROW_<ATTRIB>_[ngram]`` for each ngram from ``get_row_ngrams``.
    6. ``COL_<ATTRIB>_[ngram]`` for each ngram from ``get_col_ngrams``.

    The ``n_max`` passed to each ``get_*_ngrams`` helper is read from the
    matching ``[<helper name>]["max"]`` entry of the same settings section.

    Args:
        span: Span whose ``sentence`` provides ``is_tabular()`` and the
            ``row_start`` / ``row_end`` / ``col_start`` / ``col_end`` bounds.

    Yields:
        Tuples of ``(feature name, DEF_VALUE)``.
    """
    sentence = span.sentence
    if not sentence.is_tabular():
        return

    unary_cfg = settings["featurization"]["table"]["unary_features"]

    # NOTE(review): every feature group below sits inside the per-attribute
    # loop, so the attribute-independent ROW_NUM / COL_NUM / *_SPAN features
    # repeat once per attribute. This nesting was inferred from the use of
    # `attrib` further down -- confirm against the intended layout.
    for attrib in unary_cfg["attrib"]:
        yield from (
            (f"CELL_{attrib.upper()}_[{ngram}]", DEF_VALUE)
            for ngram in get_cell_ngrams(
                span,
                n_max=unary_cfg["get_cell_ngrams"]["max"],
                attrib=attrib,
            )
        )

        # Position of the sentence inside the table.
        yield from (
            (f"ROW_NUM_[{row_num}]", DEF_VALUE)
            for row_num in range(sentence.row_start, sentence.row_end + 1)
        )
        yield from (
            (f"COL_NUM_[{col_num}]", DEF_VALUE)
            for col_num in range(sentence.col_start, sentence.col_end + 1)
        )

        # NOTE: These two features could be accounted for by HTML_ATTR in
        # structural features
        yield f"ROW_SPAN_[{num_rows(sentence)}]", DEF_VALUE
        yield f"COL_SPAN_[{num_cols(sentence)}]", DEF_VALUE

        # Head ngrams, first along the row axis and then the column axis.
        for axis in ("row", "col"):
            yield from (
                (f"{axis.upper()}_HEAD_{attrib.upper()}_[{ngram}]", DEF_VALUE)
                for ngram in get_head_ngrams(
                    span,
                    axis,
                    n_max=unary_cfg["get_head_ngrams"]["max"],
                    attrib=attrib,
                )
            )

        yield from (
            (f"ROW_{attrib.upper()}_[{ngram}]", DEF_VALUE)
            for ngram in get_row_ngrams(
                span,
                n_max=unary_cfg["get_row_ngrams"]["max"],
                attrib=attrib,
            )
        )
        yield from (
            (f"COL_{attrib.upper()}_[{ngram}]", DEF_VALUE)
            for ngram in get_col_ngrams(
                span,
                n_max=unary_cfg["get_col_ngrams"]["max"],
                attrib=attrib,
            )
        )
def LF_head_ends_with_ceo(c):
    """Labeling function: vote TRUE when a head ngram of ``c[1]`` ends in "ceo".

    Args:
        c: Candidate whose second element (``c[1]``) is handed to
            ``get_head_ngrams``.

    Returns:
        ``TRUE`` as soon as one ngram produced by ``get_head_ngrams(c[1])``
        ends with the string ``"ceo"``; ``ABSTAIN`` if none does (including
        when no ngrams are produced).
    """
    for head_ngram in get_head_ngrams(c[1]):
        if head_ngram.endswith("ceo"):
            return TRUE
    return ABSTAIN