def LF_part_ce_keywords_in_rows_cols_prefix(c):
    """Vote TRUE when CE keywords and a part-prefix match both appear near c[1].

    The row and column n-grams (n <= 3) of ``c[1]`` are pooled and passed
    through ``_filter_non_parts``. ``TRUE`` is returned only when

    * the pooled n-grams overlap the union of ``_CE_KEYWORDS`` and
      ``_CE_ABBREVS``, and
    * the lower-cased span of ``c.part`` starts with at least one of the
      n-grams kept by ``_filter_non_parts``.

    In every other case the function returns ``ABSTAIN``.
    """
    nearby = set(get_row_ngrams(c[1], n_max=3))
    nearby |= set(get_col_ngrams(c[1], n_max=3))
    part_like = _filter_non_parts(nearby)

    # No CE keyword/abbreviation in the row or column: nothing to say.
    if not overlap(_CE_KEYWORDS.union(_CE_ABBREVS), nearby):
        return ABSTAIN

    prefix_hits = [
        c.part.context.get_span().lower().startswith(prefix)
        for prefix in part_like
    ]
    if any(prefix_hits):
        return TRUE
    return ABSTAIN
def LF_ce_keywords_not_part_in_row_col_prefix(c):
    """Vote TRUE for cross-table candidates with CE keywords and no part n-grams.

    ``TRUE`` is returned only when all of the following hold, checked in
    this order (the first failing check yields ``ABSTAIN``):

    1. ``same_table(c)`` is falsy.
    2. The row n-grams (n <= 3) of ``c[1]`` overlap the union of
       ``_CE_KEYWORDS`` and ``_CE_ABBREVS``.
    3. ``_filter_non_parts`` keeps nothing from the case-preserved row and
       column n-grams (n <= 3) of ``c[1]``.
    4. The row n-grams of ``c.part`` do not overlap ``_NON_CEV_KEYWORDS``.
    5. The row n-grams of ``c[1]`` do not overlap ``_NON_CEV_KEYWORDS``.
    6. ``LF_current_in_row(c)`` is falsy.
    """
    # Case is preserved here (lower=False) before filtering.
    cased_ngrams = set(get_col_ngrams(c[1], n_max=3, lower=False))
    cased_ngrams |= set(get_row_ngrams(c[1], n_max=3, lower=False))
    part_like = _filter_non_parts(cased_ngrams)

    if same_table(c):
        return ABSTAIN
    if not overlap(
        _CE_KEYWORDS.union(_CE_ABBREVS), get_row_ngrams(c[1], n_max=3)
    ):
        return ABSTAIN
    if len(part_like) != 0:
        return ABSTAIN
    if overlap(_NON_CEV_KEYWORDS, get_row_ngrams(c.part, n_max=3)):
        return ABSTAIN
    if overlap(_NON_CEV_KEYWORDS, get_row_ngrams(c[1], n_max=3)):
        return ABSTAIN
    if LF_current_in_row(c):
        return ABSTAIN
    return TRUE
def get_row_and_column_ngrams(mention):
    """Check the mention's row/column n-grams and report a "birth_place" row hit.

    The row and column n-grams of ``mention`` are materialised into lists
    and sanity-checked with assertions:

    * tabular sentence: neither list may contain ``None``;
    * non-tabular sentence: each list must be exactly one ``None`` entry.

    Returns ``True`` if ``"birth_place"`` is one of the row n-grams,
    ``False`` otherwise. The column n-grams are only validated, not used
    for the return value.
    """
    in_row = list(get_row_ngrams(mention))
    in_col = list(get_col_ngrams(mention))

    if mention.sentence.is_tabular():
        assert all(gram is not None for gram in in_row)
        assert all(gram is not None for gram in in_col)
    else:
        assert len(in_row) == 1
        assert in_row[0] is None
        assert len(in_col) == 1
        assert in_col[0] is None

    return "birth_place" in in_row
def tablelib_unary_features(span):
    """
    Table-/structure-related features for a single span

    Generator yielding ``(feature_name, DEF_VALUE)`` pairs. Nothing is
    yielded when ``span.sentence.is_tabular()`` is falsy.

    For each ``attrib`` listed under
    ``settings["featurization"]["table"]["unary_features"]["attrib"]`` the
    following feature names are produced, in this order:

    * ``CELL_<ATTRIB>_[ngram]`` from ``get_cell_ngrams``
    * ``ROW_NUM_[i]`` / ``COL_NUM_[j]`` for the sentence's row/column range
    * ``ROW_SPAN_[n]`` / ``COL_SPAN_[n]`` from ``num_rows`` / ``num_cols``
    * ``ROW_HEAD_<ATTRIB>_[ngram]`` / ``COL_HEAD_<ATTRIB>_[ngram]`` from
      ``get_head_ngrams``
    * ``ROW_<ATTRIB>_[ngram]`` from ``get_row_ngrams``
    * ``COL_<ATTRIB>_[ngram]`` from ``get_col_ngrams``

    The ``n_max`` passed to each n-gram helper is read from the matching
    ``settings[...]["unary_features"][<helper name>]["max"]`` entry.

    NOTE(review): the loop nesting below assumes every feature group sits
    inside the ``attrib`` loop (the later calls pass ``attrib=attrib``);
    confirm against the upstream layout.
    """
    # Table features are only defined for spans inside a table.
    if not span.sentence.is_tabular():
        return
    sentence = span.sentence
    for attrib in settings["featurization"]["table"]["unary_features"]["attrib"]:
        # N-grams of the cell containing the span.
        for ngram in get_cell_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_cell_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"CELL_{attrib.upper()}_[{ngram}]", DEF_VALUE
        # One feature per row / column index covered by the sentence
        # (both range ends are inclusive, hence the + 1).
        for row_num in range(sentence.row_start, sentence.row_end + 1):
            yield f"ROW_NUM_[{row_num}]", DEF_VALUE
        for col_num in range(sentence.col_start, sentence.col_end + 1):
            yield f"COL_NUM_[{col_num}]", DEF_VALUE
        # NOTE: These two features could be accounted for by HTML_ATTR in
        # structural features
        yield f"ROW_SPAN_[{num_rows(sentence)}]", DEF_VALUE
        yield f"COL_SPAN_[{num_cols(sentence)}]", DEF_VALUE
        # Head n-grams along each axis.
        for axis in ["row", "col"]:
            for ngram in get_head_ngrams(
                span,
                axis,
                n_max=settings["featurization"]["table"]["unary_features"][
                    "get_head_ngrams"
                ]["max"],
                attrib=attrib,
            ):
                yield f"{axis.upper()}_HEAD_{attrib.upper()}_[{ngram}]", DEF_VALUE
        # N-grams from the span's row.
        for ngram in get_row_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_row_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"ROW_{attrib.upper()}_[{ngram}]", DEF_VALUE
        # N-grams from the span's column.
        for ngram in get_col_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_col_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"COL_{attrib.upper()}_[{ngram}]", DEF_VALUE
def neg_gain_keywords_in_column(c):
    """Vote FALSE if the gain mention's column holds any listed keyword.

    The lower-cased unigrams from the column of ``c.gain`` are compared
    against a fixed keyword list. Any overlap returns ``FALSE``; otherwise
    the function returns ``ABSTAIN``.
    """
    column_unigrams = set(get_col_ngrams(c.gain, n_max=1, lower=True))
    keywords = [
        "max",
        "min",
        "test",
        "condition",
        "conditions",
        "vgn",
        "f",
        "-3",
        "db",
        "dbc",
    ]
    return FALSE if overlap(keywords, column_unigrams) else ABSTAIN
def _condition(attr):
    """Return False when attr's column unigrams include "condition(s)".

    Returns ``True`` when neither ``"condition"`` nor ``"conditions"``
    overlaps the unigrams from the column of ``attr``.
    """
    mentions_condition = overlap(
        ["condition", "conditions"], get_col_ngrams(attr, n_max=1)
    )
    return not mentions_condition
def LF_part_mismatch_col(c):
    """Vote FALSE when the column has filtered n-grams but none prefix the part.

    The unigrams from the column of ``c[1]`` are passed through
    ``_filter_non_parts``. The function returns ``ABSTAIN`` if nothing is
    kept, or if the lower-cased span of ``c.part`` starts with any kept
    n-gram (compared case-insensitively). Otherwise it returns ``FALSE``.
    """
    part_like = _filter_non_parts(set(get_col_ngrams(c[1], n_max=1)))

    # Nothing kept from the column: no evidence either way.
    if len(part_like) == 0:
        return ABSTAIN

    prefix_hits = [
        c.part.context.get_span().lower().startswith(candidate.lower())
        for candidate in part_like
    ]
    if any(prefix_hits):
        return ABSTAIN
    return FALSE
def neg_current_keywords_in_column(c):
    """Vote FALSE if the supply-current mention's column holds a listed keyword.

    The lower-cased column n-grams of ``c.supply_current`` are compared
    against a fixed keyword list. Any overlap returns ``FALSE``; otherwise
    the function returns ``ABSTAIN``.
    """
    keywords = ["over", "temperature", "vgn", "f", "-3", "db", "dbc", "min", "max"]
    if overlap(keywords, get_col_ngrams(c.supply_current, lower=True)):
        return FALSE
    return ABSTAIN
def pos_current_typ(c):
    """Vote TRUE if the supply-current mention's column contains "typ"/"typ.".

    The lower-cased column n-grams of ``c.supply_current`` are checked for
    overlap with ``["typ", "typ."]``. Returns ``ABSTAIN`` when there is
    none.
    """
    typ_markers = ["typ", "typ."]
    if overlap(typ_markers, get_col_ngrams(c.supply_current, lower=True)):
        return TRUE
    return ABSTAIN