def hertz_units(attr):
    """Decide whether ``attr`` sits next to a frequency unit and a gain keyword.

    Returns ``True`` only when both a unit ("mhz"/"khz") and one of the
    topic keywords occur among the lower-cased unigrams to the right of
    ``attr`` or in its row neighbourhood (``spread=[-2, 2]``).

    Returns ``False`` early when the cell unigrams contain both "f" and
    "=", when the stripped span is exactly "0", or when a filter keyword
    occurs in the row of ``attr``.
    """
    unit_tokens = ["mhz", "khz"]
    topic_tokens = [
        "product",
        "gain",
        "gain",
        "unity",
        "bandwidth",
        "gbp",
        "gbw",
        "gbwp",
    ]
    # NOTE(review): "-3 db" contains a space but is compared against
    # unigrams (n_max=1) below; it may never match -- confirm.
    excluded_tokens = ["-3 db", "maximum", "minimum", "impedance"]

    nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
    nearby |= set(get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True))
    in_cell = set(get_cell_ngrams(attr, n_max=1, lower=True))

    if {"f", "="} <= in_cell:
        return False
    if attr.get_span().strip() == "0":
        return False
    if overlap(excluded_tokens, get_row_ngrams(attr, n_max=1, lower=True)):
        return False

    return bool(overlap(unit_tokens, nearby) and overlap(topic_tokens, nearby))
def LF_ce_keywords_no_part_in_rows(c):
    """Vote TRUE when the row of ``c[1]`` has a CE keyword and no alphanumeric code.

    Abstains as soon as any row n-gram (up to trigrams, original case)
    consists of digits followed by letters, or letters followed by digits,
    optionally trailed by more alphanumerics. Otherwise votes ``TRUE`` if an
    entry of ``_CE_KEYWORDS`` or ``_CE_ABBREVS`` occurs among the row
    n-grams, and ``ABSTAIN`` if none does.
    """
    code_like = re.compile("^([0-9]+[a-zA-Z]+|[a-zA-Z]+[0-9]+)[0-9a-zA-Z]*$")
    row_has_code = any(
        code_like.match(ngram.upper())
        for ngram in get_row_ngrams(c[1], n_max=3, lower=False)
    )
    if row_has_code:
        return ABSTAIN

    if overlap(_CE_KEYWORDS.union(_CE_ABBREVS), get_row_ngrams(c[1], n_max=3)):
        return TRUE
    return ABSTAIN
def LF_part_ce_keywords_in_row_prefix(c):
    """Vote TRUE when the row of ``c[1]`` names a CE keyword and the part's prefix.

    All of the following must hold, otherwise the function abstains:

    * a ``_CE_KEYWORDS`` / ``_CE_ABBREVS`` entry occurs in the row n-grams;
    * the lower-cased part span starts with at least one row n-gram that
      survives ``_filter_non_parts``;
    * no ``_NON_CEV_KEYWORDS`` entry occurs in the row;
    * ``LF_current_in_row(c)`` is falsy.
    """
    part_ngrams = _filter_non_parts(get_row_ngrams(c[1], n_max=3))

    if not overlap(_CE_KEYWORDS.union(_CE_ABBREVS), get_row_ngrams(c[1], n_max=3)):
        return ABSTAIN

    prefix_hits = [
        c.part.context.get_span().lower().startswith(ngram) for ngram in part_ngrams
    ]
    if not any(prefix_hits):
        return ABSTAIN

    if overlap(_NON_CEV_KEYWORDS, get_row_ngrams(c[1], n_max=3)):
        return ABSTAIN
    if LF_current_in_row(c):
        return ABSTAIN
    return TRUE
def LF_ce_keywords_not_part_in_row_col_prefix(c):
    """Vote TRUE for a cross-table candidate whose value row has CE keywords only.

    The function abstains unless every condition holds:

    * ``same_table(c)`` is falsy;
    * a ``_CE_KEYWORDS`` / ``_CE_ABBREVS`` entry occurs in the row of ``c[1]``;
    * nothing in the column or row n-grams of ``c[1]`` (original case, up
      to trigrams) survives ``_filter_non_parts``;
    * no ``_NON_CEV_KEYWORDS`` entry occurs in the row of ``c.part`` or in
      the row of ``c[1]``;
    * ``LF_current_in_row(c)`` is falsy.
    """
    col_ngrams = set(get_col_ngrams(c[1], n_max=3, lower=False))
    row_ngrams = set(get_row_ngrams(c[1], n_max=3, lower=False))
    part_ngrams = _filter_non_parts(col_ngrams | row_ngrams)

    if same_table(c):
        return ABSTAIN
    if not overlap(_CE_KEYWORDS.union(_CE_ABBREVS), get_row_ngrams(c[1], n_max=3)):
        return ABSTAIN
    if len(part_ngrams) != 0:
        return ABSTAIN
    if overlap(_NON_CEV_KEYWORDS, get_row_ngrams(c.part, n_max=3)):
        return ABSTAIN
    if overlap(_NON_CEV_KEYWORDS, get_row_ngrams(c[1], n_max=3)):
        return ABSTAIN
    if LF_current_in_row(c):
        return ABSTAIN
    return TRUE
def LF_part_ce_keywords_in_rows_cols_prefix(c):
    """Vote TRUE when row/column n-grams of ``c[1]`` hold a CE keyword and part prefix.

    The row and column n-grams (up to trigrams) of ``c[1]`` are pooled.
    ``TRUE`` is returned when the pool contains a ``_CE_KEYWORDS`` /
    ``_CE_ABBREVS`` entry and the lower-cased part span starts with at least
    one pooled n-gram kept by ``_filter_non_parts``; otherwise ``ABSTAIN``.
    """
    pooled = set(get_row_ngrams(c[1], n_max=3)) | set(get_col_ngrams(c[1], n_max=3))
    part_ngrams = _filter_non_parts(pooled)

    if not overlap(_CE_KEYWORDS.union(_CE_ABBREVS), pooled):
        return ABSTAIN

    prefix_hits = [
        c.part.context.get_span().lower().startswith(ngram) for ngram in part_ngrams
    ]
    return TRUE if any(prefix_hits) else ABSTAIN
def LF_part_miss_match_polarity(c):
    """Vote FALSE when the part row's polarity tokens do not prefix ``c[1]``.

    Unigrams from the row of ``c.part`` (original case) are narrowed with
    ``_filter_non_polarity``. The function abstains when nothing remains or
    when the lower-cased span of ``c[1]`` starts with any remaining token
    (compared case-insensitively); otherwise it returns ``FALSE``.
    """
    polarity_ngrams = _filter_non_polarity(
        set(get_row_ngrams(c.part, n_max=1, lower=False))
    )
    if len(polarity_ngrams) == 0:
        return ABSTAIN

    prefix_hits = [
        c[1].context.get_span().lower().startswith(token.lower())
        for token in polarity_ngrams
    ]
    if any(prefix_hits):
        return ABSTAIN
    return FALSE
def LF_complement_left_row(c):
    """Return -1 when "complement"/"complementary" appears near ``c.part``.

    The search covers the row n-grams of ``c.part`` followed by the n-grams
    within a 10-token window to its left. Returns 0 when neither word is
    found.
    """
    nearby_ngrams = chain(
        get_row_ngrams(c.part),
        get_left_ngrams(c.part, window=10),
    )
    if overlap(["complement", "complementary"], nearby_ngrams):
        return -1
    return 0
def pos_gain(c):
    """Vote TRUE when the word "gain" appears in the row of ``c.gain``.

    Args:
        c: Candidate exposing a ``gain`` mention.

    Returns:
        ``TRUE`` if "gain" is among the lower-cased row n-grams of
        ``c.gain``; ``ABSTAIN`` otherwise.
    """
    row_ngrams = set(get_row_ngrams(c.gain, lower=True))
    if overlap(["gain"], row_ngrams):
        return TRUE
    # The original evaluated ABSTAIN without returning it, so the function
    # fell through and returned None instead of the abstain label.
    return ABSTAIN
def LF_bad_keywords_in_row(c):
    """Vote FALSE when the row of ``c.volt`` contains a disqualifying keyword.

    The keywords are "continuous", "cut-off", "gain" and "breakdown".
    Returns ``ABSTAIN`` when none of them occurs among the row n-grams.
    """
    disqualifying = ["continuous", "cut-off", "gain", "breakdown"]
    if overlap(disqualifying, get_row_ngrams(c.volt)):
        return FALSE
    return ABSTAIN
def LF_complement_left_row(c):
    """Vote FALSE when "complement"/"complementary" appears near ``c.part``.

    The search covers the row n-grams of the part mention followed by the
    n-grams within a 10-token window to its left. Returns ``ABSTAIN`` when
    neither word is found.
    """
    nearby_ngrams = chain(
        get_row_ngrams(c.part),
        get_left_ngrams(c.part, window=10),
    )
    if overlap(["complement", "complementary"], nearby_ngrams):
        return FALSE
    return ABSTAIN
def ce_v_max_conditions(attr):
    """Check that ``attr`` lies in a CE-keyword row not mentioning VCB/base.

    Returns ``False`` when no entry of ``ce_keywords`` / ``ce_abbrevs``
    occurs among the row unigrams of ``attr``, or when the lower-cased
    sentence text contains "vcb" or "base". Returns ``True`` otherwise.
    """
    row_unigrams = set(get_row_ngrams(attr, n_max=1))
    if not overlap(ce_keywords.union(ce_abbrevs), row_unigrams):
        return False

    for blocked in ["vcb", "base"]:
        if blocked in attr.sentence.text.lower():
            return False
    return True
def pos_gain_keywords(c):
    """Vote TRUE when ``c.gain`` is vertically near "typ" in a kHz/MHz row.

    Requires "typ" or "typ." among the lower-cased vertical unigrams of
    ``c.gain`` and "khz" or "mhz" among its lower-cased row n-grams.
    Returns ``ABSTAIN`` when either is missing.
    """
    vertical = set(get_vert_ngrams(c.gain, n_max=1, lower=True))
    in_row = set(get_row_ngrams(c.gain, lower=True))

    has_typ = overlap(["typ", "typ."], vertical)
    return TRUE if has_typ and overlap(["khz", "mhz"], in_row) else ABSTAIN
def is_birthplace_table_row(mention):
    """Return True if ``mention`` is tabular and its row has "birth_place".

    Non-tabular mentions always yield ``False``. Otherwise the lower-cased
    row n-grams of ``mention`` are searched for the exact string
    "birth_place".
    """
    if not mention.sentence.is_tabular():
        return False
    return "birth_place" in get_row_ngrams(mention, lower=True)
def LF_part_miss_match_part(c):
    """Vote FALSE when the row of ``c[1]`` names parts that do not prefix ``c.part``.

    Abstains when ``c[1]`` is not in a tabular sentence, when no row
    unigram (original case) survives ``_filter_non_parts``, or when the
    lower-cased part span starts with any surviving unigram (compared
    case-insensitively). Returns ``FALSE`` otherwise.
    """
    if not c[1].context.sentence.is_tabular():
        return ABSTAIN

    part_ngrams = _filter_non_parts(set(get_row_ngrams(c[1], n_max=1, lower=False)))
    if len(part_ngrams) == 0:
        return ABSTAIN

    prefix_hits = [
        c.part.context.get_span().lower().startswith(token.lower())
        for token in part_ngrams
    ]
    if any(prefix_hits):
        return ABSTAIN
    return FALSE
def is_in_birthplace_table_row(mention):
    """Return True if ``mention`` is tabular and its row has "birth" and "place".

    Non-tabular mentions always yield ``False``. Otherwise both words must
    be present among the lower-cased row n-grams of ``mention``.
    """
    if not mention.sentence.is_tabular():
        return False
    row_ngrams = set(get_row_ngrams(mention, lower=True))
    return {"birth", "place"}.issubset(row_ngrams)
def current_units(attr):
    """Decide whether ``attr`` sits next to a current unit and a supply keyword.

    Returns ``True`` only when both a unit token and one of the topic
    keywords occur among the lower-cased unigrams to the right of ``attr``
    or in its row neighbourhood (``spread=[-5, 5]``).

    Returns ``False`` early when the stripped span is exactly "0" or when a
    filter keyword occurs in the row of ``attr``.
    """
    # NOTE: The two mu glyphs below are distinct characters, not duplicates.
    unit_tokens = ["ma", "μa", "ua", "µa", "\uf06da"]
    topic_tokens = ["supply", "quiescent", "iq", "is", "idd", "icc"]
    excluded_tokens = ["offset", "bias", "logic", "shutdown"]

    nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
    nearby |= set(get_row_ngrams(attr, n_max=1, spread=[-5, 5], lower=True))

    if attr.get_span().strip() == "0":
        return False
    if overlap(excluded_tokens, get_row_ngrams(attr, n_max=1, lower=True)):
        return False

    return bool(overlap(unit_tokens, nearby) and overlap(topic_tokens, nearby))
def get_row_and_column_ngrams(mention):
    """Sanity-check row/column n-grams of ``mention`` and look for "birth_place".

    For a non-tabular mention, both the row and the column n-gram lists are
    asserted to be exactly one ``None`` entry. For a tabular mention, both
    lists are asserted to contain no ``None``.

    Returns:
        ``True`` if "birth_place" is among the row n-grams, else ``False``.

    Raises:
        AssertionError: If one of the checks above fails.
    """
    row_ngrams = list(get_row_ngrams(mention))
    col_ngrams = list(get_col_ngrams(mention))

    if mention.sentence.is_tabular():
        for ngrams in (row_ngrams, col_ngrams):
            assert not any(x is None for x in ngrams)
    else:
        for ngrams in (row_ngrams, col_ngrams):
            assert len(ngrams) == 1 and ngrams[0] is None

    return "birth_place" in row_ngrams
def tablelib_unary_features(span):
    """
    Table-/structure-related features for a single span

    Generator yielding ``(feature_name, DEF_VALUE)`` tuples. Nothing is
    yielded when the span's sentence is not tabular. The attributes to
    featurize and the n-gram size limits are read from
    ``settings["featurization"]["table"]["unary_features"]``.

    NOTE(review): the loop nesting below was reconstructed from a
    whitespace-collapsed source; every feature group is placed inside the
    ``attrib`` loop because the later groups reference ``attrib`` -- confirm
    against the original file.
    """
    if not span.sentence.is_tabular():
        return
    sentence = span.sentence
    for attrib in settings["featurization"]["table"]["unary_features"]["attrib"]:
        # N-grams of the span's own cell.
        for ngram in get_cell_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_cell_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"CELL_{attrib.upper()}_[{ngram}]", DEF_VALUE
        # One feature per row/column index covered by the sentence
        # (both ranges are inclusive of the end index).
        for row_num in range(sentence.row_start, sentence.row_end + 1):
            yield f"ROW_NUM_[{row_num}]", DEF_VALUE
        for col_num in range(sentence.col_start, sentence.col_end + 1):
            yield f"COL_NUM_[{col_num}]", DEF_VALUE
        # NOTE: These two features could be accounted for by HTML_ATTR in
        # structural features
        yield f"ROW_SPAN_[{num_rows(sentence)}]", DEF_VALUE
        yield f"COL_SPAN_[{num_cols(sentence)}]", DEF_VALUE
        # N-grams returned by get_head_ngrams for each axis.
        for axis in ["row", "col"]:
            for ngram in get_head_ngrams(
                span,
                axis,
                n_max=settings["featurization"]["table"]["unary_features"][
                    "get_head_ngrams"
                ]["max"],
                attrib=attrib,
            ):
                yield f"{axis.upper()}_HEAD_{attrib.upper()}_[{ngram}]", DEF_VALUE
        # N-grams returned by get_row_ngrams / get_col_ngrams for the span.
        for ngram in get_row_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_row_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"ROW_{attrib.upper()}_[{ngram}]", DEF_VALUE
        for ngram in get_col_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_col_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"COL_{attrib.upper()}_[{ngram}]", DEF_VALUE
def polarity_filter(c):
    """Throttle a ``(part, attr)`` candidate on polarity and table alignment.

    Returns ``False`` when the part's row contains unigrams matching
    ``polarity_pattern`` and the lower-cased attribute span starts with none
    of them (compared case-insensitively); the mismatch is logged at debug
    level. For the remaining candidates, those within one table are kept
    only if horizontally or vertically aligned, and all others return
    ``True``.
    """
    part, attr = c

    # Unigrams from the part's row that match the polarity pattern.
    ngrams_part = {
        token
        for token in get_row_ngrams(part, n_max=1, lower=False)
        if token and polarity_pattern.match(token)
    }

    if ngrams_part and not any(
        attr.context.get_span().lower().startswith(token.lower())
        for token in ngrams_part
    ):
        logger.debug(
            f"ngrams_part: {ngrams_part}\nattr: {attr.context.get_span().lower()}"
        )
        return False

    if not same_table(c):
        return True
    return is_horz_aligned(c) or is_vert_aligned(c)
def neg_gain_keywords_in_row(c):
    """Vote FALSE when the row of ``c.gain`` contains a disqualifying keyword.

    Args:
        c: Candidate exposing a ``gain`` mention.

    Returns:
        ``FALSE`` if any keyword below occurs among the lower-cased row
        unigrams of ``c.gain``; ``ABSTAIN`` otherwise.
    """
    negative_keywords = [
        "small",
        "full",
        "flat",
        "current",
        "thd",
        "signal",
        "flatness",
        "input",
        "noise",
        "f=",
        "f",
        "-3",
        "power",
        "db",
        "dbm",
        "output",
        # The original list only had the misspelling "impedence", so rows
        # using the correct spelling never matched. Both are listed so that
        # any previously matching rows keep matching.
        "impedence",
        "impedance",
        "delay",
        "capacitance",
        "range",
        "ratio",
        "dbc",
        "temperature",
        "common",
        "voltage",
    ]
    row_ngrams = set(get_row_ngrams(c.gain, n_max=1, lower=True))
    if overlap(negative_keywords, row_ngrams):
        return FALSE
    return ABSTAIN
def LF_typ_row(c):
    """Return -1 when "typ" or "typ." is among the row n-grams of ``c.temp``, else 0."""
    row_ngrams = list(get_row_ngrams(c.temp))
    if overlap(["typ", "typ."], row_ngrams):
        return -1
    return 0
def LF_tstg_row(c):
    """Return 1 when "tstg", "stg" or "ts" is among the row n-grams of ``c.temp``, else 0."""
    row_ngrams = list(get_row_ngrams(c.temp))
    if overlap(["tstg", "stg", "ts"], row_ngrams):
        return 1
    return 0
def LF_operating_row(c):
    """Return 1 when "operating" is among the row n-grams of ``c.temp``, else 0."""
    if "operating" in get_row_ngrams(c.temp):
        return 1
    return 0
def LF_temperature_row(c):
    """Return 1 when "temperature" is among the row n-grams of ``c.temp``, else 0."""
    if "temperature" in get_row_ngrams(c.temp):
        return 1
    return 0
def LF_storage_row(c):
    """Return 1 when "storage" is among the row n-grams of ``c.temp``, else 0."""
    if "storage" in get_row_ngrams(c.temp):
        return 1
    return 0
def LF_part_mismatch_row(c):
    """Vote FALSE when the row of ``c[1]`` names parts that do not prefix ``c.part``.

    Abstains when no row unigram of ``c[1]`` survives ``_filter_non_parts``
    or when the lower-cased part span starts with any surviving unigram
    (compared case-insensitively). Returns ``FALSE`` otherwise.
    """
    part_ngrams = _filter_non_parts(set(get_row_ngrams(c[1], n_max=1)))
    if len(part_ngrams) == 0:
        return ABSTAIN

    prefix_hits = [
        c.part.context.get_span().lower().startswith(token.lower())
        for token in part_ngrams
    ]
    if any(prefix_hits):
        return ABSTAIN
    return FALSE
def LF_operating_row(c):
    """Vote FALSE when "operating" is among the row n-grams of ``c[1]``, else abstain."""
    if "operating" in get_row_ngrams(c[1]):
        return FALSE
    return ABSTAIN
def LF_tstg_row(c):
    """Vote TRUE when "tstg", "stg" or "ts" is among the row n-grams of ``c[1]``, else abstain."""
    row_ngrams = list(get_row_ngrams(c[1]))
    if overlap(["tstg", "stg", "ts"], row_ngrams):
        return TRUE
    return ABSTAIN
def LF_too_many_numbers_row(c):
    """Return -1 when the row of ``c.temp`` has three or more "number" NER tags, else 0."""
    ner_tags = get_row_ngrams(c.temp, attrib="ner_tags")
    number_count = sum(1 for tag in ner_tags if tag == "number")
    if number_count >= 3:
        return -1
    return 0
def LF_temperature_row(c):
    """Vote TRUE when "temperature" is among the row n-grams of ``c[1]``, else abstain."""
    if "temperature" in get_row_ngrams(c[1]):
        return TRUE
    return ABSTAIN