def _tablelib_multinary_features(
    spans: Tuple[SpanMention, ...]
) -> Iterator[Tuple[str, int]]:
    """Table-/structure-related features for multiple spans.

    Nothing is yielded unless every span's sentence is tabular. When all
    sentences share one table, ``SAME_TABLE`` is emitted, followed (if every
    sentence has a cell) by row/column/Manhattan distance features and, for
    spans sharing one cell, ``SAME_CELL``, word/char offset features and
    ``SAME_SENTENCE`` when all sentences compare equal. When the tables
    differ and every sentence has a cell, the ``DIFF_TABLE`` counterparts
    of the distance features are emitted instead.

    :param spans: The spans to featurize jointly.
    :return: A generator of ``(feature_name, DEF_VALUE)`` tuples.
    """
    multinary_features = settings["featurization"]["tabular"][
        "multinary_features"
    ]
    span_sentences = [span.sentence for span in spans]
    if all(sentence.is_tabular() for sentence in span_sentences):
        span_tables = [sentence.table for sentence in span_sentences]
        # Comparing the list shifted by one against itself is True exactly
        # when every adjacent pair is equal, i.e. all elements are equal.
        if span_tables[1:] == span_tables[:-1]:
            yield "SAME_TABLE", DEF_VALUE
            if all(sentence.cell is not None for sentence in span_sentences):
                row_diff = min_row_diff(
                    span_sentences,
                    absolute=multinary_features["min_row_diff"]["absolute"],
                )
                col_diff = min_col_diff(
                    span_sentences,
                    absolute=multinary_features["min_col_diff"]["absolute"],
                )
                yield f"SAME_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
                yield f"SAME_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
                yield (
                    f"SAME_TABLE_MANHATTAN_DIST_[{abs(row_diff) + abs(col_diff)}]"
                ), DEF_VALUE
                span_cells = [sentence.cell for sentence in span_sentences]
                if span_cells[1:] == span_cells[:-1]:
                    yield "SAME_CELL", DEF_VALUE
                    # Sum of the pairwise differences of consecutive spans.
                    word_diff = sum(
                        s1.get_word_start_index() - s2.get_word_start_index()
                        for s1, s2 in zip(spans[:-1], spans[1:])
                    )
                    yield (f"WORD_DIFF_[{word_diff}]"), DEF_VALUE
                    char_diff = sum(
                        s1.char_start - s2.char_start
                        for s1, s2 in zip(spans[:-1], spans[1:])
                    )
                    yield (f"CHAR_DIFF_[{char_diff}]"), DEF_VALUE
                    # The comparison must not be wrapped in a list literal:
                    # a one-element list is always truthy, which would emit
                    # SAME_SENTENCE unconditionally.
                    if span_sentences[1:] == span_sentences[:-1]:
                        yield "SAME_SENTENCE", DEF_VALUE
        else:
            if all(sentence.cell is not None for sentence in span_sentences):
                yield "DIFF_TABLE", DEF_VALUE
                row_diff = min_row_diff(
                    span_sentences,
                    absolute=multinary_features["min_row_diff"]["absolute"],
                )
                col_diff = min_col_diff(
                    span_sentences,
                    absolute=multinary_features["min_col_diff"]["absolute"],
                )
                yield f"DIFF_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
                yield f"DIFF_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
                yield (
                    f"DIFF_TABLE_MANHATTAN_DIST_[{abs(row_diff) + abs(col_diff)}]"
                ), DEF_VALUE
def tablelib_binary_features(span1, span2):
    """
    Yield table-/structure-related features describing a pair of spans.

    Nothing is yielded unless the sentences of both spans are tabular.
    Spans in the same table produce ``SAME_TABLE`` plus, when both sentences
    have a cell, row/column/Manhattan distance features; spans that also
    share a cell additionally produce ``SAME_CELL``, word/char offset
    features and, for equal sentences, ``SAME_SENTENCE``. Spans in different
    tables whose sentences both have a cell produce the ``DIFF_TABLE``
    equivalents of the distance features.

    :param span1: The first span of the pair
    :param span2: The second span of the pair
    :rtype: a *generator* of (feature_name, DEF_VALUE) tuples
    """
    binary_features = settings["featurization"]["table"]["binary_features"]

    sent1 = span1.sentence
    if not sent1.is_tabular():
        return
    sent2 = span2.sentence
    if not sent2.is_tabular():
        return

    if not (sent1.table == sent2.table):
        # Different tables: only distance features, and only with cells.
        if sent1.cell is None or sent2.cell is None:
            return
        yield "DIFF_TABLE", DEF_VALUE
        row_diff = min_row_diff(
            sent1, sent2, absolute=binary_features["min_row_diff"]["absolute"]
        )
        col_diff = min_col_diff(
            sent1, sent2, absolute=binary_features["min_col_diff"]["absolute"]
        )
        manhattan = abs(row_diff) + abs(col_diff)
        yield f"DIFF_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
        yield f"DIFF_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
        yield f"DIFF_TABLE_MANHATTAN_DIST_[{manhattan}]", DEF_VALUE
        return

    yield "SAME_TABLE", DEF_VALUE
    if sent1.cell is None or sent2.cell is None:
        return

    row_diff = min_row_diff(
        sent1, sent2, absolute=binary_features["min_row_diff"]["absolute"]
    )
    col_diff = min_col_diff(
        sent1, sent2, absolute=binary_features["min_col_diff"]["absolute"]
    )
    manhattan = abs(row_diff) + abs(col_diff)
    yield f"SAME_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
    yield f"SAME_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
    yield f"SAME_TABLE_MANHATTAN_DIST_[{manhattan}]", DEF_VALUE

    if not (sent1.cell == sent2.cell):
        return

    yield "SAME_CELL", DEF_VALUE
    word_offset = span1.get_word_start_index() - span2.get_word_start_index()
    yield f"WORD_DIFF_[{word_offset}]", DEF_VALUE
    char_offset = span1.char_start - span2.char_start
    yield f"CHAR_DIFF_[{char_offset}]", DEF_VALUE
    if sent1 == sent2:
        yield "SAME_SENTENCE", DEF_VALUE
def tablelib_binary_features(span1, span2):
    """
    Yield table-/structure-related features describing a pair of spans.

    Nothing is yielded unless the sentences of both spans are tabular.
    Same-table pairs get ``SAME_TABLE`` and, when both sentences have a
    cell, row/column/Manhattan distance features; same-cell pairs further
    get ``SAME_CELL``, word/char offsets and, for equal sentences,
    ``SAME_SENTENCE``. Pairs from different tables whose sentences both have
    a cell get the ``DIFF_TABLE`` variants of the distance features.

    :param span1: The first span of the pair
    :param span2: The second span of the pair
    :rtype: a *generator* of (feature_name, DEF_VALUE) tuples
    """
    sent1 = span1.sentence
    if not sent1.is_tabular():
        return
    sent2 = span2.sentence
    if not sent2.is_tabular():
        return

    if not (sent1.table == sent2.table):
        # Different tables: only distance features, and only with cells.
        if sent1.cell is None or sent2.cell is None:
            return
        yield u"DIFF_TABLE", DEF_VALUE
        table_cfg = settings.featurization.table.binary_features
        row_diff = min_row_diff(
            sent1, sent2, absolute=table_cfg.min_row_diff.absolute
        )
        col_diff = min_col_diff(
            sent1, sent2, absolute=table_cfg.min_col_diff.absolute
        )
        yield u"DIFF_TABLE_ROW_DIFF_[%s]" % row_diff, DEF_VALUE
        yield u"DIFF_TABLE_COL_DIFF_[%s]" % col_diff, DEF_VALUE
        manhattan = str(abs(row_diff) + abs(col_diff))
        yield u"DIFF_TABLE_MANHATTAN_DIST_[%s]" % manhattan, DEF_VALUE
        return

    yield u"SAME_TABLE", DEF_VALUE
    if sent1.cell is None or sent2.cell is None:
        return

    table_cfg = settings.featurization.table.binary_features
    row_diff = min_row_diff(
        sent1, sent2, absolute=table_cfg.min_row_diff.absolute
    )
    col_diff = min_col_diff(
        sent1, sent2, absolute=table_cfg.min_col_diff.absolute
    )
    yield u"SAME_TABLE_ROW_DIFF_[%s]" % row_diff, DEF_VALUE
    yield u"SAME_TABLE_COL_DIFF_[%s]" % col_diff, DEF_VALUE
    manhattan = str(abs(row_diff) + abs(col_diff))
    yield u"SAME_TABLE_MANHATTAN_DIST_[%s]" % manhattan, DEF_VALUE

    if not (sent1.cell == sent2.cell):
        return

    yield u"SAME_CELL", DEF_VALUE
    word_offset = span1.get_word_start() - span2.get_word_start()
    yield u"WORD_DIFF_[%s]" % (word_offset), DEF_VALUE
    char_offset = span1.char_start - span2.char_start
    yield u"CHAR_DIFF_[%s]" % (char_offset), DEF_VALUE
    if sent1 == sent2:
        yield u"SAME_SENTENCE", DEF_VALUE
def get_neighbor_cell_ngrams(mention, dist=1, directions=False, attrib="words",
                             n_min=1, n_max=1, lower=True):
    """
    Get the ngrams of the Mention's own sentence and of neighboring Cells.

    For every span of the Mention, the ngrams of the span's sentence are
    yielded first (always as bare ngrams). If that sentence is tabular, the
    sentences row- or column-aligned with its Cell are then scanned, and
    those offset along exactly one axis by at most ``dist`` contribute their
    ngrams. Note that if a candidate is passed in, all of its Mentions will
    be searched.

    With ``directions=True`` each neighbor ngram is paired with a direction:
    'UP' / 'DOWN' for a positive / negative row difference, 'RIGHT' / 'LEFT'
    for a positive / negative column difference.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams (or (ngram, direction) tuples if
        directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    ngram_kwargs = dict(n_min=n_min, n_max=n_max, lower=lower)
    for span in _to_spans(mention):
        yield from get_sentence_ngrams(span, attrib=attrib, **ngram_kwargs)

        if not span.sentence.is_tabular():
            continue

        root_cell = span.sentence.cell
        aligned = chain(
            _get_aligned_sentences(root_cell, "row"),
            _get_aligned_sentences(root_cell, "col"),
        )
        for neighbor in aligned:
            row_diff = min_row_diff(neighbor, root_cell, absolute=False)
            col_diff = min_col_diff(neighbor, root_cell, absolute=False)

            # Keep only neighbors displaced along exactly one axis...
            if bool(row_diff) == bool(col_diff):
                continue
            # ...and no farther away than the requested distance.
            if not (abs(row_diff) + abs(col_diff) <= dist):
                continue

            if not directions:
                yield from tokens_to_ngrams(
                    getattr(neighbor, attrib), **ngram_kwargs
                )
                continue

            direction = ""
            if col_diff == 0:
                if 0 < row_diff <= dist:
                    direction = "UP"
                elif -dist <= row_diff < 0:
                    direction = "DOWN"
            elif row_diff == 0:
                if 0 < col_diff <= dist:
                    direction = "RIGHT"
                elif -dist <= col_diff < 0:
                    direction = "LEFT"
            for ngram in tokens_to_ngrams(
                getattr(neighbor, attrib), **ngram_kwargs
            ):
                yield (ngram, direction)
def get_neighbor_cell_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    dist: int = 1,
    directions: bool = False,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[Union[str, Tuple[str, str]]]:
    """Get ngrams from all neighbor Cells.

    Get the ngrams from all Cells that are within a given Cell distance in one
    direction from the given Mention.

    Note that if a candidate is passed in, all of its Mentions will be
    searched. If ``directions=True``, each neighbor ngram will be returned
    with a direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}: 'DOWN' / 'UP' for a
    positive / negative row difference and 'RIGHT' / 'LEFT' for a positive /
    negative column difference. The ngrams of the span's own sentence are
    yielded first and always as bare ngrams, without a direction.

    Also note that if the mention is not tabular, nothing will be yielded.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :return: a *generator* of ngrams (or (ngram, direction) tuples if
        directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    spans = _to_spans(mention)
    for span in spans:
        if not span.sentence.is_tabular():
            continue

        for ngram in get_sentence_ngrams(
            span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
        ):
            yield ngram

        root_cell = span.sentence.cell
        for sentence in chain.from_iterable(
            [
                _get_aligned_sentences(root_cell, "row"),
                _get_aligned_sentences(root_cell, "col"),
            ]
        ):
            row_diff = min_row_diff([sentence, root_cell], absolute=False)
            col_diff = min_col_diff([sentence, root_cell], absolute=False)

            # Logical exclusive OR: the neighbor must be displaced along
            # exactly one axis. A bitwise `row_diff ^ col_diff` is not
            # equivalent: it is truthy whenever the two values merely differ
            # (e.g. 1 ^ 2 == 3), which would let diagonal cells through.
            if bool(row_diff) == bool(col_diff):
                continue
            if abs(row_diff) + abs(col_diff) > dist:
                continue

            ngrams = tokens_to_ngrams(
                getattr(sentence, attrib),
                n_min=n_min,
                n_max=n_max,
                lower=lower,
            )
            if directions:
                # Exactly one of the differences is non-zero at this point.
                if col_diff == 0:
                    direction = "DOWN" if 0 < row_diff else "UP"
                else:
                    direction = "RIGHT" if 0 < col_diff else "LEFT"
                for ngram in ngrams:
                    yield (ngram, direction)
            else:
                for ngram in ngrams:
                    yield ngram