from typing import Any, Iterable, List, Union

import numpy as np
import pandas as pd
import regex

# Span, SpanArray, TokenSpan, and TokenSpanArray are assumed to be defined in
# (or imported from) the surrounding package.


def lemmatize(spans: Union[pd.Series, SpanArray, Iterable[Span]],
              token_features: pd.DataFrame,
              lemma_col_name: str = "lemma",
              token_span_col_name: str = "span") -> List[str]:
    """
    Convert spans to their normal form using lemma information in a token
    features table.

    :param spans: Spans to be normalized. Each may represent zero or more
     tokens.
    :param token_features: DataFrame of token metadata. Index must be aligned
     with the token indices in `spans`.
    :param lemma_col_name: Optional custom name for the DataFrame column
     containing the lemmatized form of each token.
    :param token_span_col_name: Optional custom name for the DataFrame column
     containing the span of each token.

    :return: A list containing normalized versions of the tokens in `spans`,
     with the tokens of each span separated by a single space character.
    """
    char_spans = SpanArray.make_array(spans)
    token_spans = TokenSpanArray.align_to_tokens(
        token_features[token_span_col_name], char_spans)
    ret = []  # type: List[str]
    # TODO: Vectorize this loop
    for i in range(len(token_spans)):
        lemmas = token_features[lemma_col_name][
            token_spans.begin_token[i]:token_spans.end_token[i]
        ]
        ret.append(" ".join(lemmas))
    return ret
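
# Usage sketch (illustration only, not part of the original source). Assumes
# SpanArray can be constructed as SpanArray(text, begins, ends); the sample
# text, character offsets, and lemmas below are made up.
def _example_lemmatize() -> List[str]:
    text = "The cats were sleeping"
    tokens = SpanArray(text, [0, 4, 9, 14], [3, 8, 13, 22])
    token_features = pd.DataFrame({
        "span": tokens,
        "lemma": ["the", "cat", "be", "sleep"],
    })
    # One character-based span covering "cats were sleeping" (chars 4-22).
    spans = SpanArray(text, [4], [22])
    return lemmatize(spans, token_features)  # ["cat be sleep"]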
def __init__(self, tokens: Any, begin_token: int, end_token: int):
    """
    :param tokens: Tokenization information about the document, including the
     target text. Must be a type that :func:`SpanArray.make_array()` can
     convert to a `SpanArray`.
    :param begin_token: Begin offset (inclusive) within the tokenized text.
    :param end_token: End offset (exclusive); one past the last token.
    """
    tokens = SpanArray.make_array(tokens)
    if TokenSpan.NULL_OFFSET_VALUE != begin_token and begin_token < 0:
        raise ValueError(
            f"Begin token offset must be NULL_OFFSET_VALUE or "
            f"greater than or equal to zero (got {begin_token})")
    if TokenSpan.NULL_OFFSET_VALUE != begin_token and end_token < begin_token:
        raise ValueError(f"End must be >= begin (got {begin_token} and "
                         f"{end_token})")
    if begin_token > len(tokens):
        raise ValueError(
            f"Begin token offset of {begin_token} larger than "
            f"number of tokens ({len(tokens)})")
    if end_token > len(tokens) + 1:
        raise ValueError(f"End token offset of {end_token} larger than "
                         f"number of tokens + 1 ({len(tokens)} + 1)")
    if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
        raise ValueError(
            "Tried to create a non-null TokenSpan over an empty list of "
            "tokens.")
    if TokenSpan.NULL_OFFSET_VALUE == begin_token:
        if TokenSpan.NULL_OFFSET_VALUE != end_token:
            raise ValueError(
                f"Begin offset with special 'null' value "
                f"{TokenSpan.NULL_OFFSET_VALUE} must be paired with an end "
                f"offset of {TokenSpan.NULL_OFFSET_VALUE}")
        begin_char_off = end_char_off = Span.NULL_OFFSET_VALUE
    else:
        begin_char_off = tokens.begin[begin_token]
        end_char_off = (begin_char_off if begin_token == end_token
                        else tokens.end[end_token - 1])
    if len(tokens) == 0:
        doc_text = None
    elif not tokens.is_single_document:
        raise ValueError("Tokens must be from exactly one document.")
    else:
        doc_text = tokens.document_text
    super().__init__(doc_text, begin_char_off, end_char_off)
    self._tokens = tokens
    self._begin_token = begin_token
    self._end_token = end_token
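
# Usage sketch (illustration only, not part of the original source). Assumes
# SpanArray(text, begins, ends) and that TokenSpan is the class this
# constructor belongs to; sample text and offsets are made up.
def _example_token_span():
    text = "Hello world"
    tokens = SpanArray(text, [0, 6], [5, 11])
    # Token offsets [0, 2) cover both tokens, i.e. characters 0-11.
    span = TokenSpan(tokens, 0, 2)
    # A null span pairs NULL_OFFSET_VALUE begin and end token offsets.
    null_span = TokenSpan(tokens, TokenSpan.NULL_OFFSET_VALUE,
                          TokenSpan.NULL_OFFSET_VALUE)
    return span, null_span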
@staticmethod
def from_char_offsets(tokens: Any) -> "TokenSpanArray":
    """
    Convenience factory method for wrapping the character-level spans of a
    series of tokens into single-token token-based spans.

    :param tokens: character-based offsets of the tokens, as any type that
     :func:`SpanArray.make_array()` understands.

    :return: A TokenSpanArray containing single-token spans for each of the
     tokens in `tokens`.
    """
    tokens_array = SpanArray.make_array(tokens)
    begin_tokens = np.arange(len(tokens_array))
    return TokenSpanArray(tokens_array, begin_tokens, begin_tokens + 1)
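
# Usage sketch (illustration only, not part of the original source). Assumes
# SpanArray(text, begins, ends); sample text and offsets are made up.
def _example_from_char_offsets() -> "TokenSpanArray":
    text = "red green blue"
    char_tokens = SpanArray(text, [0, 4, 10], [3, 9, 14])
    # Element i of the result is a token-based span over exactly token i,
    # i.e. begin_token == i and end_token == i + 1.
    return TokenSpanArray.from_char_offsets(char_tokens)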
def extract_regex_tok(
        tokens: Union[SpanArray, pd.Series],
        compiled_regex: regex.Regex,
        min_len: int = 1,
        max_len: int = 1,
        output_col_name: str = "match") -> pd.DataFrame:
    """
    Identify all (possibly overlapping) matches of a regular expression that
    start and end on token boundaries.

    :param tokens: ``SpanArray`` of token information, optionally wrapped in a
     `pd.Series`.
    :param compiled_regex: Regular expression to evaluate.
    :param min_len: Minimum match length in tokens
    :param max_len: Maximum match length (inclusive) in tokens
    :param output_col_name: (optional) name of the column of matching spans in
     the returned DataFrame

    :return: A single-column DataFrame containing a span for each match of the
     regex.
    """
    tokens = SpanArray.make_array(tokens)
    num_tokens = len(tokens)
    matches_regex_f = np.vectorize(
        lambda s: compiled_regex.fullmatch(s) is not None)

    # The built-in regex functionality of Pandas/Python does not have an
    # optimized single-pass RegexTok, so generate all the places where there
    # might be a match and run them through regex.fullmatch().
    # Note that this approach is asymptotically inefficient if max_len is
    # large.
    # TODO: Performance tuning for both small and large max_len
    matches_list = []
    for cur_len in range(min_len, max_len + 1):
        window_begin_toks = np.arange(0, num_tokens - cur_len + 1)
        window_end_toks = window_begin_toks + cur_len
        window_tok_spans = TokenSpanArray(tokens, window_begin_toks,
                                          window_end_toks)
        matches_list.append(
            pd.Series(
                window_tok_spans[matches_regex_f(
                    window_tok_spans.covered_text)]))
    return pd.DataFrame({output_col_name: pd.concat(matches_list)})
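
# Usage sketch (illustration only, not part of the original source). Assumes
# SpanArray(text, begins, ends); sample text, offsets, and pattern are made
# up.
def _example_extract_regex_tok() -> pd.DataFrame:
    text = "the cat sat"
    tokens = SpanArray(text, [0, 4, 8], [3, 7, 11])
    # Match windows of one or two lowercase words that start and end on token
    # boundaries: "the", "cat", "sat", "the cat", and "cat sat".
    pattern = regex.compile(r"[a-z]+(?: [a-z]+)?")
    return extract_regex_tok(tokens, pattern, min_len=1, max_len=2)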
@classmethod
def align_to_tokens(cls, tokens: Any, spans: Any) -> "TokenSpanArray":
    """
    Align a set of character or token-based spans to a specified
    tokenization, producing a `TokenSpanArray` of token-based spans.

    :param tokens: The tokens to align to, as any type that
     `SpanArray.make_array()` accepts.
    :param spans: The spans to align. These spans must all target the same
     text as `tokens`.

    :return: An array of `TokenSpan`s aligned to the tokens of `tokens`.
     Raises `ValueError` if any span in `spans` does not start and end on a
     token boundary.
    """
    tokens = SpanArray.make_array(tokens)
    spans = SpanArray.make_array(spans)
    if not tokens.is_single_document:
        raise ValueError(
            f"Tokens cover more than one document (tokens are {tokens})")
    if not spans.is_single_document:
        raise ValueError(
            f"Spans cover more than one document (spans are {spans})")

    # Create and join temporary dataframes
    tokens_df = pd.DataFrame({
        "token_index": np.arange(len(tokens)),
        "token_begin": tokens.begin,
        "token_end": tokens.end
    })
    spans_df = pd.DataFrame({
        "span_index": np.arange(len(spans)),
        "span_begin": spans.begin,
        "span_end": spans.end
    })

    # Ignore zero-length tokens
    # TODO: Is this the right thing to do?
    tokens_df = tokens_df[tokens_df["token_begin"] != tokens_df["token_end"]]

    begin_matches = pd.merge(tokens_df, spans_df,
                             left_on="token_begin", right_on="span_begin",
                             how="right", indicator=True)
    mismatched = begin_matches[begin_matches["_merge"] == "right_only"]
    if len(mismatched.index) > 0:
        raise ValueError(
            f"The following span(s) did not align with the begin offset\n"
            f"of any token:\n"
            f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

    end_matches = pd.merge(tokens_df, spans_df,
                           left_on="token_end", right_on="span_end",
                           how="right", indicator=True)
    mismatched = end_matches[end_matches["_merge"] == "right_only"]
    if len(mismatched.index) > 0:
        raise ValueError(
            f"The following span(s) did not align with the end offset\n"
            f"of any token:\n"
            f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

    # Join on span index to get (begin, end) pairs.
    begins_and_ends = pd.merge(
        begin_matches[["token_index", "span_index"]],
        end_matches[["token_index", "span_index"]],
        on="span_index", suffixes=("_begin", "_end"), sort=True)

    return TokenSpanArray(tokens,
                          begins_and_ends["token_index_begin"],
                          begins_and_ends["token_index_end"] + 1)
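
# Usage sketch (illustration only, not part of the original source). Assumes
# SpanArray(text, begins, ends); sample text and offsets are made up.
def _example_align_to_tokens() -> "TokenSpanArray":
    text = "New York City"
    tokens = SpanArray(text, [0, 4, 9], [3, 8, 13])
    # One character-based span covering "New York" (chars 0-8). Its begin
    # matches the begin of token 0 and its end matches the end of token 1, so
    # the result is a single token-based span with begin_token 0, end_token 2.
    spans = SpanArray(text, [0], [8])
    return TokenSpanArray.align_to_tokens(tokens, spans)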