Code Example #1
def lemmatize(spans: Union[pd.Series, SpanArray, Iterable[Span]],
              token_features: pd.DataFrame,
              lemma_col_name: str = "lemma",
              token_span_col_name: str = "span") -> List[str]:
    """
    Convert spans to their normal form using lemma information in a token
    features table.

    :param spans: Spans to be normalized. Each may represent zero or more
    tokens.

    :param token_features: DataFrame of token metadata. Index must be aligned
    with the token indices in `spans`.

    :param lemma_col_name: Optional custom name for the DataFrame column
    containing the lemmatized form of each token.

    :param token_span_col_name: Optional custom name for the DataFrame column
    containing the span of each token.

    :return: A list containing the normalized form of each span in `spans`,
    with the lemmas of each span's tokens separated by a single space character.
    """
    char_spans = SpanArray.make_array(spans)
    token_spans = TokenSpanArray.align_to_tokens(token_features[token_span_col_name],
                                                 char_spans)
    ret = []  # type: List[str]
    # TODO: Vectorize this loop
    for i in range(len(token_spans)):
        lemmas = token_features[lemma_col_name][
                 token_spans.begin_token[i]:token_spans.end_token[i]
                 ]
        ret.append(" ".join(lemmas))
    return ret
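To illustrate how the function above might be called, here is a minimal usage sketch. The document text, token offsets, and lemmas are made up for illustration, and the sketch assumes that `SpanArray` is importable from `text_extensions_for_pandas` with the `SpanArray(text, begins, ends)` constructor of the upstream library; a real pipeline would produce the token features with an NLP library such as spaCy.

import pandas as pd
from text_extensions_for_pandas import SpanArray  # assumed import path

# Hypothetical document, token offsets, and lemmas.
text = "The cats are sleeping"
tokens = SpanArray(text, [0, 4, 9, 13], [3, 8, 12, 21])
token_features = pd.DataFrame({
    "span": tokens,
    "lemma": ["the", "cat", "be", "sleep"],
})

# A character-level span covering "cats are sleeping" (tokens 1 through 3).
spans = SpanArray(text, [4], [21])
print(lemmatize(spans, token_features))  # expected output: ['cat be sleep']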
Code Example #2
File: token_span.py Project: frreiss/tep-fred
    def __init__(self, tokens: Any, begin_token: int, end_token: int):
        """
        :param tokens: Tokenization information about the document, including
        the target text. Must be a type that :func:`SpanArray.make_array()`
        can convert to a `SpanArray`.

        :param begin_token: Begin offset (inclusive) within the tokenized text.

        :param end_token: End offset (exclusive); one past the last token in the span.
        """
        tokens = SpanArray.make_array(tokens)
        if TokenSpan.NULL_OFFSET_VALUE != begin_token and begin_token < 0:
            raise ValueError(
                f"Begin token offset must be NULL_OFFSET_VALUE or "
                f"greater than or equal to zero (got {begin_token})")
        if TokenSpan.NULL_OFFSET_VALUE != begin_token and end_token < begin_token:
            raise ValueError(f"End must be >= begin (got {begin_token} and "
                             f"{end_token})")
        if begin_token > len(tokens):
            raise ValueError(
                f"Begin token offset of {begin_token} larger than "
                f"number of tokens ({len(tokens)})")
        if end_token > len(tokens) + 1:
            raise ValueError(f"End token offset of {end_token} larger than "
                             f"number of tokens + 1 ({len(tokens)} + 1)")
        if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
            raise ValueError(
                f"Tried to create a non-null TokenSpan over an empty list of tokens."
            )
        if TokenSpan.NULL_OFFSET_VALUE == begin_token:
            if TokenSpan.NULL_OFFSET_VALUE != end_token:
                raise ValueError(
                    f"Begin offset with special 'null' value "
                    f"{TokenSpan.NULL_OFFSET_VALUE} must be paired with an "
                    f"end offset of {TokenSpan.NULL_OFFSET_VALUE}"
                )
            begin_char_off = end_char_off = Span.NULL_OFFSET_VALUE
        else:
            begin_char_off = tokens.begin[begin_token]
            end_char_off = (begin_char_off if begin_token == end_token else
                            tokens.end[end_token - 1])
        if len(tokens) == 0:
            doc_text = None
        elif not tokens.is_single_document:
            raise ValueError("Tokens must be from exactly one document.")
        else:
            doc_text = tokens.document_text

        super().__init__(doc_text, begin_char_off, end_char_off)
        self._tokens = tokens
        self._begin_token = begin_token
        self._end_token = end_token
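A minimal construction sketch for the class above, using made-up text and offsets; it assumes `SpanArray` and `TokenSpan` are importable from `text_extensions_for_pandas` and that `SpanArray` takes `(text, begins, ends)`.

from text_extensions_for_pandas import SpanArray, TokenSpan  # assumed import path

# Two tokens over a made-up document.
text = "Hello world"
tokens = SpanArray(text, [0, 6], [5, 11])

# A span covering both tokens ("Hello world")...
span = TokenSpan(tokens, 0, 2)
# ...and the special "null" span, which must use NULL_OFFSET_VALUE for
# both the begin and the end offset.
null_span = TokenSpan(tokens, TokenSpan.NULL_OFFSET_VALUE,
                      TokenSpan.NULL_OFFSET_VALUE)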
Code Example #3
File: token_span.py Project: frreiss/tep-fred
    @staticmethod
    def from_char_offsets(tokens: Any) -> "TokenSpanArray":
        """
        Convenience factory method for wrapping the character-level spans of a
        series of tokens into single-token token-based spans.

        :param tokens: character-based offsets of the tokens, as any type that
         :func:`SpanArray.make_array()` understands.

        :return: A TokenSpanArray containing single-token spans for each of the
        tokens in `tokens`.
        """
        begin_tokens = np.arange(len(tokens))
        tokens_array = SpanArray.make_array(tokens)
        return TokenSpanArray(tokens_array, begin_tokens, begin_tokens + 1)
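A minimal usage sketch for the factory method above, with made-up offsets; `SpanArray` and `TokenSpanArray` are assumed to be importable from `text_extensions_for_pandas`.

from text_extensions_for_pandas import SpanArray, TokenSpanArray  # assumed import path

# Character-level token offsets for a made-up document.
text = "red green blue"
char_tokens = SpanArray(text, [0, 4, 10], [3, 9, 14])

# Wrap each character-level token in its own single-token TokenSpan.
tok_spans = TokenSpanArray.from_char_offsets(char_tokens)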
Code Example #4
File: extract.py Project: frreiss/tep-fred
def extract_regex_tok(
    tokens: Union[SpanArray, pd.Series],
    compiled_regex: regex.Regex,
    min_len: int = 1,
    max_len: int = 1,
    output_col_name: str = "match",
) -> pd.DataFrame:
    """
    Identify all (possibly overlapping) matches of a regular expression
    that start and end on token boundaries.

    :param tokens: ``SpanArray`` of token information, optionally wrapped in a
    `pd.Series`.

    :param compiled_regex: Regular expression to evaluate.

    :param min_len: Minimum match length in tokens

    :param max_len: Maximum match length (inclusive) in tokens

    :param output_col_name: (optional) name of column of matching spans in the
    returned DataFrame

    :return: A single-column DataFrame containing a span for each match of the
    regex.
    """
    tokens = SpanArray.make_array(tokens)

    num_tokens = len(tokens)
    matches_regex_f = np.vectorize(
        lambda s: compiled_regex.fullmatch(s) is not None)

    # The built-in regex functionality of Pandas/Python does not have
    # an optimized single-pass RegexTok, so generate all the places
    # where there might be a match and run them through regex.fullmatch().
    # Note that this approach is asymptotically inefficient if max_len is large.
    # TODO: Performance tuning for both small and large max_len
    matches_list = []
    for cur_len in range(min_len, max_len + 1):
        window_begin_toks = np.arange(0, num_tokens - cur_len + 1)
        window_end_toks = window_begin_toks + cur_len

        window_tok_spans = TokenSpanArray(tokens, window_begin_toks,
                                          window_end_toks)
        matches_list.append(
            pd.Series(window_tok_spans[matches_regex_f(
                window_tok_spans.covered_text)]))
    return pd.DataFrame({output_col_name: pd.concat(matches_list)})
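A minimal usage sketch for the function above. The document, offsets, and pattern are made up, and the sketch assumes the third-party `regex` package plus a `SpanArray` importable from `text_extensions_for_pandas`.

import regex
from text_extensions_for_pandas import SpanArray  # assumed import path

# Hypothetical tokenized document.
text = "call 555 1234 now"
tokens = SpanArray(text, [0, 5, 9, 14], [4, 8, 13, 17])

# Match one- or two-token windows consisting only of digits and whitespace.
pattern = regex.compile(r"[\d\s]+")
matches = extract_regex_tok(tokens, pattern, min_len=1, max_len=2)
# `matches` is a one-column DataFrame whose "match" column should hold spans
# such as "555", "1234", and "555 1234".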
Code Example #5
File: token_span.py Project: frreiss/tep-fred
    @classmethod
    def align_to_tokens(cls, tokens: Any, spans: Any):
        """
        Align a set of character or token-based spans to a specified
        tokenization, producing a `TokenSpanArray` of token-based spans.

        :param tokens: The tokens to align to, as any type that
         `SpanArray.make_array()` accepts.
        :param spans: The spans to align. These spans must all target the same text
         as `tokens`.
        :return: An array of `TokenSpan`s aligned to the tokens of `tokens`.
         Raises `ValueError` if any span in `spans` does not start and end on
         a token boundary.
        """
        tokens = SpanArray.make_array(tokens)
        spans = SpanArray.make_array(spans)

        if not tokens.is_single_document:
            raise ValueError(
                f"Tokens cover more than one document (tokens are {tokens})")
        if not spans.is_single_document:
            raise ValueError(
                f"Spans cover more than one document (spans are {spans})")

        # Create and join temporary dataframes
        tokens_df = pd.DataFrame({
            "token_index": np.arange(len(tokens)),
            "token_begin": tokens.begin,
            "token_end": tokens.end
        })
        spans_df = pd.DataFrame({
            "span_index": np.arange(len(spans)),
            "span_begin": spans.begin,
            "span_end": spans.end
        })

        # Ignore zero-length tokens
        # TODO: Is this the right thing to do?
        tokens_df = tokens_df[
            tokens_df["token_begin"] != tokens_df["token_end"]]

        begin_matches = pd.merge(tokens_df,
                                 spans_df,
                                 left_on="token_begin",
                                 right_on="span_begin",
                                 how="right",
                                 indicator=True)

        mismatched = begin_matches[begin_matches["_merge"] == "right_only"]
        if len(mismatched.index) > 0:
            raise ValueError(
                f"The following span(s) did not align with the begin offset\n"
                f"of any token:\n"
                f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

        end_matches = pd.merge(tokens_df,
                               spans_df,
                               left_on="token_end",
                               right_on="span_end",
                               how="right",
                               indicator=True)

        mismatched = end_matches[end_matches["_merge"] == "right_only"]
        if len(mismatched.index) > 0:
            raise ValueError(
                f"The following span(s) did not align with the end offset\n"
                f"of any token:\n"
                f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

        # Join on span index to get (begin, end) pairs.
        begins_and_ends = pd.merge(begin_matches[["token_index",
                                                  "span_index"]],
                                   end_matches[["token_index", "span_index"]],
                                   on="span_index",
                                   suffixes=("_begin", "_end"),
                                   sort=True)

        return TokenSpanArray(tokens, begins_and_ends["token_index_begin"],
                              begins_and_ends["token_index_end"] + 1)
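A minimal usage sketch for the method above, with made-up offsets; `SpanArray` and `TokenSpanArray` are assumed to be the types from `text_extensions_for_pandas`, with the `SpanArray(text, begins, ends)` constructor.

from text_extensions_for_pandas import SpanArray, TokenSpanArray  # assumed import path

# Tokenization of a made-up document.
text = "New York City"
tokens = SpanArray(text, [0, 4, 9], [3, 8, 13])

# A character-level span covering exactly the first two tokens ("New York").
entity_spans = SpanArray(text, [0], [8])

aligned = TokenSpanArray.align_to_tokens(tokens, entity_spans)
# aligned[0] should be a TokenSpan with begin_token == 0 and end_token == 2.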