Code example #1
def token_span_to_arrow(token_span: TokenSpanArray) -> pa.ExtensionArray:
    """
    Convert a TokenSpanArray to a pyarrow.ExtensionArray with a type
    of ArrowTokenSpanType and struct as the storage type. The resulting
    extension array can be serialized and transferred with standard
    Arrow protocols.

    :param token_span: A TokenSpanArray to be converted
    :return: pyarrow.ExtensionArray containing TokenSpan data
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")
    # Create arrays for begins/ends
    token_begins_array = pa.array(token_span.begin_token)
    token_ends_array = pa.array(token_span.end_token)

    # Filter out any empty SpanArrays
    non_null_tokens = token_span.tokens[~token_span.isna()]
    assert len(non_null_tokens) > 0

    # Use a one-element list for a single document, or a list of all token SpanArrays if there are multiple documents
    if all([token is non_null_tokens[0] for token in non_null_tokens]):
        tokens_arrays = [non_null_tokens[0]]
        tokens_indices = pa.array([0] * len(token_span.tokens),
                                  mask=token_span.isna())
    else:
        raise NotImplementedError(
            "TokenSpan Multi-doc serialization not yet implemented due to "
            "ArrowNotImplementedError: Concat with dictionary unification NYI")
        tokens_arrays = non_null_tokens
        tokens_indices = np.zeros_like(token_span.tokens)
        tokens_indices[~token_span.isna()] = range(len(tokens_arrays))
        tokens_indices = pa.array(tokens_indices, mask=token_span.isna())

    # Convert each token SpanArray to Arrow and get as raw storage
    arrow_tokens_arrays = [span_to_arrow(sa).storage for sa in tokens_arrays]

    # Create a list array where each element is an ArrowSpanArray
    # TODO: pyarrow.lib.ArrowNotImplementedError: ('Sequence converter for type dictionary<values=string, indices=int8, ordered=0> not implemented', 'Conversion failed for column ts1 with type TokenSpanDtype')
    #arrow_tokens_arrays_array = pa.array(arrow_tokens_arrays, type=pa.list_(arrow_tokens_arrays[0].type))
    offsets = [0] + [len(a) for a in arrow_tokens_arrays]
    values = pa.concat_arrays(
        arrow_tokens_arrays)  # TODO: can't concat extension arrays?
    arrow_tokens_arrays_array = pa.ListArray.from_arrays(offsets, values)

    # Create a dictionary array mapping each token SpanArray index used to the list of ArrowSpanArrays
    tokens_dict_array = pa.DictionaryArray.from_arrays(
        tokens_indices, arrow_tokens_arrays_array)

    typ = ArrowTokenSpanType(token_begins_array.type, tokens_dict_array.type)
    fields = list(typ.storage_type)

    storage = pa.StructArray.from_arrays(
        [token_begins_array, token_ends_array, tokens_dict_array],
        fields=fields)

    return pa.ExtensionArray.from_storage(typ, storage)
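
A minimal round-trip sketch, assuming the surrounding text_extensions_for_pandas module (SpanArray, TokenSpanArray, span_to_arrow, ArrowTokenSpanType) is importable and PyArrow >= 2.0.0 is installed; the sample text and offsets are made up, and arrow_to_token_span is the inverse conversion shown in code example #17 below.

import numpy as np
import pyarrow as pa

text = "King Arthur and his squire"
tokens = SpanArray(text, np.array([0, 5, 12, 16, 20]), np.array([4, 11, 15, 19, 26]))
token_spans = TokenSpanArray(tokens, [0, 3], [2, 5])  # "King Arthur", "his squire"

ext_array = token_span_to_arrow(token_spans)  # ExtensionArray with struct storage
round_trip = arrow_to_token_span(ext_array)   # inverse conversion, code example #17
print(round_trip.covered_text)                # expect the two covered strings back
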
Code example #2
def make_tokens_and_features(
    target_text: str, language_model, add_left_and_right=False,
) -> pd.DataFrame:
    """
    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model (`spacy.language.Language`)
     object
    :param add_left_and_right: If ``True``, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A `pd.DataFrame` with one row per token, containing the tokens of the
             text plus the additional linguistic features that the language model
             generates.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = SpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)
    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.array)
    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {spacy_doc[i].idx: i for i in range(len(spacy_doc))}
    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    df_cols = {
        "id": range(len(tok_begins)),
        "span": tokens_series,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use nullable int type because these columns contain nulls
        df_cols["left"] = pd.array(
            [None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype()
        )
        df_cols["right"] = pd.array(
            list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype()
        )
    return pd.DataFrame(df_cols)
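
A hedged usage sketch for make_tokens_and_features, assuming spaCy with the en_core_web_sm model is installed (any pipeline with a parser and named-entity recognizer should work) and that SpanArray, TokenSpanArray, and _make_sentences_series (code example #11) are importable; the sample sentence is made up. The token_features frame built here is reused in the sketches after code examples #4, #14, and #16.

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: this model is installed
token_features = make_tokens_and_features(
    "In AD 932, King Arthur and his squire travel throughout Britain.", nlp
)
print(token_features[["id", "span", "lemma", "pos", "ent_iob", "ent_type", "sentence"]])
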
Code example #3
def align_bert_tokens_to_corpus_tokens(
        spans_df: pd.DataFrame, corpus_toks_df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand entity matches from a BERT-based model so that they align
    with the corpus's original tokenization.

    :param spans_df: DataFrame of extracted entities. Must contain two
     columns: "span" and "ent_type". Other columns ignored.
    :param corpus_toks_df: DataFrame of the corpus's original tokenization,
     one row per token.
     Must contain a column "span" with character-based spans of
     the tokens.

    :returns: A new DataFrame with schema ["span", "ent_type"],
     where the "span" column contains token-based spans based off
     the *corpus* tokenization in `corpus_toks_df["span"]`.
    """
    if len(spans_df.index) == 0:
        return spans_df.copy()
    overlaps_df = (spanner.overlap_join(spans_df["span"],
                                        corpus_toks_df["span"], "span",
                                        "corpus_token").merge(spans_df))
    agg_df = (overlaps_df.groupby("span").aggregate({
        "corpus_token": "sum",
        "ent_type": "first"
    }).reset_index())
    cons_df = (spanner.consolidate(agg_df, "corpus_token")[[
        "corpus_token", "ent_type"
    ]].rename(columns={"corpus_token": "span"}))
    cons_df["span"] = TokenSpanArray.align_to_tokens(corpus_toks_df["span"],
                                                     cons_df["span"])
    return cons_df
Code example #4
def lemmatize(spans: Union[pd.Series, SpanArray, Iterable[Span]],
              token_features: pd.DataFrame,
              lemma_col_name: str = "lemma",
              token_span_col_name: str = "span") -> List[str]:
    """
    Convert spans to their normal form using lemma information in a token
    features table.

    :param spans: Spans to be normalized. Each may represent zero or more
    tokens.

    :param token_features: DataFrame of token metadata. Index must be aligned
    with the token indices in `spans`.

    :param lemma_col_name: Optional custom name for the DataFrame column
    containing the lemmatized form of each token.

    :param token_span_col_name: Optional custom name for the DataFrame column
    containing the span of each token.

    :return: A list containing the normalized version of each span in `spans`,
    with the tokens within a span separated by a single space character.
    """
    char_spans = SpanArray.make_array(spans)
    token_spans = TokenSpanArray.align_to_tokens(token_features[token_span_col_name],
                                                 char_spans)
    ret = []  # Type: List[str]
    # TODO: Vectorize this loop
    for i in range(len(token_spans)):
        lemmas = token_features[lemma_col_name][
                 token_spans.begin_token[i]:token_spans.end_token[i]
                 ]
        ret.append(" ".join(lemmas))
    return ret
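
A short sketch for lemmatize, reusing the nlp pipeline and token_features frame from the sketch after code example #2; the slice of spans chosen here is purely illustrative.

# Normalize a few single-token spans; any iterable of spans is accepted.
print(lemmatize(token_features["span"].iloc[3:6], token_features))
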
Code example #5
def _make_empty_series() -> pd.Series:
    """
    Zero-length TokenSpanArray wrapped in a series. Note that this array has
    zero spans but *does* contain token and text information.
    """
    return pd.Series(
        TokenSpanArray(_TOKENS_ARRAY, [], [])
    )
Code example #6
    def _make_join_arg(self) -> pd.Series:
        """
        Shared example join argument used by most of the test cases that follow.
        """
        return pd.Series(
            TokenSpanArray._from_sequence(
                [
                    TokenSpan(_TOKENS_ARRAY, 23, 28),  # Knights of the Round Table
                    TokenSpan(_TOKENS_ARRAY, 17, 19),  # searching for
                    TokenSpan(_TOKENS_ARRAY, 1, 2),  # In
                    TokenSpan(_TOKENS_ARRAY, 1, 2),  # In (second copy)
                    TokenSpan(_TOKENS_ARRAY, 42, 45),  # Lancelot the Brave
                ]
            )
        )
Code example #7
File: extract.py Project: frreiss/tep-fred
def extract_regex_tok(
    tokens: Union[SpanArray, pd.Series],
    compiled_regex: regex.Regex,
    min_len=1,
    max_len=1,
    output_col_name: str = "match",
):
    """
    Identify all (possibly overlapping) matches of a regular expression
    that start and end on token boundaries.

    :param tokens: ``SpanArray`` of token information, optionally wrapped in a
    `pd.Series`.

    :param compiled_regex: Regular expression to evaluate.

    :param min_len: Minimum match length in tokens

    :param max_len: Maximum match length (inclusive) in tokens

    :param output_col_name: (optional) name of column of matching spans in the
    returned DataFrame

    :returns: A single-column DataFrame containing a span for each match of the
    regex.
    """
    tokens = SpanArray.make_array(tokens)

    num_tokens = len(tokens)
    matches_regex_f = np.vectorize(
        lambda s: compiled_regex.fullmatch(s) is not None)

    # The built-in regex functionality of Pandas/Python does not have
    # an optimized single-pass RegexTok, so generate all the places
    # where there might be a match and run them through regex.fullmatch().
    # Note that this approach is asymptotically inefficient if max_len is large.
    # TODO: Performance tuning for both small and large max_len
    matches_list = []
    for cur_len in range(min_len, max_len + 1):
        window_begin_toks = np.arange(0, num_tokens - cur_len + 1)
        window_end_toks = window_begin_toks + cur_len

        window_tok_spans = TokenSpanArray(tokens, window_begin_toks,
                                          window_end_toks)
        matches_list.append(
            pd.Series(window_tok_spans[matches_regex_f(
                window_tok_spans.covered_text)]))
    return pd.DataFrame({output_col_name: pd.concat(matches_list)})
Code example #8
def align_bert_tokens_to_corpus_tokens(
    spans_df: pd.DataFrame,
    corpus_toks_df: pd.DataFrame,
    spans_df_token_col: str = "span",
    corpus_df_token_col: str = "span",
    entity_type_col: str = "ent_type",
) -> pd.DataFrame:
    """
    Expand entity matches from a BERT-based model so that they align
    with the corpus's original tokenization.

    :param spans_df: DataFrame of extracted entities. Must contain two
     columns with span and entity type information, respectively. Other columns ignored.
    :param corpus_toks_df: DataFrame of the corpus's original tokenization,
     one row per token.
     Must contain a column with character-based spans of
     the tokens.
    :param spans_df_token_col: the name of the column in ``spans_df``
     containing its tokenization. By default, ``'span'``
    :param corpus_df_token_col: the name of the column in ``corpus_toks_df``
     that contains its tokenization. By default, ``'span'``
    :param entity_type_col: the name of the column in ``spans_df`` that
     contains the entity types of the elements

    :returns: A new DataFrame with schema ``["span", "ent_type"]``,
     where the "span" column contains token-based spans based off
     the *corpus* tokenization in ``corpus_toks_df["span"]``.
    """
    if len(spans_df.index) == 0:
        return spans_df.copy()

    overlaps_df = spanner.overlap_join(
        spans_df[spans_df_token_col],
        corpus_toks_df[corpus_df_token_col],
        "span",
        "corpus_token",
    ).merge(spans_df, left_on="span", right_on=spans_df_token_col)
    agg_df = (overlaps_df.groupby("span").aggregate({
        "corpus_token": "sum",
        entity_type_col: "first"
    }).reset_index())
    cons_df = spanner.consolidate(agg_df, "corpus_token")[[
        "corpus_token", entity_type_col
    ]].rename(columns={"corpus_token": "span"})
    cons_df["span"] = TokenSpanArray.align_to_tokens(
        corpus_toks_df[corpus_df_token_col], cons_df["span"])
    return cons_df
Code example #9
File: nlu.py Project: frreiss/tep-fred
def make_span_from_entities(char_span: SpanArray,
                            entities_frame: pd.DataFrame,
                            entity_col: str = "text") -> TokenSpanArray:
    """
    Create a token span array for the entity text in the entities DataFrame, using an
    existing char span array whose tokens cover the entire analyzed text.

    :param char_span: Parsed tokens
    :param entities_frame: Entities DataFrame from `parse_response`
    :param entity_col: Column name for the entity text
    :return: TokenSpanArray for matching entities
    """
    entities = entities_frame[entity_col]
    entities_len = entities.str.len()
    begins = []
    ends = []

    i = 0
    while i < len(char_span):
        span = char_span[i]
        text = span.covered_text
        end = i
        num_tokens = 1
        stop = False
        while not stop:
            stop = True
            starts_with = entities.str.startswith(text)
            if any(starts_with):
                # Have a complete match, advance the end index
                if any(entities_len[starts_with] == len(text)):
                    end = i + num_tokens
                # Try the next token
                if i + num_tokens < len(char_span):
                    span = char_span[i + num_tokens]
                    text = text + " " + span.covered_text
                    num_tokens += 1
                    stop = False

        if i != end:
            begins.append(i)
            ends.append(end)
            i += (end - i)
        else:
            i += 1

    return TokenSpanArray(char_span, begins, ends)
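
A hedged sketch for make_span_from_entities, using the _TOKENS_ARRAY fixture defined in code example #18 below; the hand-built entities frame merely stands in for the output of parse_response, which is not shown here.

import pandas as pd

entities_frame = pd.DataFrame({"text": ["King Arthur", "Round Table"]})
entity_spans = make_span_from_entities(_TOKENS_ARRAY, entities_frame)
print(entity_spans.covered_text)  # token spans covering the matched entity text
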
Code example #10
    def test_overlaps_join(self):
        join_arg = pd.Series(
            TokenSpanArray._from_sequence(
                [
                    TokenSpan(_TOKENS_ARRAY, 23, 28),  # Knights of the Round Table
                    TokenSpan(_TOKENS_ARRAY, 17, 19),  # searching for
                    TokenSpan(_TOKENS_ARRAY, 1, 2),  # In
                    TokenSpan(_TOKENS_ARRAY, 1, 2),  # In (second copy)
                    TokenSpan(_TOKENS_ARRAY, 42, 45),  # Lancelot the Brave
                ]
            )
        )

        result1 = overlap_join(join_arg, _CAPS_WORD["match"])
        self.assertEqual(
            str(result1),
            textwrap.dedent(
                """\
                                                first                second
            0  [23, 28): 'Knights of the Round Table'   [23, 24): 'Knights'
            1  [23, 28): 'Knights of the Round Table'     [26, 27): 'Round'
            2  [23, 28): 'Knights of the Round Table'     [27, 28): 'Table'
            3                            [1, 2): 'In'          [1, 2): 'In'
            4                            [1, 2): 'In'          [1, 2): 'In'
            5          [42, 45): 'Lancelot the Brave'  [42, 43): 'Lancelot'
            6          [42, 45): 'Lancelot the Brave'     [44, 45): 'Brave'"""
            ),
        )

        result2 = overlap_join(_CAPS_WORD["match"], join_arg)
        self.assertEqual(
            str(result2),
            textwrap.dedent(
                """\
                              first                                  second
            0          [1, 2): 'In'                            [1, 2): 'In'
            1          [1, 2): 'In'                            [1, 2): 'In'
            2   [23, 24): 'Knights'  [23, 28): 'Knights of the Round Table'
            3     [26, 27): 'Round'  [23, 28): 'Knights of the Round Table'
            4     [27, 28): 'Table'  [23, 28): 'Knights of the Round Table'
            5  [42, 43): 'Lancelot'          [42, 45): 'Lancelot the Brave'
            6     [44, 45): 'Brave'          [42, 45): 'Lancelot the Brave'"""
            ),
        )
Code example #11
def _make_sentences_series(spacy_doc, tokens: SpanArray):
    """
    Subroutine of :func:`make_tokens_and_features`

    :param spacy_doc: parsed document (:class:`spacy.tokens.doc.Doc`) from a spaCy
     language model
    :param tokens: Token information for the current document as a
     :class:`SpanArray` object. Must contain the same tokens as `spacy_doc`.

    :returns: a Pandas Series containing, for each token, the token-based span of
     the (single) sentence that the token is in
    """
    num_toks = len(spacy_doc)
    # Generate the [begin, end) intervals that make up a series of spans
    begin_tokens = np.full(shape=num_toks, fill_value=-1, dtype=np.int32)
    end_tokens = np.full(shape=num_toks, fill_value=-1, dtype=np.int32)
    for sent in spacy_doc.sents:
        begin_tokens[sent.start: sent.end] = sent.start
        end_tokens[sent.start: sent.end] = sent.end
    return pd.Series(TokenSpanArray(tokens, begin_tokens, end_tokens))
Code example #12
def conll_to_bert(
    df: pd.DataFrame,
    tokenizer: Any,
    bert: Any,
    token_class_dtype: pd.CategoricalDtype,
    compute_embeddings: bool = True,
    overlap: int = 32,
    non_overlap: int = 64,
) -> pd.DataFrame:
    """
    :param df: One DataFrame from the :func:`conll_2003_to_dataframes` function,
     representing the tokens of a single document in the original tokenization.
    :param tokenizer: BERT tokenizer instance from the `transformers` library
    :param bert: PyTorch-based BERT model from the `transformers` library
    :param token_class_dtype: Pandas categorical type for representing
     token class labels, as returned by :func:`make_iob_tag_categories`
    :param compute_embeddings: True to generate BERT embeddings at each token
     position and add a column "embedding" to the returned DataFrame with
     the embeddings
    :param overlap: (optional) how much overlap there should be between adjacent
     windows for embeddings
    :param non_overlap: (optional) how much non-overlapping content there should be
     in the middle of each window, between the overlapping regions

    :returns: A version of the same DataFrame, but with BERT tokens, BERT
     embeddings for each token (if ``compute_embeddings`` is ``True``),
     and token class labels.
    """
    spans_df = conll.iob_to_spans(df)
    bert_toks_df = make_bert_tokens(df["span"].values[0].target_text,
                                    tokenizer)
    bert_token_spans = TokenSpanArray.align_to_tokens(bert_toks_df["span"],
                                                      spans_df["span"])
    bert_toks_df[["ent_iob",
                  "ent_type"]] = conll.spans_to_iob(bert_token_spans,
                                                    spans_df["ent_type"])
    bert_toks_df = conll.add_token_classes(bert_toks_df, token_class_dtype)
    if compute_embeddings:
        bert_toks_df = add_embeddings(bert_toks_df, bert, overlap, non_overlap)
    return bert_toks_df
Code example #13
File: nlu.py Project: frreiss/tep-fred
def _make_syntax_dataframes(syntax_response, original_text):
    tokens = syntax_response.get("tokens", [])
    sentence = syntax_response.get("sentences", [])

    if len(tokens) > 0:
        token_table = util.make_table(tokens)
        location_col, location_name = util.find_column(token_table, "location")
        text_col, text_name = util.find_column(token_table, "text")
        char_span = util.make_char_span(location_col, text_col, original_text)

        # Drop the location and text columns, which are duplicated in char_span
        token_table = token_table.drop([location_name, text_name])

        # Add the span columns to the DataFrames
        token_df = token_table.to_pandas()
        token_df['span'] = char_span
    else:
        char_span = None
        token_df = pd.DataFrame()

    if len(sentence) > 0:
        sentence_table = util.make_table(sentence)
        sentence_df = sentence_table.to_pandas()
        if char_span is not None:
            location_col, _ = util.find_column(sentence_table, "location")
            text_col, _ = util.find_column(sentence_table, "text")
            sentence_char_span = util.make_char_span(location_col, text_col,
                                                     original_text)
            sentence_span = TokenSpanArray.align_to_tokens(
                char_span, sentence_char_span)
            sentence_df['span'] = sentence_char_span
            sentence_df['sentence_span'] = sentence_span
    else:
        sentence_df = pd.DataFrame()

    return token_df, sentence_df
Code example #14
def iob_to_spans(
    token_features: pd.DataFrame,
    iob_col_name: str = "ent_iob",
    span_col_name: str = "span",
    entity_type_col_name: str = "ent_type",
):
    """
    Convert token tags in Inside–Outside–Beginning (IOB2) format to a series of
    `TokenSpan`s of entities. See https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_features: DataFrame of token features in the format returned by
     `make_tokens_and_features`.
    :param iob_col_name: Name of a column in `token_features` that contains the
     IOB2 tags as strings, "I", "O", or "B".
    :param span_col_name: Name of a column in `token_features` that
     contains the tokens as a `SpanArray`.
    :param entity_type_col_name: Optional name of a column in `token_features`
     that contains entity type information; or `None` if no such column exists.
    :return: A `pd.DataFrame` with the following columns:
    * `span`: Span (with token offsets) of each entity
    * `<value of entity_type_col_name>`: (optional) Entity type
    """
    # Start out with 1-token prefixes of all entities.
    begin_mask = token_features[iob_col_name] == "B"
    first_tokens = token_features[begin_mask].index
    if entity_type_col_name is None:
        entity_types = np.zeros(len(first_tokens))
    else:
        entity_types = token_features[begin_mask][entity_type_col_name]

    # Add an extra "O" tag to the end of the IOB column to simplify the logic
    # for handling the case where the document ends with an entity.
    iob_series = (token_features[iob_col_name].append(pd.Series(
        ["O"])).reset_index(drop=True))

    entity_prefixes = pd.DataFrame({
        "ent_type": entity_types,
        "begin": first_tokens,  # Inclusive
        "end": first_tokens + 1,  # Exclusive
        "next_tag": iob_series.iloc[first_tokens + 1].values,
    })

    df_list = []  # Type: List[pd.DataFrame]

    if len(entity_prefixes.index) == 0:
        # Code below needs at least one element in the list for schema
        df_list = [entity_prefixes]

    # Iteratively expand the prefixes
    while len(entity_prefixes.index) > 0:
        complete_mask = entity_prefixes["next_tag"].isin(["O", "B"])
        complete_entities = entity_prefixes[complete_mask]
        incomplete_entities = entity_prefixes[~complete_mask].copy()
        incomplete_entities["end"] = incomplete_entities["end"] + 1
        incomplete_entities["next_tag"] = iob_series.iloc[
            incomplete_entities["end"]].values
        df_list.append(complete_entities)
        entity_prefixes = incomplete_entities
    all_entities = pd.concat(df_list)

    # Sort spans by location, not length.
    all_entities.sort_values("begin", inplace=True)

    # Convert [begin, end) pairs to spans
    entity_spans_array = TokenSpanArray(
        token_features[span_col_name].values,
        all_entities["begin"].values,
        all_entities["end"].values,
    )
    if entity_type_col_name is None:
        return pd.DataFrame({"span": entity_spans_array})
    else:
        return pd.DataFrame({
            "span": entity_spans_array,
            entity_type_col_name: all_entities["ent_type"].values,
        })
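
A hedged sketch for iob_to_spans, reusing the token_features frame from the sketch after code example #2, whose "ent_iob", "ent_type", and "span" columns are already in the expected format; note that the Series.append call above assumes a pandas version that still provides it.

entities_df = iob_to_spans(token_features)
print(entities_df)  # one row per entity: a token-based span plus its entity type
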
Code example #15
def extract_dict(
    tokens: Union[SpanArray, pd.Series],
    dictionary: pd.DataFrame,
    output_col_name: str = "match",
):
    """
    Identify all matches of a dictionary on a sequence of tokens.

    :param tokens: `SpanArray` of token information, optionally wrapped in a
     `pd.Series`.
    :param dictionary: The dictionary to match, encoded as a `pd.DataFrame` in
     the format returned by `load_dict()`
    :param output_col_name: (optional) name of column of matching spans in the
     returned DataFrame

    :return: a single-column DataFrame of token ID spans of dictionary matches
    """
    # Box tokens into a pd.Series if not already boxed.
    if isinstance(tokens, SpanArray):
        tokens = pd.Series(tokens)

    # Wrap the important parts of the tokens series in a temporary dataframe.
    # noinspection PyUnresolvedReferences
    toks_tmp = pd.DataFrame({
        "token_id": tokens.index,
        "normalized_text": tokens.array.normalized_covered_text,
    })

    # Start by matching the first token.
    matches = pd.merge(dictionary,
                       toks_tmp,
                       left_on="toks_0",
                       right_on="normalized_text")
    matches.rename(columns={"token_id": "begin_token_id"}, inplace=True)
    matches_col_names = list(matches.columns)  # We'll need this later

    # Check against the remaining elements of matching dictionary entries and
    # accumulate the full set of matches as lists of begin and end token offsets
    begins_list = []
    ends_list = []
    max_entry_len = len(dictionary.columns)
    for match_len in range(1, max_entry_len):
        # print("Match len: {}".format(match_len))
        # Find matches of length match_len. Dictionary entries of this length
        # will have None in the column "toks_<match_len>".
        match_locs = pd.isna(matches["toks_{}".format(match_len)])
        # print("Completed matches:\n{}".format(matches[match_locs]))
        match_begins = matches[match_locs]["begin_token_id"].to_numpy()
        match_ends = match_begins + match_len
        begins_list.append(match_begins)
        ends_list.append(match_ends)

        # For the remaining partial matches against longer dictionary entries,
        # check the next token by merging with the tokens dataframe.
        potential_matches = matches[~match_locs].copy()
        # print("Raw potential matches:\n{}".format(potential_matches))
        potential_matches.drop("normalized_text", axis=1, inplace=True)
        potential_matches["next_token_id"] = (
            potential_matches["begin_token_id"] + match_len)
        potential_matches = pd.merge(potential_matches,
                                     toks_tmp,
                                     left_on="next_token_id",
                                     right_on="token_id")
        # print("Filtered potential matches:\n{}".format(potential_matches))
        potential_matches = potential_matches[
            potential_matches["normalized_text"] == potential_matches[
                "toks_{}".format(match_len)]]
        # The result of the join has some extra columns that we don't need.
        matches = potential_matches[matches_col_names]
    # Gather together all the sets of matches and wrap in a dataframe.
    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    result = pd.DataFrame(
        {output_col_name: TokenSpanArray(tokens.values, begins, ends)})
    # Results are sorted by number of tokens; sort by location instead.
    result["__begin"] = result[output_col_name].values.begin
    return result.sort_values("__begin")[[output_col_name]]
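
A hedged sketch for extract_dict against the _TOKENS_SERIES fixture from code example #18. The dictionary layout ("toks_0", "toks_1", ..., lower-cased entries padded with None) is inferred from the matching logic above rather than from load_dict() itself, and the trailing all-None "toks_2" column is an assumption added so that the longest entries are flushed on the final loop iteration.

import pandas as pd

dictionary = pd.DataFrame({
    "toks_0": ["round", "king"],
    "toks_1": ["table", None],
    "toks_2": [None, None],  # assumed padding column; see the note above
})
matches = extract_dict(_TOKENS_SERIES, dictionary)
print(matches)  # expect spans over "Round Table" and "King"
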
Code example #16
def spans_to_iob(
    token_spans: Union[TokenSpanArray, List[TokenSpan], pd.Series],
    span_ent_types: Union[str, Iterable, np.ndarray, pd.Series] = None
) -> pd.DataFrame:
    """
    Convert a series of `TokenSpan`s of entities to token tags in
    Inside–Outside–Beginning (IOB2) format. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_spans: An object that can be converted to a `TokenSpanArray` via
        `TokenSpanArray.make_array()`. Should contain `TokenSpan`s aligned with the
        target tokenization.
        Usually you create this array by calling `TokenSpanArray.align_to_tokens()`.
    :param span_ent_types: List of entity type strings corresponding to each of the
        elements of `token_spans`, or `None` to indicate null entity tags.
    :return: A `pd.DataFrame` with two columns:
      * "ent_iob": IOB2 tags as strings
      * "ent_type": Entity type strings (or NaN values if `span_ent_types` is `None`)
    """
    # Normalize inputs
    token_spans = TokenSpanArray.make_array(token_spans)
    if span_ent_types is None:
        span_ent_types = [None] * len(token_spans)
    elif isinstance(span_ent_types, str):
        span_ent_types = [span_ent_types] * len(token_spans)
    elif isinstance(span_ent_types, pd.Series):
        span_ent_types = span_ent_types.values

    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    # Handle an empty token span array
    if len(token_spans) == 0:
        return pd.DataFrame({
            "ent_iob": pd.Series(dtype=iob2_dtype),
            "ent_type": pd.Series(dtype="string")
        })

    # Initialize an IOB series with all 'O' entities
    iob_data = np.zeros_like(token_spans.tokens.begin, dtype=np.int64)
    iob_tags = pd.Categorical.from_codes(codes=iob_data, dtype=iob2_dtype)

    # Assign the begin tags
    iob_tags[token_spans.begin_token] = "B"

    # Fill in the remaining inside tags
    i_lengths = token_spans.end_token - (token_spans.begin_token + 1)
    i_mask = i_lengths > 0
    i_begins = token_spans.begin_token[i_mask] + 1
    i_ends = token_spans.end_token[i_mask]
    for begin, end in zip(i_begins, i_ends):
        iob_tags[begin:end] = "I"

    # Use a similar process to generate entity type tags
    ent_types = np.full(len(token_spans.tokens), None, dtype=object)
    for ent_type, begin, end in zip(span_ent_types, token_spans.begin_token,
                                    token_spans.end_token):
        ent_types[begin:end] = ent_type

    return pd.DataFrame({
        "ent_iob": iob_tags,
        "ent_type": pd.Series(ent_types, dtype="string")
    })
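
A hedged sketch for spans_to_iob, continuing from the sketch after code example #14: it maps the extracted entity spans back to per-token IOB2 tags over the same tokenization, assuming this function and iob_to_spans come from the same library version.

import pandas as pd

iob_df = spans_to_iob(entities_df["span"], entities_df["ent_type"])
print(pd.concat([token_features[["span"]], iob_df], axis=1))
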
Code example #17
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(
        ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(
        ArrowTokenSpanType.ENDS_NAME)

    # Get the tokens as a dictionary array where indices map to a list of ArrowSpanArrays
    tokens_dict_array = extension_array.storage.field(
        ArrowTokenSpanType.TOKENS_NAME)
    tokens_indices = tokens_dict_array.indices
    arrow_tokens_arrays_array = tokens_dict_array.dictionary

    # Break up the list of ArrowSpanArrays and convert back to individual SpanArrays
    tokens_arrays = []
    span_type = None
    for i in range(1, len(arrow_tokens_arrays_array.offsets)):
        start = arrow_tokens_arrays_array.offsets[i - 1].as_py()
        stop = arrow_tokens_arrays_array.offsets[i].as_py()
        arrow_tokens_array = arrow_tokens_arrays_array.values[start:stop]

        # Make an instance of ArrowSpanType
        if span_type is None:
            begins_array = arrow_tokens_array.field(ArrowSpanType.BEGINS_NAME)
            target_text_dict_array = arrow_tokens_array.field(
                ArrowSpanType.TARGET_TEXT_DICT_NAME)
            span_type = ArrowSpanType(begins_array.type,
                                      target_text_dict_array.type)

        # Re-make the Arrow extension type to convert back to a SpanArray
        tokens_array = arrow_to_span(
            pa.ExtensionArray.from_storage(span_type, arrow_tokens_array))
        tokens_arrays.append(tokens_array)

    # Map the token indices to the actual token SpanArray for each element in the TokenSpanArray
    tokens = [
        _EMPTY_SPAN_ARRAY_SINGLETON if i is None else tokens_arrays[i]
        for i in tokens_indices.to_pylist()
    ]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()

    return TokenSpanArray(tokens, token_begins, token_ends)
Code example #18
# SpaCy tokenizer (only) setup
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
_tokenizer = nlp.tokenizer

# Build up some example relations for the tests in this file
_TEXT = """
In AD 932, King Arthur and his squire, Patsy, travel throughout Britain 
searching for men to join the Knights of the Round Table. Along the way, he 
recruits Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Galahad the Pure...
"""
_TOKENS_SERIES = make_tokens(_TEXT, _tokenizer)
_TOKENS_ARRAY = _TOKENS_SERIES.array  # type: SpanArray
_TOKEN_SPANS_ARRAY = TokenSpanArray.from_char_offsets(_TOKENS_ARRAY)
_CAPS_WORD = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[A-Z][a-z]*"))
_CAPS_WORDS = extract_regex_tok(
    _TOKENS_ARRAY, regex.compile("[A-Z][a-z]*(\\s([A-Z][a-z]*))*"), 1, 2
)
_THE = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[Tt]he"))


class JoinTest(TestBase):
    def setUp(self):
        # Make it easier to see what's going on with join results
        self._prev_token_offsets_flag_value = TokenSpan.USE_TOKEN_OFFSETS_IN_REPR
        TokenSpan.USE_TOKEN_OFFSETS_IN_REPR = True

    def tearDown(self):
        # Restore TokenSpan repr formatting to avoid messing up other tests.
        TokenSpan.USE_TOKEN_OFFSETS_IN_REPR = self._prev_token_offsets_flag_value
Code example #19
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[
        ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(
        ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(
        ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(ArrowSpanType.BEGINS_NAME +
                                              "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(ArrowSpanType.ENDS_NAME +
                                              "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(
            ArrowSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(
            ArrowSpanType.ENDS_NAME)

    # Remove any trailing padding
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the SpanArray, then the TokenSpanArray
    char_span = SpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
Code example #20
def _doc_to_df(doc: List[_SentenceData], column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the column names of the
     dataframe that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned dataframe will contain *two* columns, holding IOB2 tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".
    :param space_before_punct: If `True`, add whitespace before
     punctuation characters (and after left parentheses)
     when reconstructing the text of the document.
    :return: DataFrame with the following columns:
    * `span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared
      in the original file, with no corrections applied.
    * `ent_type`: Entity type names for tokens tagged "I" or "B" in
      the `ent_iob` column; `None` everywhere else.
    * `sentence`: Token-based span of the sentence containing each token.
    * `line_num`: line number of each token in the parsed file
    """

    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # Type: List[np.ndarray]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    # Line numbers of the parsed file for each token in the doc
    doc_line_nums = []

    char_position = 0
    token_position = 0
    for sentence_num in range(len(doc)):
        sentence = doc[sentence_num]
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_before_mask = (np.zeros(len(tokens), dtype=bool)
                                if space_before_punct else
                                _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (np.zeros(len(tokens), dtype=bool)
                               if space_before_punct else
                               _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        for k in sentence.token_metadata.keys():
            meta_lists[k].extend(sentence.token_metadata[k])

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

        doc_line_nums.extend(sentence.line_nums)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = SpanArray(doc_text, begins, ends)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"span": char_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    ret["line_num"] = pd.Series(doc_line_nums)
    return ret