def arrow_to_char_span(extension_array: pa.ExtensionArray) -> CharSpanArray:
    """
    Build a CharSpanArray from a pyarrow.ExtensionArray whose type is
    ArrowCharSpanType.

    A pyarrow.ChunkedArray is also accepted provided it does not hold more
    than one chunk; in that case its first chunk is converted.

    :param extension_array: pyarrow.ExtensionArray with type ArrowCharSpanType
    :return: CharSpanArray built from the target text and the begin/end
     offsets stored in the Arrow struct
    :raises ValueError: if a ChunkedArray with more than one chunk is passed
    """
    is_chunked = isinstance(extension_array, pa.ChunkedArray)
    if is_chunked and extension_array.num_chunks > 1:
        raise ValueError(
            "Only pyarrow.Array with a single chunk is supported")
    if is_chunked:
        extension_array = extension_array.chunk(0)

    storage = extension_array.storage
    storage_type = storage.type
    assert pa.types.is_struct(storage_type)

    # The target text is carried as metadata on the "begins" field and may
    # arrive as bytes, in which case it is decoded to str.
    field_metadata = storage_type[ArrowCharSpanType.BEGINS_NAME].metadata
    text = field_metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    text = text.decode() if isinstance(text, bytes) else text

    # Pull the offset columns out of the struct and hand them to numpy.
    begins = storage.field(ArrowCharSpanType.BEGINS_NAME).to_numpy()
    ends = storage.field(ArrowCharSpanType.ENDS_NAME).to_numpy()

    return CharSpanArray(text, begins, ends)
Example #2
0
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Build a TokenSpanArray from a pyarrow.ExtensionArray whose type is
    ArrowTokenSpanType.

    A pyarrow.ChunkedArray is also accepted provided it does not hold more
    than one chunk; in that case its first chunk is converted.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray backed by a CharSpanArray rebuilt from the
     character offsets stored alongside the token offsets
    :raises ValueError: if a ChunkedArray with more than one chunk is passed
    """
    is_chunked = isinstance(extension_array, pa.ChunkedArray)
    if is_chunked and extension_array.num_chunks > 1:
        raise ValueError("Only pyarrow.Array with a single chunk is supported")
    if is_chunked:
        extension_array = extension_array.chunk(0)

    storage = extension_array.storage
    assert pa.types.is_struct(storage.type)

    # The target text is carried as metadata on the token "begins" field and
    # may arrive as bytes, in which case it is decoded to str.
    begins_field = storage.type[ArrowTokenSpanType.BEGINS_NAME]
    text = begins_field.metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    text = text.decode() if isinstance(text, bytes) else text

    # Token-level offsets
    token_begins_arrow = storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_arrow = storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Character-level offsets: either a single pair of fields, or several
    # numbered pieces ("<name>_0", "<name>_1", ...) that get stitched together.
    split_count = extension_array.type.num_char_span_splits
    if split_count <= 0:
        char_begins_arrow = storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_arrow = storage.field(ArrowCharSpanType.ENDS_NAME)
    else:
        begin_pieces = []
        end_pieces = []
        for split_idx in range(split_count):
            suffix = "_{}".format(split_idx)
            begin_pieces.append(
                storage.field(ArrowCharSpanType.BEGINS_NAME + suffix))
            end_pieces.append(
                storage.field(ArrowCharSpanType.ENDS_NAME + suffix))
        char_begins_arrow = pa.concat_arrays(begin_pieces)
        char_ends_arrow = pa.concat_arrays(end_pieces)

    # Nulls are treated as padding at the tail of the character offsets and
    # are sliced off, each array by its own null count.
    # NOTE(review): if the ends array ever had zero nulls while the begins
    # array had some, `[:-0]` would yield an empty array -- presumably both
    # are always padded equally; confirm against the writer side.
    begins_padding = char_begins_arrow.null_count
    if begins_padding > 0:
        ends_padding = char_ends_arrow.null_count
        char_begins_arrow = char_begins_arrow[:-begins_padding]
        char_ends_arrow = char_ends_arrow[:-ends_padding]

    # Hand all four offset columns to numpy
    token_begins = token_begins_arrow.to_numpy()
    token_ends = token_ends_arrow.to_numpy()
    char_begins = char_begins_arrow.to_numpy()
    char_ends = char_ends_arrow.to_numpy()

    # The token spans are layered on top of the character spans
    return TokenSpanArray(CharSpanArray(text, char_begins, char_ends),
                          token_begins, token_ends)
Example #3
0
def make_tokens(target_text: str, tokenizer) -> pd.Series:
    """
    Tokenize a string and return the character spans of its tokens.

    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object
    :return: The tokens (and underlying text) as a Pandas Series wrapped around
        a `CharSpanArray` value.
    """
    begin_offsets = []
    end_offsets = []
    for token in tokenizer(target_text):
        # `idx` is the token's starting character offset in the text
        begin_offsets.append(token.idx)
        end_offsets.append(token.idx + len(token))
    spans = CharSpanArray(target_text,
                          np.array(begin_offsets),
                          np.array(end_offsets))
    return pd.Series(spans)
def _doc_to_df(doc: List[Dict[str, List[str]]],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: Tree of Python objects that represents the document,
     List with one dictionary per sentence. Each dictionary is read under
     the keys "token", "iob" and "entity".
    :param space_before_punct: If `True`, add whitespace before
     punctuation characters when reconstructing the text of the document.
    :return: DataFrame with five columns:
    * `char_span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `token_span`: Span of each token, with token offsets.
      Backed by the contents of the `char_span` column.
    * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared
      in the original file, with no corrections applied.
    * `ent_type`: Entity type names for tokens tagged "I" or "B" in
      the `ent_iob` column; `None` everywhere else.
    * `sentence`: Span, with token offsets, of the sentence that contains
      each token.
    """
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]
    sentences_list = []  # Type: List[str]
    iobs_list = []  # Type: List[np.ndarray]
    entities_list = []  # Type: List[np.ndarray]
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Running offsets of the current sentence within the whole document
    char_position = 0
    token_position = 0
    for sentence in doc:
        tokens = sentence["token"]

        # Don't put spaces before punctuation in the reconstituted string.
        # Use the builtin `bool`: the `np.bool` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        no_space_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _PUNCT_MATCH_FN(tokens))
        no_space_mask[0] = True  # No space before first token
        prefixes = np.where(no_space_mask, "", " ")
        # Column-major ravel interleaves the two rows:
        # prefix0, token0, prefix1, token1, ...
        string_parts = np.ravel((prefixes, tokens), order='F')
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        iobs = np.array(sentence["iob"])
        entities = np.array(sentence["entity"])

        # Every token of the sentence carries the same sentence span,
        # expressed as [first token index, one past last token index).
        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))

        # Shift sentence-relative character offsets to document offsets
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)
        iobs_list.append(iobs)
        entities_list.append(entities)
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    # Each token is also exposed as a single-token span over `char_spans`
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))
    return pd.DataFrame(
        {"char_span": char_spans,
         "token_span": token_spans,
         "ent_iob": np.concatenate(iobs_list),
         "ent_type": np.concatenate(entities_list),
         "sentence": sentence_spans})
def _doc_to_df(doc: List[_SentenceData], column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the names of the dataframe
     that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned dataframe will contain *two* columns, holding IOB2 tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".

    :param space_before_punct: If `True`, add whitespace before
     punctuation characters (and after left parentheses)
     when reconstructing the text of the document.
    :return: DataFrame with the following columns, in this order:
    * `char_span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `token_span`: Span of each token, with token offsets.
      Backed by the contents of the `char_span` column.
    * One column per entry of the metadata dictionary produced by
      `_make_empty_meta_values(column_names, iob_columns)`, filled with the
      per-token values from each sentence's `token_metadata`.
    * `sentence`: Span, with token offsets, of the sentence that contains
      each token.
    """

    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # Type: List[str]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    # Running offsets of the current sentence within the whole document
    char_position = 0
    token_position = 0
    for sentence in doc:
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        # Use the builtin `bool`: the `np.bool` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        no_space_before_mask = (np.zeros(len(tokens), dtype=bool)
                                if space_before_punct else
                                _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (np.zeros(len(tokens), dtype=bool)
                               if space_before_punct else
                               _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        # "No space after token i" means "no prefix before token i + 1", so
        # shift the mask right by one to line it up with the prefixes.
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        # Column-major ravel interleaves the two rows:
        # prefix0, token0, prefix1, token1, ...
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        # Every token of the sentence carries the same sentence span,
        # expressed as [first token index, one past last token index).
        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        for k, values in sentence.token_metadata.items():
            meta_lists[k].extend(values)

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    # Each token is also exposed as a single-token span over `char_spans`
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"char_span": char_spans, "token_span": token_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    return ret
Example #6
0
def make_tokens_and_features(
    target_text: str,
    language_model,
    add_left_and_right=False,
) -> pd.DataFrame:
    """
    Run a spaCy language model over a string and return one row per token.

    :param target_text: Text to analyze

    :param language_model: Preconfigured spaCy language model (`spacy.language.Language`)
     object

    :param add_left_and_right: If `True`, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A single `pd.DataFrame` with one row per token, holding the
     tokens of the text plus additional linguistic features that the language
     model generates. Columns: "id", "char_span", "token_span", "lemma",
     "pos", "tag", "dep", "head", "shape", "ent_iob", "ent_type", "is_alpha",
     "is_stop" and "sentence", plus "left" and "right" when
     `add_left_and_right` is `True`.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = CharSpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)
    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.values)
    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {t.idx: i for i, t in enumerate(spacy_doc)}
    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    num_tokens = len(tok_begins)
    df_cols = {
        "id": range(num_tokens),
        "char_span": tokens_series,
        "token_span": token_spans,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        # Row ID of each token's syntactic head
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use nullable int type because these columns contain nulls:
        # the first token has no left neighbor, the last has no right one.
        df_cols["left"] = pd.array([None] + list(range(num_tokens - 1)),
                                   dtype=pd.Int32Dtype())
        df_cols["right"] = pd.array(list(range(1, num_tokens)) + [None],
                                    dtype=pd.Int32Dtype())
    return pd.DataFrame(df_cols)
def make_bert_tokens(target_text: str, tokenizer) -> pd.DataFrame:
    """
    Tokenize the indicated text for BERT embeddings and return a DataFrame
    with one row per token.

    :param target_text: string to tokenize
    :param tokenizer: A tokenizer that is a subclass of huggingface transformers
                      PreTrainedTokenizerFast which supports `encode_plus` with
                      return_offsets_mapping=True.

    :returns: A `pd.DataFrame` with the following columns:
     * "id": unique integer ID for each token
     * "char_span": span of the token with character offsets
     * "token_span": span of the token with token offsets
     * "input_id": integer ID suitable for input to a BERT embedding model
     * "token_type_id": list of token type ids to be fed to a model
     * "attention_mask": list of indices specifying which tokens should be
                         attended to by the model
     * "special_tokens_mask": `True` if the token is a zero-length special token
       such as "start of document"
    :raises TypeError: if `tokenizer` is not a `PreTrainedTokenizerFast`
    """
    # Prefer the package-root export; fall back to the module path the
    # original code used.
    # NOTE(review): which transformers releases expose the class at which
    # path should be confirmed against the pinned version.
    try:
        from transformers import PreTrainedTokenizerFast
    except ImportError:
        from transformers.tokenization_utils import PreTrainedTokenizerFast

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise TypeError("Tokenizer must be an instance of "
                        "transformers.PreTrainedTokenizerFast that supports "
                        "encode_plus with return_offsets_mapping=True.")
    tokenized_result = tokenizer.encode_plus(target_text,
                                             return_special_tokens_mask=True,
                                             return_offsets_mapping=True)

    # Get offset mapping from tokenizer
    offsets = tokenized_result["offset_mapping"]

    # Init any special tokens at beginning. The bounds check keeps the scan
    # from running off the end when every offset is None.
    i = 0
    while i < len(offsets) and offsets[i] is None:
        offsets[i] = (0, 0)
        i += 1

    # Make a DataFrame to unzip (begin, end) offsets
    offset_df = pd.DataFrame(offsets, columns=["begin", "end"])

    # Convert special tokens mask to boolean
    special_tokens_mask = pd.Series(
        tokenized_result["special_tokens_mask"]).astype("bool")

    # Fill remaining special tokens to zero-length spans: a missing end is
    # carried forward from the previous token, and the begin of every special
    # token is set to its end.
    ends = offset_df["end"].ffill().astype("int32")
    begins = offset_df["begin"].mask(special_tokens_mask,
                                     other=ends).astype("int32")

    # Create char and token span arrays
    char_spans = CharSpanArray(target_text, begins, ends)
    token_spans = TokenSpanArray(char_spans, np.arange(len(char_spans)),
                                 np.arange(1,
                                           len(char_spans) + 1))

    token_features = pd.DataFrame({
        "id":
        special_tokens_mask.index,
        # Use values instead of series because different indexes
        "char_span":
        pd.Series(char_spans).values,
        "token_span":
        pd.Series(token_spans).values,
        "input_id":
        tokenized_result["input_ids"],
        "token_type_id":
        tokenized_result["token_type_ids"],
        "attention_mask":
        tokenized_result["attention_mask"],
        "special_tokens_mask":
        special_tokens_mask,
    })

    return token_features