def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME + "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(ArrowCharSpanType.ENDS_NAME + "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(ArrowCharSpanType.ENDS_NAME)

    # Remove any trailing padding
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the CharSpanArray, then the TokenSpanArray
    char_span = CharSpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
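
# --- Illustrative usage (not part of the original module) ---
# A minimal round-trip sketch for arrow_to_token_span(). It assumes that a
# companion converter named token_span_to_arrow() exists alongside this
# function; that name is an assumption, not something defined above.
def _demo_arrow_to_token_span():
    text = "Hello, world!"
    chars = CharSpanArray(text, np.array([0, 7]), np.array([5, 12]))
    tokens = TokenSpanArray(chars, np.array([0]), np.array([2]))
    # Hypothetical forward conversion to a pyarrow extension array
    extension_array = token_span_to_arrow(tokens)
    # Convert back; the result should cover the same characters as `tokens`
    return arrow_to_token_span(extension_array)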
def _make_sentences_series(spacy_doc, tokens: CharSpanArray):
    """
    Subroutine of `make_tokens_and_features()`

    :param spacy_doc: parsed document (`spacy.tokens.doc.Doc`) from a spaCy
        language model
    :param tokens: Token information for the current document as a
        `CharSpanArray` object. Must contain the same tokens as `spacy_doc`.
    :return: a Pandas Series containing, for each token, the token span of the
        (single) sentence that the token is in
    """
    num_toks = len(spacy_doc)
    # Generate the [begin, end) intervals that make up a series of spans
    begin_tokens = np.full(shape=num_toks, fill_value=-1, dtype=np.int64)
    end_tokens = np.full(shape=num_toks, fill_value=-1, dtype=np.int64)
    for sent in spacy_doc.sents:
        begin_tokens[sent.start:sent.end] = sent.start
        end_tokens[sent.start:sent.end] = sent.end
    return pd.Series(TokenSpanArray(tokens, begin_tokens, end_tokens))
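
# --- Illustrative usage (not part of the original module) ---
# Minimal sketch for _make_sentences_series(), assuming spaCy and its small
# English model "en_core_web_sm" are installed; both are assumptions here.
def _demo_make_sentences_series():
    import spacy
    nlp = spacy.load("en_core_web_sm")
    text = "I like cats. Dogs are fine too."
    spacy_doc = nlp(text)
    # Build a CharSpanArray aligned with spaCy's tokenization, the same way
    # make_tokens_and_features() does below.
    begins = np.array([t.idx for t in spacy_doc])
    ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens = CharSpanArray(text, begins, ends)
    return _make_sentences_series(spacy_doc, tokens)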
def spans_to_iob(
    token_spans: Union[TokenSpanArray, List[TokenSpan], pd.Series]
) -> pd.Series:
    """
    Convert a series of `TokenSpan`s of entities to token tags in
    Inside–Outside–Beginning (IOB2) format. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_spans: An object that can be converted to a `TokenSpanArray` via
        `TokenSpanArray.make_array()`. Should contain `TokenSpan`s aligned with the
        target tokenization. Usually you create this array by calling
        `TokenSpanArray.align_to_tokens()`.
    :return: A `pd.Series` of IOB2 tags as strings, with the series name "ent_iob".
    """
    token_spans = TokenSpanArray.make_array(token_spans)

    # Define the IOB categorical type with "O" == 0, "B" == 1, "I" == 2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    # Handle an empty token span array
    if len(token_spans) == 0:
        return pd.Series(dtype=iob2_dtype)

    # Initialize an IOB series with all 'O' entities
    iob_data = np.zeros_like(token_spans.tokens.begin, dtype=np.int64)
    iob_tags = pd.Categorical.from_codes(codes=iob_data, dtype=iob2_dtype)

    # Assign the begin tags
    iob_tags[token_spans.begin_token] = "B"

    # Fill in the remaining inside tags
    i_lengths = token_spans.end_token - (token_spans.begin_token + 1)
    i_mask = i_lengths > 0
    i_begins = token_spans.begin_token[i_mask] + 1
    i_ends = token_spans.end_token[i_mask]
    for begin, end in zip(i_begins, i_ends):
        iob_tags[begin:end] = "I"

    return pd.Series(iob_tags, name="ent_iob")
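
# --- Illustrative usage (not part of the original module) ---
# Minimal sketch for the Series-returning spans_to_iob() variant defined
# immediately above (a later variant with the same name also accepts entity
# types): tag the two-token span "New York" inside a five-token document.
def _demo_spans_to_iob():
    text = "I visited New York yesterday"
    begins = np.array([0, 2, 10, 14, 19])
    ends = np.array([1, 9, 13, 18, 28])
    tokens = CharSpanArray(text, begins, ends)
    # One entity covering tokens 2 and 3 ("New York")
    entity_spans = TokenSpanArray(tokens, np.array([2]), np.array([4]))
    return spans_to_iob(entity_spans)  # -> ["O", "O", "B", "I", "O"]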
def iob_to_spans(
    token_features: pd.DataFrame,
    iob_col_name: str = "ent_iob",
    char_span_col_name: str = "char_span",
    entity_type_col_name: str = "ent_type",
):
    """
    Convert token tags in Inside–Outside–Beginning (IOB2) format to a series of
    `TokenSpan`s of entities. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_features: DataFrame of token features in the format returned by
        `make_tokens_and_features`.
    :param iob_col_name: Name of a column in `token_features` that contains the
        IOB2 tags as strings, "I", "O", or "B".
    :param char_span_col_name: Name of a column in `token_features` that contains
        the tokens as a `CharSpanArray`.
    :param entity_type_col_name: Optional name of a column in `token_features`
        that contains entity type information; or `None` if no such column exists.
    :return: A `pd.DataFrame` with the following columns:
        * `token_span`: Span (with token offsets) of each entity
        * `<value of entity_type_col_name>`: (optional) Entity type
    """
    # Start out with 1-token prefixes of all entities.
    begin_mask = token_features[iob_col_name] == "B"
    first_tokens = token_features[begin_mask].index
    if entity_type_col_name is None:
        entity_types = np.zeros(len(first_tokens))
    else:
        entity_types = token_features[begin_mask][entity_type_col_name]

    # Add an extra "O" tag to the end of the IOB column to simplify the logic
    # for handling the case where the document ends with an entity.
    iob_series = pd.concat(
        [token_features[iob_col_name], pd.Series(["O"])]
    ).reset_index(drop=True)

    entity_prefixes = pd.DataFrame(
        {
            "ent_type": entity_types,
            "begin": first_tokens,  # Inclusive
            "end": first_tokens + 1,  # Exclusive
            "next_tag": iob_series.iloc[first_tokens + 1].values,
        }
    )

    df_list = []  # Type: List[pd.DataFrame]

    if len(entity_prefixes.index) == 0:
        # Code below needs at least one element in the list for schema
        df_list = [entity_prefixes]

    # Iteratively expand the prefixes
    while len(entity_prefixes.index) > 0:
        complete_mask = entity_prefixes["next_tag"].isin(["O", "B"])
        complete_entities = entity_prefixes[complete_mask]
        incomplete_entities = entity_prefixes[~complete_mask].copy()
        incomplete_entities["end"] = incomplete_entities["end"] + 1
        incomplete_entities["next_tag"] = iob_series.iloc[
            incomplete_entities["end"]
        ].values
        df_list.append(complete_entities)
        entity_prefixes = incomplete_entities

    all_entities = pd.concat(df_list)

    # Sort spans by location, not length.
    all_entities.sort_values("begin", inplace=True)

    # Convert [begin, end) pairs to spans
    entity_spans_array = TokenSpanArray(
        token_features[char_span_col_name].values,
        all_entities["begin"].values,
        all_entities["end"].values,
    )

    if entity_type_col_name is None:
        return pd.DataFrame({"token_span": entity_spans_array})
    else:
        return pd.DataFrame(
            {
                "token_span": entity_spans_array,
                entity_type_col_name: all_entities["ent_type"].values,
            }
        )
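
# --- Illustrative usage (not part of the original module) ---
# Minimal sketch for iob_to_spans(): recover the entity "New York" from
# per-token IOB2 and entity-type columns over a five-token document.
def _demo_iob_to_spans():
    text = "I visited New York yesterday"
    begins = np.array([0, 2, 10, 14, 19])
    ends = np.array([1, 9, 13, 18, 28])
    tokens = CharSpanArray(text, begins, ends)
    token_features = pd.DataFrame({
        "char_span": pd.Series(tokens),
        "ent_iob": ["O", "O", "B", "I", "O"],
        "ent_type": [None, None, "LOC", "LOC", None],
    })
    # Returns one row whose token_span covers tokens 2..3 ("New York")
    return iob_to_spans(token_features)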
def _doc_to_df(doc: List[Dict[str, List[str]]],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a CoNLL-2003
    file into a `pd.DataFrame` of token metadata.

    :param doc: Tree of Python objects that represents the document: a list with
        one dictionary per sentence.
    :param space_before_punct: If `True`, add whitespace before punctuation
        characters when reconstructing the text of the document.
    :return: DataFrame with five columns:
        * `char_span`: Span of each token, with character offsets. Backed by the
          concatenation of the tokens in the document into a single string with
          one sentence per line.
        * `token_span`: Span of each token, with token offsets. Backed by the
          contents of the `char_span` column.
        * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared in the
          original file, with no corrections applied.
        * `ent_type`: Entity type names for tokens tagged "I" or "B" in the
          `ent_iob` column; `None` everywhere else.
        * `sentence`: Span (with token offsets) of the sentence containing each
          token.
    """
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]
    sentences_list = []  # Type: List[str]
    iobs_list = []  # Type: List[np.ndarray]
    entities_list = []  # Type: List[np.ndarray]
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    char_position = 0
    token_position = 0
    for sentence in doc:
        tokens = sentence["token"]

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _PUNCT_MATCH_FN(tokens))
        no_space_mask[0] = True  # No space before first token
        prefixes = np.where(no_space_mask, "", " ")
        string_parts = np.ravel((prefixes, tokens), order='F')
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths

        iobs = np.array(sentence["iob"])
        entities = np.array(sentence["entity"])

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))

        begins_list.append(b + char_position)
        ends_list.append(e + char_position)
        iobs_list.append(iobs)
        entities_list.append(entities)
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    return pd.DataFrame(
        {"char_span": char_spans,
         "token_span": token_spans,
         "ent_iob": np.concatenate(iobs_list),
         "ent_type": np.concatenate(entities_list),
         "sentence": sentence_spans})
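
# --- Illustrative usage (not part of the original module) ---
# Minimal sketch for the dictionary-based _doc_to_df() variant defined
# immediately above: two hand-built sentences in the {"token", "iob", "entity"}
# layout that the function reads. space_before_punct=True keeps the example
# independent of the module-level _PUNCT_MATCH_FN helper.
def _demo_doc_to_df():
    doc = [
        {"token": ["John", "lives", "in", "Boston", "."],
         "iob": ["B", "O", "O", "B", "O"],
         "entity": ["PER", None, None, "LOC", None]},
        {"token": ["He", "likes", "it", "."],
         "iob": ["O", "O", "O", "O"],
         "entity": [None, None, None, None]},
    ]
    return _doc_to_df(doc, space_before_punct=True)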
def spans_to_iob(
    token_spans: Union[TokenSpanArray, List[TokenSpan], pd.Series],
    span_ent_types: Union[str, Iterable, np.ndarray, pd.Series] = None
) -> pd.DataFrame:
    """
    Convert a series of `TokenSpan`s of entities to token tags in
    Inside–Outside–Beginning (IOB2) format. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_spans: An object that can be converted to a `TokenSpanArray` via
        `TokenSpanArray.make_array()`. Should contain `TokenSpan`s aligned with the
        target tokenization. Usually you create this array by calling
        `TokenSpanArray.align_to_tokens()`.
    :param span_ent_types: List of entity type strings corresponding to each of the
        elements of `token_spans`, or `None` to indicate null entity tags.
    :return: A `pd.DataFrame` with two columns:
        * "ent_iob": IOB2 tags as strings
        * "ent_type": Entity type strings (or NaN values if `span_ent_types` is
          `None`)
    """
    # Normalize inputs
    token_spans = TokenSpanArray.make_array(token_spans)
    if span_ent_types is None:
        span_ent_types = [None] * len(token_spans)
    elif isinstance(span_ent_types, str):
        span_ent_types = [span_ent_types] * len(token_spans)
    elif isinstance(span_ent_types, pd.Series):
        span_ent_types = span_ent_types.values

    # Define the IOB categorical type with "O" == 0, "B" == 1, "I" == 2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    # Handle an empty token span array
    if len(token_spans) == 0:
        return pd.DataFrame({
            "ent_iob": pd.Series(dtype=iob2_dtype),
            "ent_type": pd.Series(dtype="string")
        })

    # Initialize an IOB series with all 'O' entities
    iob_data = np.zeros_like(token_spans.tokens.begin, dtype=np.int64)
    iob_tags = pd.Categorical.from_codes(codes=iob_data, dtype=iob2_dtype)

    # Assign the begin tags
    iob_tags[token_spans.begin_token] = "B"

    # Fill in the remaining inside tags
    i_lengths = token_spans.end_token - (token_spans.begin_token + 1)
    i_mask = i_lengths > 0
    i_begins = token_spans.begin_token[i_mask] + 1
    i_ends = token_spans.end_token[i_mask]
    for begin, end in zip(i_begins, i_ends):
        iob_tags[begin:end] = "I"

    # Use a similar process to generate entity type tags
    ent_types = np.full(len(token_spans.tokens), None, dtype=object)
    for ent_type, begin, end in zip(span_ent_types,
                                    token_spans.begin_token,
                                    token_spans.end_token):
        ent_types[begin:end] = ent_type

    return pd.DataFrame({
        "ent_iob": iob_tags,
        "ent_type": pd.Series(ent_types, dtype="string")
    })
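
# --- Illustrative usage (not part of the original module) ---
# Minimal sketch for the two-column spans_to_iob() variant above: the same
# "New York" example as before, now with an entity type attached.
def _demo_spans_to_iob_with_types():
    text = "I visited New York yesterday"
    tokens = CharSpanArray(text,
                           np.array([0, 2, 10, 14, 19]),
                           np.array([1, 9, 13, 18, 28]))
    entity_spans = TokenSpanArray(tokens, np.array([2]), np.array([4]))
    # Expect ent_iob == ["O", "O", "B", "I", "O"] and ent_type "LOC" on the
    # two tagged tokens.
    return spans_to_iob(entity_spans, span_ent_types=["LOC"])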
def _doc_to_df(doc: List[_SentenceData],
               column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a CoNLL-2003
    file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
        token text. These names will be used to generate the column names of the
        dataframe that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
        token text should be treated as being in IOB format. If a column is in
        IOB format, the returned dataframe will contain *two* columns, holding
        IOB2 tags and entity type tags, respectively. For example, an input
        column "ent" will turn into output columns "ent_iob" and "ent_type".
    :param space_before_punct: If `True`, add whitespace before punctuation
        characters (and after left parentheses) when reconstructing the text of
        the document.
    :return: DataFrame with the following columns:
        * `char_span`: Span of each token, with character offsets. Backed by the
          concatenation of the tokens in the document into a single string with
          one sentence per line.
        * `token_span`: Span of each token, with token offsets. Backed by the
          contents of the `char_span` column.
        * One column (or two columns, for IOB-format inputs) per metadata column,
          holding the tags exactly as they appeared in the original file, with no
          corrections applied.
        * `sentence`: Span (with token offsets) of the sentence containing each
          token.
    """
    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # Type: List[str]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    char_position = 0
    token_position = 0
    for sentence_num in range(len(doc)):
        sentence = doc[sentence_num]
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_before_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        for k in sentence.token_metadata.keys():
            meta_lists[k].extend(sentence.token_metadata[k])

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"char_span": char_spans, "token_span": token_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    return ret
def make_tokens_and_features(
    target_text: str,
    language_model,
    add_left_and_right=False,
) -> pd.DataFrame:
    """
    Tokenize a document with a spaCy language model and return the tokens plus
    the linguistic features that the model generates, one row per token.

    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model
        (`spacy.language.Language`) object
    :param add_left_and_right: If `True`, add columns "left" and "right"
        containing references to previous and next tokens.
    :return: A `pd.DataFrame` with one row per token, containing the tokens of
        the text plus additional linguistic features that the language model
        generates.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows

    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = CharSpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)

    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.values)

    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {spacy_doc[i].idx: i for i in range(len(spacy_doc))}

    # Define the IOB categorical type with "O" == 0, "B" == 1, "I" == 2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    df_cols = {
        "id": range(len(tok_begins)),
        "char_span": tokens_series,
        "token_span": token_spans,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use nullable int type because these columns contain nulls
        df_cols["left"] = pd.array(
            [None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype())
        df_cols["right"] = pd.array(
            list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype())
    return pd.DataFrame(df_cols)
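
# --- Illustrative usage (not part of the original module) ---
# Minimal sketch for make_tokens_and_features(), assuming spaCy and its small
# English model "en_core_web_sm" are installed; both are assumptions here.
def _demo_make_tokens_and_features():
    import spacy
    nlp = spacy.load("en_core_web_sm")
    token_features = make_tokens_and_features(
        "Mary visited IBM in Armonk.", nlp, add_left_and_right=True)
    # One row per token: char_span, token_span, lemma, pos, ..., sentence
    return token_features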
def make_bert_tokens(target_text: str, tokenizer) -> pd.DataFrame:
    """
    Tokenize the indicated text for BERT embeddings and return a DataFrame with
    one row per token.

    :param target_text: string to tokenize
    :param tokenizer: A tokenizer that is a subclass of huggingface transformers
        `PreTrainedTokenizerFast`, which supports `encode_plus` with
        `return_offsets_mapping=True`.
    :return: A `pd.DataFrame` with the following columns:
        * "id": unique integer ID for each token
        * "char_span": span of the token with character offsets
        * "token_span": span of the token with token offsets
        * "input_id": integer ID suitable for input to a BERT embedding model
        * "token_type_id": list of token type ids to be fed to a model
        * "attention_mask": list of indices specifying which tokens should be
          attended to by the model
        * "special_tokens_mask": `True` if the token is a zero-length special
          token such as "start of document"
    """
    from transformers import PreTrainedTokenizerFast
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise TypeError("Tokenizer must be an instance of "
                        "transformers.PreTrainedTokenizerFast that supports "
                        "encode_plus with return_offsets_mapping=True.")
    tokenized_result = tokenizer.encode_plus(target_text,
                                             return_special_tokens_mask=True,
                                             return_offsets_mapping=True)

    # Get offset mapping from tokenizer
    offsets = tokenized_result["offset_mapping"]

    # Init any special tokens at beginning
    i = 0
    while offsets[i] is None:
        offsets[i] = (0, 0)
        i += 1

    # Make a DataFrame to unzip (begin, end) offsets
    offset_df = pd.DataFrame(offsets, columns=["begin", "end"])

    # Convert special tokens mask to boolean
    special_tokens_mask = pd.Series(
        tokenized_result["special_tokens_mask"]).astype("bool")

    # Fill remaining special tokens to zero-length spans
    ends = offset_df["end"].ffill().astype("int32")
    begins = offset_df["begin"].mask(special_tokens_mask, other=ends).astype("int32")

    # Create char and token span arrays
    char_spans = CharSpanArray(target_text, begins, ends)
    token_spans = TokenSpanArray(char_spans,
                                 np.arange(len(char_spans)),
                                 np.arange(1, len(char_spans) + 1))

    token_features = pd.DataFrame({
        "id": special_tokens_mask.index,
        # Use values instead of series because the indexes differ
        "char_span": pd.Series(char_spans).values,
        "token_span": pd.Series(token_spans).values,
        "input_id": tokenized_result["input_ids"],
        "token_type_id": tokenized_result["token_type_ids"],
        "attention_mask": tokenized_result["attention_mask"],
        "special_tokens_mask": special_tokens_mask,
    })

    return token_features
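
# --- Illustrative usage (not part of the original module) ---
# Minimal sketch for make_bert_tokens(), assuming the huggingface
# "transformers" package is installed and the "bert-base-uncased" tokenizer
# files can be downloaded; both are assumptions here.
def _demo_make_bert_tokens():
    from transformers import BertTokenizerFast
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    token_features = make_bert_tokens("Hello, BERT!", tokenizer)
    # Columns include char_span, token_span, input_id, attention_mask, ...
    return token_features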