def arrow_to_char_span(extension_array: pa.ExtensionArray) -> CharSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowCharSpanType to
    a CharSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowCharSpanType
    :return: CharSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[ArrowCharSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    begins_array = extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME)
    ends_array = extension_array.storage.field(ArrowCharSpanType.ENDS_NAME)

    # Zero-copy convert arrays to numpy
    begins = begins_array.to_numpy()
    ends = ends_array.to_numpy()

    return CharSpanArray(target_text, begins, ends)
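
# Usage sketch (illustrative only): round-tripping a CharSpanArray through
# Arrow. The pandas-to-Arrow helper named `char_span_to_arrow` below is an
# assumption; only the Arrow-to-pandas direction is defined in this module.
#
#   spans = CharSpanArray("Hello world", np.array([0, 6]), np.array([5, 11]))
#   arrow_arr = char_span_to_arrow(spans)      # hypothetical counterpart
#   restored = arrow_to_char_span(arrow_arr)   # back to a CharSpanArray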
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(
                    ArrowCharSpanType.BEGINS_NAME + "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(
                    ArrowCharSpanType.ENDS_NAME + "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(ArrowCharSpanType.ENDS_NAME)

    # Remove any trailing padding
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the CharSpanArray, then the TokenSpanArray
    char_span = CharSpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
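
# Usage sketch (illustrative only): the serialization helper named
# `token_span_to_arrow` below is an assumption; this module only defines the
# Arrow-to-pandas direction.
#
#   char_spans = CharSpanArray("Hello world", np.array([0, 6]), np.array([5, 11]))
#   tok_spans = TokenSpanArray(char_spans, np.array([0]), np.array([2]))
#   arrow_arr = token_span_to_arrow(tok_spans)   # hypothetical counterpart
#   restored = arrow_to_token_span(arrow_arr)    # back to a TokenSpanArray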
def make_tokens(target_text: str, tokenizer) -> pd.Series:
    """
    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object
    :return: The tokens (and underlying text) as a Pandas Series wrapped
        around a `CharSpanArray` value.
    """
    spacy_doc = tokenizer(target_text)
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    return pd.Series(CharSpanArray(target_text, tok_begins, tok_ends))
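
# Usage sketch (assumes spaCy and the "en_core_web_sm" model are installed
# locally; illustrative only):
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   token_series = make_tokens("Hello, world!", nlp.tokenizer)
#   # token_series is a pd.Series of character spans, one per token.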
def _doc_to_df(doc: List[Dict[str, List[str]]],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: Tree of Python objects that represents the document;
        a list with one dictionary per sentence.
    :param space_before_punct: If `True`, add whitespace before
        punctuation characters when reconstructing the text of the document.
    :return: DataFrame with five columns:
    * `char_span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `token_span`: Span of each token, with token offsets.
      Backed by the contents of the `char_span` column.
    * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared
      in the original file, with no corrections applied.
    * `ent_type`: Entity type names for tokens tagged "I" or "B" in the
      `ent_iob` column; `None` everywhere else.
    * `sentence`: Span (with token offsets) of the sentence containing
      each token.
    """
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]
    sentences_list = []  # Type: List[np.ndarray]
    iobs_list = []  # Type: List[np.ndarray]
    entities_list = []  # Type: List[np.ndarray]
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]
    char_position = 0
    token_position = 0
    for sentence in doc:
        tokens = sentence["token"]

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _PUNCT_MATCH_FN(tokens))
        no_space_mask[0] = True  # No space before first token
        prefixes = np.where(no_space_mask, "", " ")
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths

        iobs = np.array(sentence["iob"])
        entities = np.array(sentence["entity"])

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))

        begins_list.append(b + char_position)
        ends_list.append(e + char_position)
        iobs_list.append(iobs)
        entities_list.append(entities)
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    return pd.DataFrame(
        {"char_span": char_spans,
         "token_span": token_spans,
         "ent_iob": np.concatenate(iobs_list),
         "ent_type": np.concatenate(entities_list),
         "sentence": sentence_spans})
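
# Shape of the `doc` argument expected above, reconstructed from the field
# accesses in the loop; the tokens and tags are illustrative values only:
#
#   doc = [
#       {"token": ["John", "lives", "in", "Boston", "."],
#        "iob": ["B", "O", "O", "B", "O"],
#        "entity": ["PER", None, None, "LOC", None]},
#       # ... one dictionary per sentence
#   ]
#   df = _doc_to_df(doc, space_before_punct=False)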
def _doc_to_df(doc: List[_SentenceData],
               column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
        token text. These names will be used to generate the column names of
        the dataframe that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after
        the token text should be treated as being in IOB format. If a column
        is in IOB format, the returned dataframe will contain *two* columns,
        holding IOB2 tags and entity type tags, respectively. For example, an
        input column "ent" will turn into output columns "ent_iob" and
        "ent_type".
    :param space_before_punct: If `True`, add whitespace before
        punctuation characters (and after left parentheses)
        when reconstructing the text of the document.
    :return: DataFrame with the following columns:
    * `char_span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `token_span`: Span of each token, with token offsets.
      Backed by the contents of the `char_span` column.
    * One or two columns for each entry in `column_names`, following the
      IOB2 convention described above for columns flagged in `iob_columns`.
    * `sentence`: Span (with token offsets) of the sentence containing
      each token.
    """
    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # Type: List[np.ndarray]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    char_position = 0
    token_position = 0
    for sentence_num in range(len(doc)):
        sentence = doc[sentence_num]
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_before_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        for k in sentence.token_metadata.keys():
            meta_lists[k].extend(sentence.token_metadata[k])

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"char_span": char_spans, "token_span": token_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    return ret
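
# Column-layout sketch for the function above (illustrative; `doc` would be
# built by the CoNLL reading code elsewhere in this module):
#
#   df = _doc_to_df(doc, column_names=["pos", "ent"],
#                   iob_columns=[False, True], space_before_punct=False)
#   # df would contain columns such as:
#   #   char_span, token_span, pos, ent_iob, ent_type, sentence
#   # where "ent" expands into two IOB2 columns as described in the docstring.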
def make_tokens_and_features(
        target_text: str,
        language_model,
        add_left_and_right=False,
) -> pd.DataFrame:
    """
    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model
        (`spacy.language.Language`) object
    :param add_left_and_right: If `True`, add columns "left" and "right"
        containing references to previous and next tokens.

    :return: A `pd.DataFrame` with one row per token of `target_text`. Each
        row holds the token's character and token spans plus additional
        linguistic features that the language model generates, such as lemma,
        part of speech, dependency information, and named entity IOB tags.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = CharSpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)
    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.values)
    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {spacy_doc[i].idx: i for i in range(len(spacy_doc))}
    # Define the IOB categorical type with "O" == 0, "B" == 1, "I" == 2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    df_cols = {
        "id": range(len(tok_begins)),
        "char_span": tokens_series,
        "token_span": token_spans,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use nullable int type because these columns contain nulls
        df_cols["left"] = pd.array(
            [None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype())
        df_cols["right"] = pd.array(
            list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype())
    return pd.DataFrame(df_cols)
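
# Usage sketch (assumes spaCy and the "en_core_web_sm" model are installed;
# illustrative only):
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   token_features = make_tokens_and_features(
#       "Paul lives in Boston.", nlp, add_left_and_right=True)
#   # token_features is a pd.DataFrame with one row per token and columns
#   # such as "char_span", "lemma", "pos", "ent_iob", and "sentence".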
def make_bert_tokens(target_text: str, tokenizer) -> pd.DataFrame:
    """
    Tokenize the indicated text for BERT embeddings and return a DataFrame
    with one row per token.

    :param target_text: string to tokenize
    :param tokenizer: A tokenizer that is a subclass of huggingface
        transformers PreTrainedTokenizerFast which supports `encode_plus`
        with return_offsets_mapping=True.

    :returns: A `pd.DataFrame` with the following columns:

    * "id": unique integer ID for each token
    * "char_span": span of the token with character offsets
    * "token_span": span of the token with token offsets
    * "input_id": integer ID suitable for input to a BERT embedding model
    * "token_type_id": list of token type ids to be fed to a model
    * "attention_mask": list of indices specifying which tokens should be
      attended to by the model
    * "special_tokens_mask": `True` if the token is a zero-length special
      token such as "start of document"
    """
    from transformers.tokenization_utils import PreTrainedTokenizerFast
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise TypeError("Tokenizer must be an instance of "
                        "transformers.PreTrainedTokenizerFast that supports "
                        "encode_plus with return_offsets_mapping=True.")
    tokenized_result = tokenizer.encode_plus(target_text,
                                             return_special_tokens_mask=True,
                                             return_offsets_mapping=True)

    # Get offset mapping from tokenizer
    offsets = tokenized_result["offset_mapping"]

    # Init any special tokens at beginning
    i = 0
    while offsets[i] is None:
        offsets[i] = (0, 0)
        i += 1

    # Make a DataFrame to unzip (begin, end) offsets
    offset_df = pd.DataFrame(offsets, columns=["begin", "end"])

    # Convert special tokens mask to boolean
    special_tokens_mask = pd.Series(
        tokenized_result["special_tokens_mask"]).astype("bool")

    # Fill remaining special tokens to zero-length spans
    ends = offset_df["end"].fillna(method="ffill").astype("int32")
    begins = offset_df["begin"].mask(special_tokens_mask, other=ends).astype("int32")

    # Create char and token span arrays
    char_spans = CharSpanArray(target_text, begins, ends)
    token_spans = TokenSpanArray(char_spans, np.arange(len(char_spans)),
                                 np.arange(1, len(char_spans) + 1))

    token_features = pd.DataFrame({
        "id": special_tokens_mask.index,
        # Use values instead of series because of differing indexes
        "char_span": pd.Series(char_spans).values,
        "token_span": pd.Series(token_spans).values,
        "input_id": tokenized_result["input_ids"],
        "token_type_id": tokenized_result["token_type_ids"],
        "attention_mask": tokenized_result["attention_mask"],
        "special_tokens_mask": special_tokens_mask,
    })

    return token_features
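
# Usage sketch (assumes the `transformers` package and the
# "bert-base-uncased" checkpoint are available; illustrative only):
#
#   from transformers import BertTokenizerFast
#   bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
#   token_df = make_bert_tokens("Hello, world!", bert_tokenizer)
#   # token_df has one row per BERT token, including zero-length spans for
#   # special tokens such as [CLS] and [SEP].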