def extract_split(
        doc_text: str,
        split_points: Union[Sequence[int], np.ndarray, SpanArray]) -> SpanArray:
    """
    Split a document into spans along a specified set of split points.

    :param doc_text: Text of the document; will be the target text of the
     returned spans.
    :param split_points: A series of offsets into ``doc_text``, expressed as
     either:

     * A sequence of integers (split at the indicated locations and return a
       set of splits that covers every character in the document), as a list or
       1-d Numpy array
     * A sequence of spans (split around the indicated locations, but discard
       the parts of the document that are within a split point)

    :returns: A ``SpanArray`` that splits the document in the specified way.
    """
    if isinstance(split_points, (collections.abc.Sequence, np.ndarray)):
        # Single-integer split points ==> zero-length spans
        split_points = SpanArray(doc_text, split_points, split_points)
    elif not isinstance(split_points, SpanArray):
        raise TypeError(
            f"Split points are of type {type(split_points)}. Expected a "
            f"sequence of integers or a SpanArray.")

    # Make sure split points are in order
    sorted_indices = split_points.argsort()
    sorted_split_points = split_points[sorted_indices]

    # Break out the split points.
    split_begins = sorted_split_points.begin.tolist()  # type: List[int]
    split_ends = sorted_split_points.end.tolist()  # type: List[int]

    # Tack on an additional split point at the very end to simplify the logic
    # below.
    split_begins.append(len(doc_text))
    split_ends.append(len(doc_text))

    # Walk through the document, generating the begin and end offsets of spans
    begins = []
    ends = []
    begin = 0
    for split_begin, split_end in zip(split_begins, split_ends):
        end = split_begin
        if end > begin:  # Ignore zero-length and negative-length chunks
            begins.append(begin)
            ends.append(end)
        begin = split_end
    return SpanArray(doc_text, begins, ends)
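
# Usage sketch (illustrative, not part of the library source; assumes the
# definitions in this file are importable). Integer split points partition the
# whole document, while span-valued split points drop the separator text:
import re

doc = "one two three"
print(extract_split(doc, [0, 4, 8]).covered_text)   # -> ['one ' 'two ' 'three']
seps = extract_regex(doc, re.compile(r"\s+"))       # extract_regex() is below
print(extract_split(doc, seps).covered_text)        # -> ['one' 'two' 'three']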
def lemmatize(spans: Union[pd.Series, SpanArray, Iterable[Span]],
              token_features: pd.DataFrame,
              lemma_col_name: str = "lemma",
              token_span_col_name: str = "span") -> List[str]:
    """
    Convert spans to their normal form using lemma information in a token
    features table.

    :param spans: Spans to be normalized. Each may represent zero or more
     tokens.
    :param token_features: DataFrame of token metadata. Index must be aligned
     with the token indices in `spans`.
    :param lemma_col_name: Optional custom name for the DataFrame column
     containing the lemmatized form of each token.
    :param token_span_col_name: Optional custom name for the DataFrame column
     containing the span of each token.

    :return: A list containing normalized versions of the tokens in `spans`,
     with each token separated by a single space character.
    """
    char_spans = SpanArray.make_array(spans)
    token_spans = TokenSpanArray.align_to_tokens(
        token_features[token_span_col_name], char_spans)
    ret = []  # type: List[str]

    # TODO: Vectorize this loop
    for i in range(len(token_spans)):
        lemmas = token_features[lemma_col_name][
            token_spans.begin_token[i]:token_spans.end_token[i]
        ]
        ret.append(" ".join(lemmas))
    return ret
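
# Usage sketch (illustrative; assumes a spaCy pipeline `nlp` such as
# "en_core_web_sm" and the make_tokens_and_features() helper below):
text = "The cats were sleeping."
tok_df = make_tokens_and_features(text, nlp)
mentions = SpanArray(text, [4], [8])   # the span covering "cats"
print(lemmatize(mentions, tok_df))     # -> ['cat']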
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowSpanType to a SpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get the target text from the begins field metadata and decode the string
    metadata = extension_array.storage.type[ArrowSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    begins_array = extension_array.storage.field(ArrowSpanType.BEGINS_NAME)
    ends_array = extension_array.storage.field(ArrowSpanType.ENDS_NAME)

    # Zero-copy convert arrays to numpy
    begins = begins_array.to_numpy()
    ends = ends_array.to_numpy()

    return SpanArray(target_text, begins, ends)
def make_tokens_and_features(
        target_text: str,
        language_model,
        add_left_and_right=False,
) -> pd.DataFrame:
    """
    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model
     (`spacy.language.Language`) object
    :param add_left_and_right: If ``True``, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A `pd.DataFrame` with one row per token of `target_text`, holding
     the token spans plus the additional linguistic features (lemma, part of
     speech, dependency parse, IOB entity tags, and so on) that the language
     model generates.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = SpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)
    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.array)
    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {spacy_doc[i].idx: i for i in range(len(spacy_doc))}
    # Define the IOB categorical type with "O" == 0, "B" == 1, "I" == 2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    df_cols = {
        "id": range(len(tok_begins)),
        "span": tokens_series,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use a nullable int type because these columns contain nulls
        df_cols["left"] = pd.array(
            [None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype()
        )
        df_cols["right"] = pd.array(
            list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype()
        )
    return pd.DataFrame(df_cols)
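
# Usage sketch (illustrative; assumes the spaCy model "en_core_web_sm" is
# installed):
import spacy

nlp = spacy.load("en_core_web_sm")
tok_df = make_tokens_and_features("Alice visited Berlin.", nlp,
                                  add_left_and_right=True)
print(tok_df[["span", "lemma", "pos", "ent_iob", "ent_type"]])
# One row per token; "span" is backed by a SpanArray over the input text.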
def __eq__(self, other):
    """
    Pandas/Numpy-style array/series comparison function.

    :param other: Second operand of a Pandas "==" comparison with the series
     that wraps this TokenSpanArray.

    :return: Returns a boolean mask indicating which rows match `other`.
    """
    if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
        # Rely on pandas to unbox and dispatch to us.
        return NotImplemented
    if isinstance(other, TokenSpan) and self.tokens.equals(other.tokens):
        mask = np.full(len(self), True, dtype=bool)
        mask[self.begin_token != other.begin_token] = False
        mask[self.end_token != other.end_token] = False
        return mask
    elif isinstance(other, TokenSpanArray) and self.tokens.equals(other.tokens):
        if len(self) != len(other):
            raise ValueError("Can't compare arrays of differing lengths "
                             "{} and {}".format(len(self), len(other)))
        return np.logical_and(self.begin_token == other.begin_token,
                              self.end_token == other.end_token)
    else:
        # Different tokens, no tokens, unexpected type ==> fall back on
        # superclass
        return SpanArray.__eq__(self, other)
def make_tokens(target_text: str, tokenizer) -> pd.Series:
    """
    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object
    :return: The tokens (and underlying text) as a Pandas Series wrapped around
     a `SpanArray` value.
    """
    spacy_doc = tokenizer(target_text)
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    return pd.Series(SpanArray(target_text, tok_begins, tok_ends))
def __init__(self, tokens: Any, begin_token: int, end_token: int):
    """
    :param tokens: Tokenization information about the document, including the
     target text. Must be a type that :func:`SpanArray.make_array()` can
     convert to a `SpanArray`.
    :param begin_token: Begin offset (inclusive) within the tokenized text.
    :param end_token: End offset; exclusive, one past the last token
    """
    tokens = SpanArray.make_array(tokens)
    if TokenSpan.NULL_OFFSET_VALUE != begin_token and begin_token < 0:
        raise ValueError(
            f"Begin token offset must be NULL_OFFSET_VALUE or "
            f"greater than or equal to zero (got {begin_token})")
    if TokenSpan.NULL_OFFSET_VALUE != begin_token and end_token < begin_token:
        raise ValueError(f"End must be >= begin (got {begin_token} and "
                         f"{end_token})")
    if begin_token > len(tokens):
        raise ValueError(
            f"Begin token offset of {begin_token} larger than "
            f"number of tokens ({len(tokens)})")
    if end_token > len(tokens) + 1:
        raise ValueError(f"End token offset of {end_token} larger than "
                         f"number of tokens + 1 ({len(tokens)} + 1)")
    if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
        raise ValueError(
            "Tried to create a non-null TokenSpan over an empty list of "
            "tokens.")
    if TokenSpan.NULL_OFFSET_VALUE == begin_token:
        if TokenSpan.NULL_OFFSET_VALUE != end_token:
            raise ValueError(
                f"Begin offset with special 'null' value "
                f"{TokenSpan.NULL_OFFSET_VALUE} must be paired with an end "
                f"offset of {TokenSpan.NULL_OFFSET_VALUE}")
        begin_char_off = end_char_off = Span.NULL_OFFSET_VALUE
    else:
        begin_char_off = tokens.begin[begin_token]
        end_char_off = (begin_char_off if begin_token == end_token
                        else tokens.end[end_token - 1])
    if len(tokens) == 0:
        doc_text = None
    elif not tokens.is_single_document:
        raise ValueError("Tokens must be from exactly one document.")
    else:
        doc_text = tokens.document_text
    super().__init__(doc_text, begin_char_off, end_char_off)
    self._tokens = tokens
    self._begin_token = begin_token
    self._end_token = end_token
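
# Usage sketch (illustrative): a TokenSpan is addressed by token offsets but
# also carries the character offsets of its Span superclass.
toks = SpanArray("Hello world", [0, 6], [5, 11])
s = TokenSpan(toks, 0, 2)             # tokens 0-1, i.e. characters 0-11
print(s.begin_token, s.end_token)     # -> 0 2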
def _make_entity_mentions_dataframe(
        entities: List, original_text: str,
        apply_standard_schema: bool) -> pd.DataFrame:
    """
    Unroll the records of the "mentions" element of NLU entities into a flat
    DataFrame. The schema of this DataFrame is `_entity_mentions_schema` above.

    :param entities: The "entities" section of a parsed NLU response
    :param original_text: Text of the document. This argument must be provided
     if there are entity mention spans.
    :param apply_standard_schema: Value of the eponymous argument from
     `parse_response`.
    """
    if 0 == len(entities) or "mentions" not in entities[0].keys():
        # No mentions to unroll. Return an empty DataFrame.
        return util.apply_schema(
            pd.DataFrame(columns=[e[0] for e in _entity_mentions_schema]),
            _entity_mentions_schema, apply_standard_schema)

    if original_text is None:
        raise ValueError(
            "Unable to construct target text for converting entity mentions "
            "to spans")

    # Explode out the nested records containing entity location information.
    # If there were a version of DataFrame.explode() that could handle structs,
    # we would be able to vectorize this operation. Instead we build up the
    # values one row at a time. Some columns come from the "parent" entity
    # records, and some columns come from the "child" entity mention records.
    num_parent_cols = len(_entity_mentions_parent_elems)
    parent_cols = [[] for _ in range(num_parent_cols)]
    begins = []
    ends = []
    confidences = []
    for e in entities:
        for m in e["mentions"]:
            for i in range(num_parent_cols):
                parent_elem = e[_entity_mentions_parent_names[i]]
                parent_cols[i].append(parent_elem)
            begins.append(m["location"][0])
            ends.append(m["location"][1])
            # N.B. confidence of the mention, not the entity
            confidences.append(m["confidence"])

    # Construct columns, then convert to a DataFrame
    df_cols = {
        _entity_mentions_parent_names[i]: parent_cols[i]
        for i in range(len(_entity_mentions_parent_names))
    }
    df_cols["span"] = SpanArray(original_text, begins, ends)
    df_cols["confidence"] = confidences
    return util.apply_schema(pd.DataFrame(df_cols), _entity_mentions_schema,
                             apply_standard_schema)
def from_char_offsets(tokens: Any) -> "TokenSpanArray":
    """
    Convenience factory method for wrapping the character-level spans of a
    series of tokens into single-token token-based spans.

    :param tokens: character-based offsets of the tokens, as any type that
     :func:`SpanArray.make_array()` understands.

    :return: A TokenSpanArray containing single-token spans for each of the
     tokens in `tokens`.
    """
    begin_tokens = np.arange(len(tokens))
    tokens_array = SpanArray.make_array(tokens)
    return TokenSpanArray(tokens_array, begin_tokens, begin_tokens + 1)
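
# Usage sketch (illustrative): lift character-based tokens into token-based
# spans, one single-token span per token.
toks = SpanArray("Hello world", [0, 6], [5, 11])
token_spans = TokenSpanArray.from_char_offsets(toks)
print(token_spans.begin_token)  # -> [0 1]
print(token_spans.end_token)    # -> [1 2]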
def make_tokens(target_text: str,
                tokenizer: "spacy.tokenizer.Tokenizer" = None) -> pd.Series:
    """
    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object, or None
     to use the tokenizer returned by :func:`simple_tokenizer()`
    :return: The tokens (and underlying text) as a Pandas Series wrapped around
     a `SpanArray` value.
    """
    if tokenizer is None:
        tokenizer = simple_tokenizer()
    spacy_doc = tokenizer(target_text)
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    return pd.Series(SpanArray(target_text, tok_begins, tok_ends))
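
# Usage sketch (illustrative; assumes simple_tokenizer() returns a working
# spaCy tokenizer):
tokens = make_tokens("Hello, world!")
print(tokens.array.covered_text)  # -> the surface text of each token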
def extract_regex_tok(
        tokens: Union[SpanArray, pd.Series],
        compiled_regex: regex.Regex,
        min_len=1,
        max_len=1,
        output_col_name: str = "match",
):
    """
    Identify all (possibly overlapping) matches of a regular expression that
    start and end on token boundaries.

    :param tokens: ``SpanArray`` of token information, optionally wrapped in a
     `pd.Series`.
    :param compiled_regex: Regular expression to evaluate.
    :param min_len: Minimum match length in tokens
    :param max_len: Maximum match length (inclusive) in tokens
    :param output_col_name: (optional) name of the column of matching spans in
     the returned DataFrame

    :returns: A single-column DataFrame containing a span for each match of the
     regex.
    """
    tokens = SpanArray.make_array(tokens)
    num_tokens = len(tokens)
    matches_regex_f = np.vectorize(
        lambda s: compiled_regex.fullmatch(s) is not None)

    # The built-in regex functionality of Pandas/Python does not have an
    # optimized single-pass RegexTok, so generate all the places where there
    # might be a match and run them through regex.fullmatch(). Note that this
    # approach is asymptotically inefficient if max_len is large.
    # TODO: Performance tuning for both small and large max_len
    matches_list = []
    for cur_len in range(min_len, max_len + 1):
        window_begin_toks = np.arange(0, num_tokens - cur_len + 1)
        window_end_toks = window_begin_toks + cur_len
        window_tok_spans = TokenSpanArray(tokens, window_begin_toks,
                                          window_end_toks)
        matches_list.append(
            pd.Series(window_tok_spans[matches_regex_f(
                window_tok_spans.covered_text)]))
    return pd.DataFrame({output_col_name: pd.concat(matches_list)})
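
# Usage sketch (illustrative; assumes the `regex` package and make_tokens()
# above): find two-token windows that look like "<number> <word>".
import regex

toks = make_tokens("It weighs 10 kg today")
matches = extract_regex_tok(toks, regex.compile(r"\d+\s+\w+"),
                            min_len=2, max_len=2)
print(matches["match"])  # -> a single span covering "10 kg"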
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowSpanType to a SpanArray.

    .. note:: Only supported with PyArrow >= 2.0.0

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for SpanArray is not supported with "
            "PyArrow versions < 2.0.0")
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    # NOTE: workaround for a bug in Parquet reading that strips the extension
    # type, leaving only the raw storage struct
    if pa.types.is_struct(extension_array.type):
        index_dtype = extension_array.field(ArrowSpanType.BEGINS_NAME).type
        target_text_dict_dtype = extension_array.field(
            ArrowSpanType.TARGET_TEXT_DICT_NAME).type
        extension_array = pa.ExtensionArray.from_storage(
            ArrowSpanType(index_dtype, target_text_dict_dtype),
            extension_array)

    assert pa.types.is_struct(extension_array.storage.type)

    # Create the target-text StringTable and text_ids from the dictionary array
    target_text_dict_array = extension_array.storage.field(
        ArrowSpanType.TARGET_TEXT_DICT_NAME)
    table_texts = target_text_dict_array.dictionary.to_pylist()
    string_table = StringTable.from_things(table_texts)
    text_ids = target_text_dict_array.indices.to_numpy()

    # Get the begins/ends pyarrow arrays
    begins_array = extension_array.storage.field(ArrowSpanType.BEGINS_NAME)
    ends_array = extension_array.storage.field(ArrowSpanType.ENDS_NAME)

    # Zero-copy convert arrays to numpy
    begins = begins_array.to_numpy()
    ends = ends_array.to_numpy()

    return SpanArray((string_table, text_ids), begins, ends)
def test_left_to_right(self):
    test_text = "Is it weird in here, or is it just me?"
    spans = [
        Span(test_text, 0, 3),
        Span(test_text, 2, 3),
        Span(test_text, 3, 3),
        Span(test_text, 1, 3),
        Span(test_text, 0, 4),  # index 4
        Span(test_text, 5, 7),  # index 5
        Span(test_text, 6, 9),
        Span(test_text, 8, 9),  # index 7
    ]
    df = pd.DataFrame({
        "s": SpanArray._from_sequence(spans),
        "ix": range(len(spans))
    })
    c_df = consolidate(df, on="s", how="left_to_right")
    self._assertArrayEquals(list(c_df.index), [4, 5, 7])
def __eq__(self, other):
    """
    Pandas/Numpy-style array/series comparison function.

    :param other: Second operand of a Pandas "==" comparison with the series
     that wraps this TokenSpanArray.

    :return: Returns a boolean mask indicating which rows match `other`.
    """
    if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndex)):
        # Rely on pandas to unbox and dispatch to us.
        return NotImplemented
    elif (isinstance(other, TokenSpanArray) and len(self) == len(other)
          and self.same_tokens(other)):
        return np.logical_and(self.begin_token == other.begin_token,
                              self.end_token == other.end_token)
    else:
        # Different tokens, no tokens, unexpected type ==> fall back on
        # superclass
        return SpanArray.__eq__(self, other)
def extract_regex(
        doc_text: str,
        compiled_regex: "re.Pattern"  # Double quotes for Python 3.6 compatibility
):
    """
    Identify all non-overlapping matches of a regular expression, as returned
    by ``re.Pattern.finditer()``, and return those locations as an array of
    spans.

    :param doc_text: Text of the document; will be the target text of the
     returned spans.
    :param compiled_regex: Regular expression to evaluate, compiled with either
     the ``re`` or the ``regex`` package.

    :returns: A ``SpanArray`` containing a span for each match of the regex.
    """
    begins = []
    ends = []
    for a in compiled_regex.finditer(doc_text):
        begins.append(a.start())
        ends.append(a.end())
    return SpanArray(doc_text, begins, ends)
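
# Usage sketch (illustrative):
import re

phones = extract_regex("Call 555-1234 or 555-9876.",
                       re.compile(r"\d{3}-\d{4}"))
print(phones.covered_text)  # -> ['555-1234' '555-9876']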
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to a
    TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get the target text from the begins field metadata and decode the string
    metadata = extension_array.storage.type[
        ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(
        ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(
        ArrowTokenSpanType.ENDS_NAME)

    # Check if the character spans have been split across multiple fields
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(
                    ArrowSpanType.BEGINS_NAME + "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(
                    ArrowSpanType.ENDS_NAME + "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(
            ArrowSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(
            ArrowSpanType.ENDS_NAME)

    # Remove any trailing padding
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the SpanArray, then the TokenSpanArray
    char_span = SpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
def _doc_to_df(doc: List[_SentenceData],
               column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a CoNLL-2003
    file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the column names of the
     dataframe that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB
     format, the returned dataframe will contain *two* columns, holding IOB2
     tags and entity type tags, respectively. For example, an input column
     "ent" will turn into output columns "ent_iob" and "ent_type".
    :param space_before_punct: If `True`, add whitespace before punctuation
     characters (and after left parentheses) when reconstructing the text of
     the document.

    :return: DataFrame with the following columns:

     * `span`: Span of each token, with character offsets. Backed by the
       concatenation of the tokens in the document into a single string with
       one sentence per line.
     * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared in the
       original file, with no corrections applied.
     * `ent_type`: Entity type names for tokens tagged "I" or "B" in the
       `ent_iob` column; `None` everywhere else.
     * `sentence`: Span of the sentence containing each token.
     * `line_num`: Line number of each token in the parsed file.
    """
    # Character offsets of tokens in the reconstructed document
    begins_list = []  # type: List[np.ndarray]
    ends_list = []  # type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # type: List[np.ndarray]

    # Token offsets of the sentence containing each token in the document
    sentence_begins_list = []  # type: List[np.ndarray]
    sentence_ends_list = []  # type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    # Line numbers of the parsed file for each token in the doc
    doc_line_nums = []

    char_position = 0
    token_position = 0
    for sentence_num in range(len(doc)):
        sentence = doc[sentence_num]
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_before_mask = (np.zeros(len(tokens), dtype=bool)
                                if space_before_punct
                                else _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (np.zeros(len(tokens), dtype=bool)
                               if space_before_punct
                               else _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before the first token
        no_space_after_mask[-1] = True  # No space after the last token
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        for k in sentence.token_metadata.keys():
            meta_lists[k].extend(sentence.token_metadata[k])

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

        doc_line_nums.extend(sentence.line_nums)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = SpanArray(doc_text, begins, ends)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"span": char_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    ret["line_num"] = pd.Series(doc_line_nums)
    return ret
def make_bert_tokens(target_text: str, tokenizer) -> pd.DataFrame:
    """
    Tokenize the indicated text for BERT embeddings and return a DataFrame with
    one row per token.

    :param target_text: string to tokenize
    :param tokenizer: A tokenizer that is a subclass of huggingface
     transformers PreTrainedTokenizerFast which supports `encode_plus` with
     `return_offsets_mapping=True`.

    :returns: ``pd.DataFrame`` with the following columns:

     * "token_id": unique integer ID for each token
     * "span": span of the token (with offsets measured in characters)
     * "input_id": integer ID suitable for input to a BERT embedding model
     * "token_type_id": list of token type ids to be fed to a model
     * "attention_mask": list of indices specifying which tokens should be
       attended to by the model
     * "special_tokens_mask": `True` if the token is a zero-length special
       token such as "start of document"
    """
    # noinspection PyPackageRequirements
    from transformers import PreTrainedTokenizerFast
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise TypeError("Tokenizer must be an instance of "
                        "transformers.PreTrainedTokenizerFast that supports "
                        "encode_plus with return_offsets_mapping=True.")
    tokenized_result = tokenizer.encode_plus(target_text,
                                             return_special_tokens_mask=True,
                                             return_offsets_mapping=True)

    # Get the offset mapping from the tokenizer
    offsets = tokenized_result["offset_mapping"]

    # Init any special tokens at the beginning
    i = 0
    while offsets[i] is None:
        offsets[i] = (0, 0)
        i += 1

    # Make a DataFrame to unzip the (begin, end) offsets
    offset_df = pd.DataFrame(offsets, columns=["begin", "end"])

    # Convert the special tokens mask to boolean
    special_tokens_mask = pd.Series(
        tokenized_result["special_tokens_mask"]).astype("bool")

    # Fill the remaining special tokens with zero-length spans
    ends = offset_df["end"].fillna(method="ffill").astype("int32")
    begins = offset_df["begin"].mask(special_tokens_mask,
                                     other=ends).astype("int32")

    spans = SpanArray(target_text, begins, ends)
    token_features = pd.DataFrame({
        "token_id": special_tokens_mask.index,
        "span": spans,
        "input_id": tokenized_result["input_ids"],
        "token_type_id": tokenized_result["token_type_ids"],
        "attention_mask": tokenized_result["attention_mask"],
        "special_tokens_mask": special_tokens_mask,
    })
    return token_features
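
# Usage sketch (illustrative; assumes the "bert-base-uncased" checkpoint can be
# downloaded):
from transformers import BertTokenizerFast

bert_tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
tok_df = make_bert_tokens("Hello, world!", bert_tok)
print(tok_df[tok_df["special_tokens_mask"]])
# -> rows for [CLS] and [SEP], each with a zero-length span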
def align_to_tokens(cls, tokens: Any, spans: Any):
    """
    Align a set of character or token-based spans to a specified tokenization,
    producing a `TokenSpanArray` of token-based spans.

    :param tokens: The tokens to align to, as any type that
     `SpanArray.make_array()` accepts.
    :param spans: The spans to align. These spans must all target the same text
     as `tokens`.

    :return: An array of `TokenSpan`s aligned to the tokens of `tokens`.
     Raises `ValueError` if any of the spans in `spans` doesn't start and end
     on a token boundary.
    """
    tokens = SpanArray.make_array(tokens)
    spans = SpanArray.make_array(spans)

    if not tokens.is_single_document:
        raise ValueError(
            f"Tokens cover more than one document (tokens are {tokens})")
    if not spans.is_single_document:
        raise ValueError(
            f"Spans cover more than one document (spans are {spans})")

    # Create and join temporary dataframes
    tokens_df = pd.DataFrame({
        "token_index": np.arange(len(tokens)),
        "token_begin": tokens.begin,
        "token_end": tokens.end
    })
    spans_df = pd.DataFrame({
        "span_index": np.arange(len(spans)),
        "span_begin": spans.begin,
        "span_end": spans.end
    })

    # Ignore zero-length tokens
    # TODO: Is this the right thing to do?
    tokens_df = tokens_df[tokens_df["token_begin"] != tokens_df["token_end"]]

    begin_matches = pd.merge(tokens_df, spans_df,
                             left_on="token_begin",
                             right_on="span_begin",
                             how="right", indicator=True)
    mismatched = begin_matches[begin_matches["_merge"] == "right_only"]
    if len(mismatched.index) > 0:
        raise ValueError(
            f"The following span(s) did not align with the begin offset\n"
            f"of any token:\n"
            f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

    end_matches = pd.merge(tokens_df, spans_df,
                           left_on="token_end",
                           right_on="span_end",
                           how="right", indicator=True)
    mismatched = end_matches[end_matches["_merge"] == "right_only"]
    if len(mismatched.index) > 0:
        raise ValueError(
            f"The following span(s) did not align with the end offset\n"
            f"of any token:\n"
            f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

    # Join on span index to get (begin, end) pairs.
    begins_and_ends = pd.merge(
        begin_matches[["token_index", "span_index"]],
        end_matches[["token_index", "span_index"]],
        on="span_index", suffixes=("_begin", "_end"), sort=True)

    return TokenSpanArray(tokens,
                          begins_and_ends["token_index_begin"],
                          begins_and_ends["token_index_end"] + 1)
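
# Usage sketch (illustrative): snap character-based matches onto an existing
# tokenization.
text = "red fish blue fish"
toks = SpanArray(text, [0, 4, 9, 14], [3, 8, 13, 18])
matches = SpanArray(text, [4, 14], [8, 18])   # the two occurrences of "fish"
aligned = TokenSpanArray.align_to_tokens(toks, matches)
print(aligned.begin_token, aligned.end_token)  # -> [1 3] [2 4]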
def __hash__(self):
    if self._hash is None:
        # Use the superclass hash function so that hash() and == are
        # consistent across types.
        self._hash = SpanArray.__hash__(self)
    return self._hash
        return Span.__lt__(self, other)

    @property
    def tokens(self):
        return self._tokens

    @property
    def begin_token(self):
        return self._begin_token

    @property
    def end_token(self):
        return self._end_token


_EMPTY_SPAN_ARRAY_SINGLETON = SpanArray("", [], [])

_NULL_TOKEN_SPAN_SINGLETON = TokenSpan(
    _EMPTY_SPAN_ARRAY_SINGLETON, Span.NULL_OFFSET_VALUE, Span.NULL_OFFSET_VALUE)


@pd.api.extensions.register_extension_dtype
class TokenSpanDtype(SpanDtype):
    """
    Pandas datatype for a span that represents a range of tokens within a
    target string.
    """

    @property
    def type(self):
        # The type for a single row of a column of type TokenSpan
        return TokenSpan
def tokens(self) -> SpanArray:
    return SpanArray(self._text, self._begins, self._ends)