def predict_on_df(df: pd.DataFrame, id_to_class: Dict[int, str], predictor):
    """
    Apply a trained classifier to the token embeddings of one document.

    :param df: DataFrame of tokens for a document. Must contain a column
     called "embedding" with one embedding per token.
    :param id_to_class: Mapping from class ID to class name, as returned by
     :func:`text_extensions_for_pandas.make_iob_tag_categories`
    :param predictor: Python object with `predict` and `predict_proba`
     methods; both are called with the values of the "embedding" column.

    :returns: A copy of `df`, with the following additional columns:
     `predicted_id`, `predicted_class`, `predicted_iob`, `predicted_type`
     and `predicted_class_pr`.
    """
    features = df["embedding"].values
    annotated = df.copy()

    # Hard predictions first: class IDs, then the matching class names.
    annotated["predicted_id"] = predictor.predict(features)
    predicted_ids = annotated["predicted_id"].values
    annotated["predicted_class"] = [
        id_to_class[class_id] for class_id in predicted_ids
    ]

    # Split every class name into its IOB tag and its entity type.
    iob_tags, entity_types = tp.io.conll.decode_class_labels(
        annotated["predicted_class"].values)
    annotated["predicted_iob"] = iob_tags
    annotated["predicted_type"] = entity_types

    # Finally attach the per-class probabilities as a tensor column.
    class_probabilities = predictor.predict_proba(features)
    annotated["predicted_class_pr"] = tp.TensorArray(class_probabilities)
    return annotated
def collapse_time_series(df: pd.DataFrame,
                         ts_cols: Sequence[str]) -> pd.DataFrame:
    """
    Collapse one or more time series in a dataframe into tensors.

    :param df: Input dataframe with time series arranged vertically.
     The dataframe must have a 2-level index, with the first level
     containing time series ID and the second level the time for each row
     of the time series.
     All time series must be of the same length and have matching times at
     all points.
    :param ts_cols: Names of one or more columns in `df` containing time
     series data.
     **Currently, all time series must be of the same length.**

    :returns: Two items (a tuple, even though the annotation only names the
     first element):

     * A transformed version of `df` with one row per time series ID, in
       which the time series that were originally stored "vertically"
       across rows have been collapsed down to 1-D tensors and stored in
       `tp.TensorArray` columns. All other columns keep the first value
       found for each ID.
     * A numpy array with the values of the second index level, i.e. the
       times that correspond to the elements of the time series.

    :raises ValueError: if `df` does not have a 2-level `MultiIndex`.
    """
    if not isinstance(df.index, pd.MultiIndex) or len(df.index.names) != 2:
        raise ValueError(f"Dataframe must have a 2-level index, "
                         f"with the first level containing time "
                         f"series ID and the second level position "
                         f"within the time series "
                         f"(index was {df.index}).")

    # Pass through metadata columns from the original table.
    # We assume that the first value in each time series will suffice.
    meta_cols = [c for c in df.columns if c not in ts_cols]
    result = df.groupby(df.index.names[0]).aggregate(
        {c: "first" for c in meta_cols})

    # Pull out the time values for the time series' points
    times = df.index.levels[1].values

    # TODO: Figure out why the following code doesn't work
    # result[ts_times_name] = tp.TensorArray(
    #    np.tile(ts_times, num_id_values).reshape([num_id_values, -1])
    # )

    # Walk the IDs in the order of the rows of `result`. The tensor columns
    # below are assigned positionally, so taking the IDs from the grouped
    # frame (instead of from `df.index.levels[0]`, which may hold unused or
    # differently ordered values) keeps every tensor on the row of its ID.
    id_values = result.index.values
    for ts_col in ts_cols:
        # Build up a list of numpy views on the source dataframe
        to_stack = [df.loc[id_value][ts_col].values for id_value in id_values]

        # Concatenate all the views into a single new array, then
        # wrap that array in a column of our output dataframe.
        result[ts_col] = tp.TensorArray(np.stack(to_stack))

    return result, times
def agg_func(series: pd.Series):
    """
    Combine the class-probability vectors of several sub-tokens into one.

    Assumes independence between sub-token classes: the probability of all
    sub-tokens sharing a class is the element-wise product of their
    probability vectors. The product is then re-normalized so that its
    components sum to one again.

    :param series: Series of per-sub-token probability vectors; its
     `to_numpy()` result is reduced along axis 0.

    :returns: A `tp.TensorArray` holding the normalized combined vector,
     on both the regular and the underflow code path.
    """
    probabilities = series.to_numpy()
    combined = probabilities.prod(axis=0)
    total = np.sum(combined)
    if total == 0:
        # If we underflow (only happens in rare cases), redo the whole
        # computation in log space and normalize there.
        log_combined = np.log2(probabilities).sum(axis=0)
        log_combined -= np.logaddexp2.reduce(log_combined)
        # Wrap the result exactly like the regular path does, so callers
        # always receive the same type.
        return tp.TensorArray(np.exp2(log_combined))
    return tp.TensorArray(combined / total)
def infer_on_df(df: pd.DataFrame, id_to_class_dict, predictor, iob=False,
                embeddings_col="embedding"):
    """
    Run a model trained on BERT embeddings over a dataframe that contains
    BERT embeddings. When `iob` is set, the predicted class names are
    additionally broken out into IOB tag and entity type.

    :param df: the document on which to perform inference; of the form
     output by the `preprocess_documents` method of this module, and
     containing BERT embeddings, references to fold and document numbers,
     as well as some column containing unique identifiers for the raw
     tokenization of the document (i.e. `'raw_token_id'` field in output
     DataFrames from `preprocess_documents`)
    :param id_to_class_dict: Mapping from class ID to class name, as
     returned by :func:`text_extensions_for_pandas.make_iob_tag_categories`
    :param predictor: Python object with a `predict_proba` method that
     accepts the embeddings column of `df`.
    :param iob: a boolean value; when true, the columns `'predicted_iob'`
     and `'predicted_type'` are added as well
    :param embeddings_col: the column in `df` that contains BERT embeddings
     for that document

    :returns: a Pandas DataFrame, mirroring `df`, and containing these
     extra columns:

     * `'predicted_id'` with the ID of the class that received the highest
       score from the model
     * `'predicted_class'` containing the class name corresponding to
       `predicted_id`
     * `'predicted_iob'` and `'predicted_type'` (only if `iob` is true)
     * `'raw_output'` a TensorArray containing the raw output vectors from
       the model
    """

    def _class_name(class_id):
        return id_to_class_dict[class_id]

    annotated = df.copy()
    model_output = predictor.predict_proba(annotated[embeddings_col])
    probabilities = tp.TensorArray(model_output)

    # The predicted class is the one with the largest raw output.
    annotated["predicted_id"] = np.argmax(probabilities, axis=1)
    annotated["predicted_class"] = annotated["predicted_id"].apply(_class_name)

    if iob:
        iob_tags, entity_types = tp.io.conll.decode_class_labels(
            annotated["predicted_class"].values)
        annotated["predicted_iob"] = iob_tags
        annotated["predicted_type"] = entity_types

    annotated["raw_output"] = probabilities
    return annotated
def add_embeddings(df: pd.DataFrame, bert: Any) -> pd.DataFrame:
    """
    Compute BERT embeddings for a dataframe of BERT tokens.

    :param df: Dataframe containing BERT tokens. Must contain a column
     "input_id" containing token IDs.
    :param bert: PyTorch-based BERT model from the `transformers` library

    :returns: A copy of `df` with a new column, "embedding" containing
     BERT embeddings as a `TensorArray`.
    """
    # Window geometry handed to both `tp.seq_to_windows` and
    # `tp.windows_to_seq`; the two calls must agree on these values.
    overlap = 32
    non_overlap = 64

    token_ids = df["input_id"].values
    windows = tp.seq_to_windows(token_ids, overlap, non_overlap)

    input_ids = torch.tensor(windows["input_ids"])
    attention_mask = torch.tensor(windows["attention_masks"])
    model_output = bert(input_ids=input_ids, attention_mask=attention_mask)

    # The first element of the model output is converted to numpy and
    # stitched back into one flat sequence aligned with `token_ids`.
    windowed_states = model_output[0].detach().numpy()
    flat_states = tp.windows_to_seq(token_ids, windowed_states,
                                    overlap, non_overlap)

    with_embeddings = df.copy()
    with_embeddings["embedding"] = tp.TensorArray(flat_states)
    return with_embeddings
def infer_and_extract_raw_entites(
    doc: pd.DataFrame,
    id_to_class_dict,
    predictor,
    raw_span_id_col="raw_span_id",
    fold_col="fold",
    doc_col="doc_num",
    agg_func=None,
    keep_cols: List[str] = None,
):
    """
    Takes a dataframe containing BERT embeddings and a model trained on
    BERT embeddings, and runs inference on the dataframe. Then, using
    references to the original spans, reconstructs the predicted value of
    each token of the original tokenization.

    :param doc: the document on which to perform inference; of the form
     output by the `preprocess_documents` method of this module, and
     containing BERT embeddings (in a column called "embedding"),
     references to fold and document numbers, as well as some column
     containing unique identifiers for the raw tokenization of the document
    :param id_to_class_dict: Mapping from class ID to class name, as
     returned by :func:`text_extensions_for_pandas.make_iob_tag_categories`
    :param predictor: Python object with a `predict_proba` method that
     accepts the "embedding" column of `doc`.
    :param raw_span_id_col: the name of the column of `doc` containing some
     identifier of the raw token that each BERT token came from.
    :param fold_col: the name of the column of `doc` containing the fold of
     each token
    :param doc_col: the name of the column of `doc` containing the document
     number of each token
    :param agg_func: if specified, a function that takes in a series of
     tensorArrays and returns a pandas-compatible type; used to aggregate
     the predictions of multiple subtokens when multiple subtokens all
     describe the same original token. By default the probability vectors
     are multiplied element-wise and re-normalized.
    :param keep_cols: any columns that you wish to be carried over to the
     output dataframe (first value per group). Defaults to `"fold"`,
     `"doc_num"`, `"token_id"` and `"raw_span"`. Columns that are missing
     from `doc`, or that are already used as grouping columns, are dropped
     from this list.

    :returns: a DataFrame with one row per combination of the grouping
     columns (`fold_col`, `doc_col`, `raw_span_id_col`, as far as present in
     `doc`), sorted by those columns, holding the carried-over columns plus
     `'raw_output'` (aggregated model output), `'predicted_id'` and
     `'predicted_class'`.
    """
    if agg_func is None:

        def agg_func(series: pd.Series):
            # Util function for predicting the probabilities of each class
            # when multiple sub-tokens are combined. Assumes independence
            # between subtoken classes and calculates the probabilities of
            # all subtokens being the same class, then re-normalizes so the
            # vector components sum to one again.
            probabilities = series.to_numpy()
            combined = probabilities.prod(axis=0)
            total = np.sum(combined)
            if total == 0:
                # If we underflow (only happens in rare cases), redo the
                # computation in log space and normalize there.
                log_combined = np.log2(probabilities).sum(axis=0)
                log_combined -= np.logaddexp2.reduce(log_combined)
                # Return the same type as the regular path below.
                return tp.TensorArray(np.exp2(log_combined))
            return tp.TensorArray(combined / total)

    # build aggregation fields
    if keep_cols is None:
        keep_cols = ["fold", "doc_num", "token_id", "raw_span"]
    sort_cols = [
        col for col in [fold_col, doc_col, raw_span_id_col]
        if col in doc.columns
    ]
    # filter out cols not in df, and cols that are already grouping keys
    keep_cols = [
        c for c in keep_cols if c in doc.columns and c not in sort_cols
    ]
    aggby = {k: "first" for k in keep_cols}
    aggby["raw_output"] = agg_func

    df = doc[["embedding"] + keep_cols + sort_cols].copy()

    # first, run inference
    df.loc[:, "raw_output"] = tp.TensorArray(
        predictor.predict_proba(df["embedding"]))

    # group by original tag
    groupby = df.groupby(sort_cols)
    results_df = groupby.agg(aggby).reset_index().sort_values(sort_cols)

    # repeat translation from raw output to class ID and class name
    results_df["predicted_id"] = results_df.raw_output.apply(
        lambda s: np.array(s).argmax())
    results_df["predicted_class"] = results_df["predicted_id"].apply(
        lambda p_id: id_to_class_dict[p_id])
    return results_df