def predict_on_df(df: pd.DataFrame, id_to_class: Dict[int, str], predictor):
    """
    Run a trained model on a DataFrame of tokens with embeddings.

    :param df: DataFrame of tokens for a document, containing a column
     called "embedding" with one embedding tensor per token.
    :param id_to_class: Mapping from class ID to class name, as returned by
     :func:`text_extensions_for_pandas.make_iob_tag_categories`
    :param predictor: Python object with `predict` and `predict_proba` methods
     that accept a numpy array of embeddings.
    :returns: A copy of `df`, with the following additional columns:
     `predicted_id`, `predicted_class`, `predicted_iob`, `predicted_type`,
     and `predicted_class_pr`.
    """
    x_values = df["embedding"].values
    result_df = df.copy()
    result_df["predicted_id"] = predictor.predict(x_values)
    result_df["predicted_class"] = [
        id_to_class[i] for i in result_df["predicted_id"].values
    ]
    iobs, types = tp.io.conll.decode_class_labels(
        result_df["predicted_class"].values)
    result_df["predicted_iob"] = iobs
    result_df["predicted_type"] = types
    prob_values = predictor.predict_proba(x_values)
    result_df["predicted_class_pr"] = tp.TensorArray(prob_values)
    return result_df
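
# A minimal usage sketch (not from the original source): any scikit-learn
# style estimator with both `predict` and `predict_proba` works as the
# `predictor` argument. `tokens_df` (a token frame with an "embedding"
# column and a gold "token_class_id" column) and `id_to_class` are
# assumed to come from earlier preprocessing steps.
from sklearn.linear_model import LogisticRegression

train_x = tokens_df["embedding"].values  # 2-D array: one embedding per token
train_y = tokens_df["token_class_id"].values
model = LogisticRegression(max_iter=1000).fit(train_x, train_y)

predictions_df = predict_on_df(tokens_df, id_to_class, model)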
def collapse_time_series(df: pd.DataFrame,
                         ts_cols: Sequence[str]) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Collapse one or more time series in a dataframe into tensors.
    
    :param df: Input dataframe with time series arranged vertically.
     The dataframe must have a 2-level index, with the first level
     containing the time series ID and the second level the time of
     each row of the time series.
     All time series must be of the same length and have matching
     times at all points.
    :param ts_cols: Names of one or more columns in `df` containing
     time series data.
     **Currently, all time series must be of the same length.**
     
    :returns: Two items:
     * A transformed version of `df` in which the time series
       that were originally stored "vertically" across rows have been
       collapsed down to 1-D tensors and stored in columns of type
       `TensorType`.
     * A numpy array of the times that correspond to the elements of
       the time series 
    """
    if (not isinstance(df.index, pd.MultiIndex) or len(df.index.names) != 2):
        raise ValueError(f"Dataframe must have a 2-level index, "
                         f"with the first level containing time "
                         f"series ID and the second level position "
                         f"within the time series "
                         f"(index was {df.index}).")

    # Pass through metadata columns from the original table.
    # We assume that the first value in each time series will suffice.
    meta_cols = [c for c in df.columns if c not in ts_cols]
    result = df.groupby(df.index.names[0]).aggregate(
        {c: "first"
         for c in meta_cols})

    # Pull out the time values for the time series' points
    ts_times = df.index.levels[1].values
    ts_times_name = df.index.names[1]

    id_values = df.index.levels[0].values
    num_id_values = len(id_values)

    # TODO: Figure out why the following code doesn't work
    # result[ts_times_name] = tp.TensorArray(
    #    np.tile(ts_times, num_id_values).reshape([num_id_values, -1])
    # )

    for ts_col in ts_cols:
        # Build up a list of numpy views on the source dataframe
        to_stack = [df.loc[id_value][ts_col].values for id_value in id_values]
        # Concatenate all the views into a single new array, then
        # wrap that array in a column of our output dataframe.
        result[ts_col] = tp.TensorArray(np.stack(to_stack))

    return result, ts_times
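
# A minimal usage sketch (not from the original source): collapse a "long"
# frame holding two three-point series into one row per series.
import numpy as np
import pandas as pd

times = pd.date_range("2021-01-01", periods=3, freq="D")
index = pd.MultiIndex.from_product([["A", "B"], times], names=["id", "time"])
long_df = pd.DataFrame({"region": ["x", "x", "x", "y", "y", "y"],
                        "value": np.arange(6.0)}, index=index)

collapsed_df, ts_times = collapse_time_series(long_df, ["value"])
# collapsed_df has one row per series ID; "value" now holds 1-D tensors.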
def infer_on_df(df: pd.DataFrame,
                id_to_class_dict,
                predictor,
                iob=False,
                embeddings_col="embedding"):
    """
    Takes a dataframe containing BERT embeddings and a model trained on BERT
    embeddings, and runs inference on the dataframe. If `iob` is True, the
    predicted IOB tag and entity type are broken out from the raw probabilities.
    :param df: the document on which to perform inference; of the form output by the
     `preprocess_documents` method of this module, and containing BERT embeddings,
     references to fold and document numbers, as well as some column containing unique
     identifiers for the raw tokenization of the document (i.e. the `'raw_token_id'` field
     in output DataFrames from `preprocess_documents`)
    :param id_to_class_dict: Mapping from class ID to class name, as returned by
      :func:`text_extensions_for_pandas.make_iob_tag_categories`
    :param predictor: Python object with a `predict_proba` method that accepts a
     numpy array of embeddings.
    :param iob: if True, additional logic for IOB-formatted classes is activated,
     adding `'predicted_iob'` and `'predicted_type'` columns to the output
    :param embeddings_col: the column in `df` that contains BERT embeddings for that document
    :returns: a Pandas DataFrame, mirroring `df`, and containing three extra columns
        (plus `'predicted_iob'` and `'predicted_type'` when `iob` is True):
        *  `'predicted_id'` with the class ID predicted by the model
        *  `'predicted_class'` containing the predicted categorical value corresponding to
            `predicted_id`
        *  `'raw_output'` a TensorArray containing the raw output vectors from the model
    """
    result_df = df.copy()
    raw_outputs = tp.TensorArray(
        predictor.predict_proba(result_df[embeddings_col]))
    result_df["predicted_id"] = np.argmax(raw_outputs, axis=1)
    result_df["predicted_class"] = result_df["predicted_id"].apply(
        lambda p_id: id_to_class_dict[p_id])
    if iob:
        iobs, types = tp.io.conll.decode_class_labels(
            result_df["predicted_class"].values)
        result_df["predicted_iob"] = iobs
        result_df["predicted_type"] = types
    result_df["raw_output"] = raw_outputs

    return result_df
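
# A minimal usage sketch (assumed variable and column names): score a
# preprocessed document with a fitted classifier and break the predictions
# out into IOB tags.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000).fit(
    train_df["embedding"].values, train_df["token_class_id"].values)
scored_df = infer_on_df(doc_df, id_to_class, model, iob=True)
print(scored_df[["predicted_class", "predicted_iob", "predicted_type"]].head())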
def add_embeddings(df: pd.DataFrame, bert: Any) -> pd.DataFrame:
    """
    Add BERT embeddings to a dataframe of BERT tokens.
    
    :param df: Dataframe containing BERT tokens. Must contain a column
     "input_id" containing token IDs.
    :param bert: PyTorch-based BERT model from the `transformers` library
    :returns: A copy of `df` with a new column, "embedding", containing
     BERT embeddings as a `TensorArray`.
    """
    # Long documents are processed in overlapping windows, and the
    # per-window outputs are stitched back into a single sequence.
    _OVERLAP = 32
    _NON_OVERLAP = 64
    flat_input_ids = df["input_id"].values
    windows = tp.seq_to_windows(flat_input_ids, _OVERLAP, _NON_OVERLAP)
    bert_result = bert(input_ids=torch.tensor(windows["input_ids"]),
                       attention_mask=torch.tensor(windows["attention_masks"]))
    hidden_states = tp.windows_to_seq(flat_input_ids,
                                      bert_result[0].detach().numpy(),
                                      _OVERLAP, _NON_OVERLAP)
    embeddings = tp.TensorArray(hidden_states)
    ret = df.copy()
    ret["embedding"] = embeddings
    return ret
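
# A minimal usage sketch: load a PyTorch BERT checkpoint from `transformers`
# and attach embeddings to a token frame. The checkpoint name and `doc_text`
# are assumptions; `make_bert_tokens` produces the required "input_id" column.
import transformers

tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-uncased")
bert = transformers.BertModel.from_pretrained("bert-base-uncased")
tokens_df = tp.io.bert.make_bert_tokens(doc_text, tokenizer)
tokens_with_embeddings_df = add_embeddings(tokens_df, bert)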
def infer_and_extract_raw_entites(
    doc: pd.DataFrame,
    id_to_class_dict,
    predictor,
    raw_span_id_col="raw_span_id",
    fold_col="fold",
    doc_col="doc_num",
    agg_func=None,
    keep_cols: List[str] = None,
):
    """
    Takes a dataframe containing BERT embeddings and a model trained on BERT embeddings,
    and runs inference on the dataframe. Then, using references to the original spans,
    reconstructs the predicted value of each token of the original tokenization.
    :param doc: the document on which to perform inference; of the form output by the
     `preprocess_documents` method of this module, and containing BERT embeddings, references
     to fold and document numbers, as well as some column containing unique identifiers for
     the raw tokenization of the document
    :param id_to_class_dict: Mapping from class ID to class name, as returned by
      :func:`text_extensions_for_pandas.make_iob_tag_categories`
    :param predictor: Python object with a `predict_proba` method that accepts a
     numpy array of embeddings.
    :param raw_span_id_col: the name of the column of `doc` containing an identifier of the
      raw token that each BERT token came from
    :param fold_col: the name of the column of `doc` containing the fold of each token
    :param doc_col: the name of the column of `doc` containing the document number of each token
    :param agg_func: if specified, a function that takes a series of TensorArrays and returns a
      pandas-compatible type; used to aggregate the predictions of multiple subtokens that all
      describe the same original token. Defaults to the probability-product aggregator
      defined below.
    :param keep_cols: any columns that you wish to be carried over to the output dataframe; by
      default the columns `'token_id'` and `'raw_span'` are carried over, if they exist
    :returns: a Pandas DataFrame with one row per raw token, containing the carried-over
      columns plus `'raw_output'`, `'predicted_id'`, and `'predicted_class'`
    """
    if agg_func is None:

        def agg_func(series: pd.Series):
            # Default aggregator: predicts the probability of each class when
            # multiple subtokens are combined. Assumes independence between
            # subtoken classes, multiplies the subtokens' probability vectors
            # elementwise, then re-normalizes so the components sum to one.
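            # For example, subtoken distributions [0.6, 0.4] and [0.5, 0.5]
            # multiply to [0.30, 0.20], which re-normalizes to [0.6, 0.4].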
            vec = series.to_numpy().prod(axis=0)
            if np.sum(vec) == 0:
                # The product underflowed to zero (rare). Redo the computation
                # in log space and normalize there instead.
                mat = np.log2(series.to_numpy())
                vec = mat.sum(axis=0)
                vec -= np.logaddexp2.reduce(vec)
                return tp.TensorArray(np.exp2(vec))

            return tp.TensorArray(vec / np.sum(vec))

    # build aggregation fields
    keep_cols = (keep_cols if keep_cols is not None else [
        "fold",
        "doc_num",
        "token_id",
        "raw_span",
    ])
    sort_cols = [
        col for col in [fold_col, doc_col, raw_span_id_col]
        if col in doc.columns
    ]
    keep_cols = [
        c for c in keep_cols if c in doc.columns and c not in sort_cols
    ]  # filter out cols not in df
    aggby = {k: "first" for k in keep_cols}
    aggby["raw_output"] = agg_func
    df = doc[["embedding"] + keep_cols + sort_cols].copy()
    # first, run inference
    df.loc[:, "raw_output"] = tp.TensorArray(
        predictor.predict_proba(df["embedding"]))
    # group BERT subtokens by the raw token they came from
    groupby = df.groupby(sort_cols)
    results_df = groupby.agg(aggby).reset_index().sort_values(sort_cols)
    # translate the aggregated probabilities back into class IDs and names
    results_df["predicted_id"] = results_df.raw_output.apply(
        lambda s: np.array(s).argmax())

    results_df["predicted_class"] = results_df["predicted_id"].apply(
        lambda p_id: id_to_class_dict[p_id])
    return results_df
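
# A minimal usage sketch (assumed variable names): collapse subtoken
# predictions back onto the original tokenization using the default
# probability-product aggregator defined above.
results_df = infer_and_extract_raw_entites(doc_df, id_to_class, model)
print(results_df[["raw_span", "predicted_class"]].head())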