Example #1
0
def process_ordinal_input(data,
                          key,
                          drop_percent=None,
                          drop_val=np.nan,
                          nac=False,
                          _print=print):
    '''Label-encode one categorical column of `data` into ordinal codes.

    The rows selected by `get_non_drop` (the non-dropped / non-NaN rows)
    have their `key` column replaced by integer codes from a freshly
    fitted sklearn LabelEncoder, cast to the pandas 'category' dtype,
    and are then merged back into `data` via `put_non_drop_back`.

    Parameters
    ----------
    data : pandas DataFrame
        BPt formatted df. Must contain a column with `key`

    key : str
        Name of the column within `data` to encode.

    drop_percent : float or None
        If truthy, `cat_drop_by_percent` is applied to `data` with this
        percent before encoding. None (or 0) skips that step.

        (default = None)

    drop_val : NaN or int
        Value marking a dropped row; forwarded to both
        `cat_drop_by_percent` and `get_non_drop`.

        (default = np.nan)

    nac : bool
        If True, guarantee that the returned encoder carries a `nan_val`
        attribute. When the string 'nan' is already one of the fitted
        classes, `nan_val` is its encoded value; otherwise 'nan' is
        appended to `classes_` and `nan_val` is the index of that new,
        last class.

        (default = False)

    _print : callable
        Forwarded to `cat_drop_by_percent`; presumably a print-like
        function used for verbose output (confirm against that helper).

        (default = print)

    Returns
    ----------
    pandas DataFrame
        The post-processed BPt formatted input df.

    sklearn LabelEncoder
        The fitted encoder mapping the original values to their
        ordinal codes (with `nan_val` set when `nac` is True).
    '''

    # Optional first pass: percent-based categorical dropping
    if drop_percent:
        data = cat_drop_by_percent(data, key, drop_percent, drop_val, _print)

    # Restrict the encoding to rows that were not dropped / are not NaN
    kept_data, kept_subjects = get_non_drop(data, key, drop_val)

    # Fit the encoder and overwrite the column with categorical codes
    encoder = LabelEncoder()
    codes = encoder.fit_transform(kept_data[key])
    kept_data[key] = codes
    kept_data[key] = kept_data[key].astype('category')

    # Merge the encoded rows back into the full frame
    data = put_non_drop_back(data, kept_subjects, kept_data)

    # Nothing more to do unless a NaN class was requested
    if not nac:
        return data, encoder

    try:
        # Raises ValueError when 'nan' was not among the fitted classes
        existing_nan = encoder.transform(['nan'])[0]

    except ValueError:
        # A NaN code is still required, so register 'nan' as an extra
        # trailing class and point nan_val at its position
        extended = np.array(list(encoder.classes_) + ['nan'])
        encoder.classes_ = extended
        encoder.nan_val = len(extended) - 1

    else:
        encoder.nan_val = existing_nan

    return data, encoder