# Example #1
def cols_to_multilabel(self):
    '''Utility function for correlation and other reducers
    that require transforming hyperparameter values into
    multilabel values before applying the reduction strategy.'''

    import wrangle
    import pandas as pd

    # load the experiment log and keep only the reduction window
    out = pd.read_csv(self.experiment_name + '.csv').tail(self.reduction_window)

    # keep the reduction metric plus the hyperparameter columns only
    out = out[[self.reduction_metric] + self._param_dict_keys]

    # expand every hyperparameter column (all but the first) into
    # multi-label columns; the separator embeds the column's original
    # dtype so it stays recoverable from the expanded column names
    for name in out.columns[1:]:
        separator = '~' + str(out[name].dtype) + '~'
        out = wrangle.col_to_multilabel(out,
                                        name,
                                        extended_colname=True,
                                        extended_separator=separator)

    return out
# Example #2
def telco_churn(quantile=.5):
    '''Returns dataset in format x, [y1, y2]. This dataset
    is useful for demonstrating multi-output model or for
    experimenting with reduction strategy creation.

    The data is from hyperparameter optimization experiment with
    Kaggle telco churn dataset.

    x: features
    y1: val_loss
    y2: val_f1score

    quantile is for transforming the otherwise continuous y variables into
    labels so that higher value is stronger. If set to 0 then original
    continuous will be returned.'''

    import wrangle
    import pandas as pd

    df = pd.read_csv(
        'https://raw.githubusercontent.com/autonomio/examples/master/telco_churn/telco_churn_for_sensitivity.csv'
    )

    # drop redundant metric columns
    # NOTE: `axis` must be passed by keyword — the positional form was
    # deprecated in pandas 1.x and removed in pandas 2.0.
    df = df.drop(['val_acc', 'loss', 'f1score', 'acc', 'round_epochs'], axis=1)

    # expand hyperparameter columns (everything after the two metric
    # columns) into multi-label columns
    for col in df.iloc[:, 2:].columns:
        df = wrangle.col_to_multilabel(df, col)

    df = wrangle.df_rename_cols(df)

    if quantile > 0:
        # C0 is val_loss (lower is better) -> 1 when below the quantile
        y1 = (df.C0 < df.C0.quantile(quantile)).astype(int).values
        # C1 is val_f1score (higher is better) -> 1 when above the quantile
        y2 = (df.C1 > df.C1.quantile(quantile)).astype(int).values
    else:
        # quantile <= 0: return the original continuous targets
        y1 = df.C0.values
        y2 = df.C1.values

    # features are everything except the two target columns
    x = df.drop(['C0', 'C1'], axis=1).values

    return x, [y1, y2]
# use estimated age for missing contact age values: midpoint of the
# estimated min/max range, falling back to 0 when both are missing
estimated_age = (df.cnt_age_est_min + df.cnt_age_est_max) / 2
estimated_age = estimated_age.fillna(0).astype(int)
# exact age wins when present; otherwise the midpoint estimate is used
df['contact_age'] = (df.cnt_age_exact.fillna(0) + estimated_age).astype(int)

# keep these cols
cols = [
    'part_id', 'part_gender', 'contact_age', 'part_age', 'country', 'hh_size',
    'cnt_gender', 'cnt_home', 'cnt_work', 'cnt_school', 'cnt_transport',
    'cnt_leisure', 'cnt_otherplace'
]

df = df[cols]

# convert string label values to multi-label columns
df = wrangle.col_to_multilabel(df, 'part_gender')
df = wrangle.col_to_multilabel(df, 'country')

# drop redundant columns
# NOTE: `axis` must be passed by keyword — the positional form was
# deprecated in pandas 1.x and removed in pandas 2.0.
df.drop(['cnt_gender'], axis=1, inplace=True)

# use these column names instead
cols = [
    'participant_id', 'contact_age', 'age_group', 'household_size',
    'contact_home', 'contact_work', 'contact_school', 'contact_transport',
    'contact_leisure', 'contact_other', 'gender_female', 'gender_male',
    'country_be', 'country_de', 'country_fi', 'country_gb', 'country_it',
    'country_lu', 'country_nl', 'country_pl'
]

# wrap up
# Example #4
# smoke-test the attributes starting with col_; return values are discarded.
# NOTE(review): assumes `wr` is the wrangle module and `df` / `df_cont_cat`
# are fixture DataFrames prepared earlier in the file — confirm upstream.
_null = wr.col_corr_ols(df.head(50), 'bouncerate1', 'bouncerate1')
_null = wr.col_drop_outliers(df, 'bouncerate1', threshold=1)
_null = wr.col_fill_nan(df, 'admin_city')
_null = wr.col_groupby_cdf(df, 'bouncerate1', 'adnetworks', ascending=True)
_null = wr.col_groupby_pdf(df, 'bouncerate1', 'adnetworks', ascending=False)
_null = wr.col_groupby_stats(df_cont_cat, 'bouncerate1', 'binary')
_null = wr.col_impute_nan(df.bouncerate1)
_null = wr.col_move_place(df, 'bouncerate1', 'first')
_null = wr.col_move_place(df, 'bouncerate1', 'last')
_null = wr.col_resample_equal(df.head(50), 'adnetworks', 1)
# _null = wr.col_resample_interval() # No datetime column
_null = wr.col_rescale_max(df.bouncerate1.values)
_null = wr.col_to_biclass(df, 'category', 'NEWS_AND_MEDIA')
_null = wr.col_to_binary(df, 'bouncerate1')
_null = wr.col_to_buckets(df, 'bouncerate1', 4)
_null = wr.col_to_cols(df[['adnetworks', 'bouncerate1']].reset_index(),
                       'adnetworks', 'index')
_null = wr.col_to_multilabel(df, 'category')
_null = wr.col_to_split(df.head(10), 'top_downstream', sep='.')

# test all the attributes starting with array_
_null = wr.array_random_shuffle(df[['bouncerate1', 'bouncerate2']].values,
                                df.bouncerate2)
_null = wr.array_random_weighted(df.bouncerate1.head(10), 'normal', 10)
_null = wr.array_reshape_conv1d(df.values)
_null = wr.array_reshape_lstm(df.bouncerate1, 10, 10)
_null = wr.array_split(df.values, df.bouncerate1.values, .1)
_null = wr.array_to_generator(df.values, df.bouncerate1, 20)
_null = wr.array_to_kfold(df.values, df.bouncerate1)
_null = wr.array_to_multilabel(df.head(5).adnetworks.values)