Example No. 1
0
def fill_na(df):
    """
    Fill in NA's for those who have no recorded measurement
    within 48 hours. If this is common maybe add an indicator
    variable denoting missing.

    - slope = 0
    - tsl = 48
    - last = mean
    - min = mean
    - max = mean
    - std = mean
    - skew = mean
    - mean = mean
    - sum = 0

    Returns a new DataFrame; the input is left unmodified (the original
    wrote into the caller's frame, inconsistent with scale/ffill_curr
    in this module which both copy).
    """
    df = df.copy()
    prefix_dict = generate_prefix_dict(list(df.columns))

    lst, tsl, slp, men, std, mn, mx, sm, skw = (
        prefix_dict[i] for i in
        ['last', 'tsl', 'slope', 'mean', 'std', 'min', 'max', 'sum', 'skew'])

    # No measurements -> no slope and nothing summed.
    df.loc[:, slp + sm] = df[slp + sm].fillna(0)

    # Time-since-last defaults to the full 48-hour window.
    df.loc[:, tsl] = df[tsl].fillna(48)

    # Remaining summary statistics fall back to the column mean.
    # (The original used .pipe with a lambda that shadowed ``df``.)
    mean_cols = lst + men + std + mn + mx + skw
    df.loc[:, mean_cols] = df[mean_cols].fillna(df[mean_cols].mean())

    return df
def scale(train, val, test):
    """
    SCALING PROTOCOL

    All TSL are MinMaxScaled

    EMS - StandardScaled, not LogNormal because outlier events are truly outliers
    EMA - StandardScaled
    TSL - LogScaled -> StandardScaled or MinMax

    BUG FIX: the original called ``fit_transform`` on val and test as
    well, re-fitting each scaler on the evaluation splits (data leakage,
    and inconsistent with the two-argument ``scale`` below). Scalers are
    now fit on ``train`` only and applied to val/test.

    Returns scaled copies (rounded to 5 decimals); inputs are unmodified.
    """
    train, val, test = train.copy(), val.copy(), test.copy()
    pref_dict = generate_prefix_dict(train)
    suff_dict = generate_suffix_dict(train)

    # Current IO / occurrence columns vs. current numeric (vitals/labs).
    curr_ios  = list(set(pref_dict['curr']) &
                    (set(suff_dict['io']) | set(suff_dict['occ'])))
    curr_nums = list(set(pref_dict['curr']) &
                    (set(suff_dict['vitals']) | set(suff_dict['labs'])))
    stand_cols  = pref_dict['ems'] + pref_dict['ema'] + ['age_enc'] + curr_nums
    minmax_cols = pref_dict['tsl'] + curr_ios + ['time_of_day_enc']

    scaler = StandardScaler()
    train.loc[:, stand_cols] = scaler.fit_transform(train.loc[:, stand_cols])
    val.loc[:, stand_cols]   = scaler.transform(val.loc[:, stand_cols])
    test.loc[:, stand_cols]  = scaler.transform(test.loc[:, stand_cols])

    minmax = MinMaxScaler()
    train.loc[:, minmax_cols] = minmax.fit_transform(train.loc[:, minmax_cols])
    val.loc[:, minmax_cols]   = minmax.transform(val.loc[:, minmax_cols])
    test.loc[:, minmax_cols]  = minmax.transform(test.loc[:, minmax_cols])

    return train.round(5), val.round(5), test.round(5)
def ffill_curr(df):
    """
    Forward-fill current-value columns, and image columns with a
    6-row limit. Returns a copy; the input is unmodified.

    Uses ``.ffill()`` because ``fillna(method='ffill')`` was deprecated
    in pandas 2.1 and removed in pandas 3.0.
    """
    df = df.copy()
    pref_dict = generate_prefix_dict(df)
    suff_dict = generate_suffix_dict(df)
    # Carry the last observed 'curr' value forward indefinitely.
    df.loc[:, pref_dict['curr']] = df[pref_dict['curr']].ffill()
    # Image-derived features only persist for up to 6 rows.
    df.loc[:, suff_dict['img']] = df[suff_dict['img']].ffill(limit=6)

    return df
def generate_fold_datasets(x, y, fold_dict, train_means_dict, scalers_dict):
    """
    Yield one preprocessed (x_train, x_test, y_train, y_test) tuple per
    CV fold, reusing the per-fold training means and pre-fit scalers
    produced by ``cache_preprocessing_info`` (so no refitting / leakage
    happens here).

    Usage:
    fold_dataset_generator = generate_fold_datasets(
        x, y, fold_dict, train_means_dict, scalers_dict)
    for fold_split in fold_dataset_generator:
        x_train, x_test, y_train, y_test = fold_split
        # Do something

    Or if only want one fold:
    fold_dataset_generator = generate_fold_datasets(
        x, y, fold_dict, train_means_dict, scalers_dict)
    x_train, x_test, y_train, y_test = next(fold_dataset_generator)
    # Do something

    """
    print('\n>>> Applying preprocessing...')
    suff_dict = generate_suffix_dict(x)
    pref_dict = generate_prefix_dict(x)
    # fold_dict maps fold number -> index labels (mrns) held out as test.
    # NOTE(review): assumes iteration order matches the order used when
    # train_means_dict / scalers_dict were built (insertion-ordered dicts).
    for i, (fold_num, mrns) in enumerate(fold_dict.items()):
        print('Fold {}'.format(i))
        start = time.time()
        xf, yf = x.copy(), y.copy()
        # Fill NA: image columns default to 0; everything else falls back
        # to this fold's cached training-set means.
        train_means, scalers = train_means_dict[i], scalers_dict[i]
        xf.loc[:, suff_dict['img']] = xf.loc[:, suff_dict['img']].fillna(0)
        xf = xf.fillna(value=train_means.to_dict())
        mid = time.time()
        print('Filled NA in {}s'.format(round(mid - start, 1)))

        # Apply scaler objects (already fit on this fold's training split)
        curr_ios = list(
            set(pref_dict['curr'])
            & (set(suff_dict['io']) | set(suff_dict['occ'])))
        curr_nums = list(
            set(pref_dict['curr'])
            & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
        stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
        minmax_cols = curr_ios + ['hsa_enc']
        #, 'time_of_day_enc', 'duke_loc_enc',
        #                           'past_sbo_enc', 'raleigh_loc_enc', 'regional_loc_enc',
        #                           'hsa_enc'] + suff_dict['img']
        robust_cols = pref_dict['tsl'] + pref_dict['ems']

        standard, minmax, robust = scalers
        xf.loc[:, stand_cols] = standard.transform(xf[stand_cols])
        xf.loc[:, minmax_cols] = minmax.transform(xf[minmax_cols])
        xf.loc[:, robust_cols] = robust.transform(xf[robust_cols])

        # Held-out mrns form the test split; everything else is train.
        x_train, y_train = xf.drop(mrns), yf.drop(mrns)
        x_test, y_test = xf.loc[mrns], yf.loc[mrns]
        print('Scaled in {}s'.format(round(time.time() - mid, 1)))
        yield x_train, x_test, y_train, y_test
def fill_na(df, train_means):
    """
    PROTOCOL
    Curr - ffill then fill with training set means
    EMA - mean (should only be at beginning of encounter)
    EMS - 0

    NOTE(review): this redefines the earlier one-argument ``fill_na`` in
    this module; at import time the last definition wins.

    Uses ``.ffill()`` because ``fillna(method='ffill')`` was deprecated
    in pandas 2.1 and removed in pandas 3.0. Returns a copy.
    """
    df = df.copy()
    pref_dict = generate_prefix_dict(df)
    # Forward-fill 'curr' columns, then fall back to training means for
    # any NA's left at the start of an encounter.
    df.loc[:, pref_dict['curr']] = (
        df[pref_dict['curr']]
        .ffill()
        .fillna(train_means.loc[pref_dict['curr']]))
    df.loc[:, pref_dict['ema']] = (df[pref_dict['ema']].fillna(
        train_means.loc[pref_dict['ema']]))
    #df['word_log_ratio_img'] = df['word_log_ratio_img'].fillna(train_means.loc['word_log_ratio_img'])
    return df
def apply_postsum_transforms(df):
    """
    TRANSFORMATIONS
    - take log+1 of IO's
    - take log+1 of ems
    - take log+1 of tsl

    NOTE(review): mutates ``df`` in place (and also returns it), unlike
    fill_na/ffill_curr in this module which copy — confirm callers rely
    on this before changing.
    """
    pref_dict = generate_prefix_dict(df)
    suff_dict = generate_suffix_dict(df)

    # np.log1p(s) == log(1 + s) but is numerically accurate for small s.
    curr_ios = list(set(pref_dict['curr']) & set(suff_dict['io']))
    df.loc[:, curr_ios] = df[curr_ios].apply(np.log1p)
    df.loc[:, pref_dict['ems']] = df[pref_dict['ems']].apply(np.log1p)
    df.loc[:, pref_dict['tsl']] = df[pref_dict['tsl']].apply(np.log1p)
    return df
def cache_preprocessing_info(x, fold_dict):
    """
    For each CV fold, compute the training-split column means and fit the
    Standard/MinMax/Robust scalers on the training split only, so
    ``generate_fold_datasets`` can apply them later without leakage.

    Returns (train_means_dict, scalers_dict), both keyed by the fold's
    enumeration index (0..len(fold_dict)-1), with scalers_dict[i] being
    the list [standard, minmax, robust].
    """
    print('\n>>> Caching preprocessing...')
    suff_dict = generate_suffix_dict(x)
    pref_dict = generate_prefix_dict(x)
    train_means_dict = dict()
    scalers_dict = dict()
    for i, (fold_num, mrns) in enumerate(fold_dict.items()):
        print('Fold {}'.format(i))
        start = time.time()
        # Drop this fold's held-out mrns to obtain the training split.
        x_train = x.copy().drop(mrns)
        # Calculate train means (image columns treat NA as 0 first, so
        # their means reflect the zero-filled data used downstream).
        x_train.loc[:,
                    suff_dict['img']] = x_train.loc[:,
                                                    suff_dict['img']].fillna(0)
        train_means = x_train.mean(axis=0)
        x_train = x_train.fillna(value=train_means.to_dict())

        # Fit Scaler objects on the fully NA-filled training split.
        standard = StandardScaler()
        minmax = MinMaxScaler()
        robust = RobustScaler()
        curr_ios = list(
            set(pref_dict['curr'])
            & (set(suff_dict['io']) | set(suff_dict['occ'])))
        curr_nums = list(
            set(pref_dict['curr'])
            & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
        stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
        minmax_cols = curr_ios + ['hsa_enc']
        #, 'time_of_day_enc', 'duke_loc_enc',
        #                           'past_sbo_enc', 'raleigh_loc_enc', 'regional_loc_enc',
        #                           'hsa_enc'] + suff_dict['img']
        robust_cols = pref_dict['tsl'] + pref_dict['ems']

        standard.fit(x_train[stand_cols])
        minmax.fit(x_train[minmax_cols])
        robust.fit(x_train[robust_cols])

        train_means_dict[i] = train_means
        scalers_dict[i] = [standard, minmax, robust]
        print('Finished in {}s'.format(round(time.time() - start, 1)))
    return train_means_dict, scalers_dict
def scale(train, test):
    """
    Scale a train/test split (two-argument variant).

    - StandardScaler on EMA, age, and current numeric (vitals/labs) cols
    - MinMaxScaler, rescaled to [-1, 1], on current IO/occ columns plus
      time_of_day_enc and hsa_enc
    - RobustScaler, rescaled to roughly [-1, 1], on TSL and EMS columns

    All scalers are fit on ``train`` only and applied to ``test``.
    Returns scaled copies; the inputs are unmodified.

    NOTE(review): redefines the earlier three-argument ``scale`` in this
    module; the last definition wins at import time. Superseded
    commented-out column selections were removed here.
    """
    train, test = train.copy(), test.copy()
    pref_dict = generate_prefix_dict(train)
    suff_dict = generate_suffix_dict(train)

    curr_ios = list(
        set(pref_dict['curr'])
        & (set(suff_dict['io']) | set(suff_dict['occ'])))
    curr_nums = list(
        set(pref_dict['curr'])
        & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
    stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
    minmax_cols = curr_ios + ['time_of_day_enc', 'hsa_enc']
    robust_cols = pref_dict['tsl'] + pref_dict['ems']

    scaler = StandardScaler()
    train.loc[:, stand_cols] = scaler.fit_transform(train.loc[:, stand_cols])
    test.loc[:, stand_cols] = scaler.transform(test.loc[:, stand_cols])

    # Map MinMax output [0, 1] onto [-1, 1].
    minmax = MinMaxScaler()
    train.loc[:, minmax_cols] = (
        2 * minmax.fit_transform(train.loc[:, minmax_cols]) - 1)
    test.loc[:, minmax_cols] = (
        2 * minmax.transform(test.loc[:, minmax_cols]) - 1)

    robust = RobustScaler()
    train.loc[:, robust_cols] = (
        2 * robust.fit_transform(train.loc[:, robust_cols]) - 1)
    test.loc[:, robust_cols] = (
        2 * robust.transform(test.loc[:, robust_cols]) - 1)
    return train, test
from preprocessing_exp_weights import preprocess_exp_weights

# Build the feature matrix and targets without scaling or null-filling,
# so the fold-aware preprocessing defined above can do both per split.
(x, y, x_cols) = preprocess_exp_weights(rebuild=False,
                                        time_to_event=True,
                                        scale_feat=False,
                                        fill_null=False,
                                        custom_tag='noimg')
#img2

suff_dict = generate_suffix_dict(x)

# In[4]:

#x = x.drop(list(set(suff_dict['img'])-{'ind12_word_log_ratio_img','ind48_word_log_ratio_img'}), 1)

# Column-group lookups keyed by the prefix/suffix/midfix naming scheme.
pref_dict = generate_prefix_dict(x)
suff_dict = generate_suffix_dict(x)
mid_dict = generate_midfix_dict(x)
#mid_dict['bp'] = mid_dict['bp_sys'] + mid_dict['bp_dia']

# In[5]:

# NOTE(review): the three .keys() calls below are notebook-inspection
# leftovers; their results are discarded when run as a script.
pref_dict.keys()
suff_dict.keys()
mid_dict.keys()

# In[13]:


def plot_cv_results(inner_perf_arr, lambdas, alphas):
    """Plot AUC vs. Lambda values across runs for diff alphas."""
    # NOTE(review): as written this function is a no-op (docstring only,
    # returns None); the body appears to have been lost in the notebook
    # export -- restore before relying on it.
def main(x, y, run_inner_fold=True, compute_perm_imp=True):
    """
    Run the cross-validated elastic-net Cox pipeline end to end.

    Parameters
    ----------
    x, y : DataFrames indexed by (mrn, id, <hours-since-admission>);
        copied up front, the caller's frames are not mutated.
    run_inner_fold : if True, run inner CV over (lambdas, alphas) to pick
        hyperparameters; otherwise use the hard-coded defaults.
    compute_perm_imp : if True, compute grouped permutation importance.

    Returns
    -------
    list: [lambdas, alphas, lambda_opt, alpha_opt, group_fold_dict,
    inner_perf_arr, inner_betas_arr, betas_df, perf_arr, perm_imp_df,
    y_train_pred, y_test_pred, train_means_dict, scalers_dict]

    Pipeline summary:
        generate_fold_dict -> group_fold_dict
        inner_cv_cox -> inner_perf_arr, inner_betas_arr
        outer_cv_cox -> betas_arr, perf_arr
        permutation_importance_cv -> perm_imp_df
        make_predictions -> y_train_pred, y_test_pred
    """
    np.random.seed(449)  # previously 444
    time1 = time.time()
    x, y = x.copy(), y.copy()

    # One row per encounter, for group-stratified fold generation.
    enc = (y.reset_index(level=2,
                         drop=True).reset_index().drop_duplicates().set_index(
                             ['mrn', 'id']))

    # TODO: figure out if this is a problem
    # Censored encounters (no surgery): time-to-event is measured from
    # the maximum observed horizon minus hours-since-admission (level 2).
    y['hsa'] = y.index.get_level_values(2)
    y.loc[y.any_sbo_surg_enc == 0,
          'time_to_event_enc'] = (y.time_to_event_enc.max() -
                                  y.loc[y.any_sbo_surg_enc == 0, 'hsa'])
    # BUG FIX: positional axis (y.drop('hsa', 1)) was removed in
    # pandas 2.0; use the explicit keyword.
    y = y.drop('hsa', axis=1)
    # Cox fitting cannot handle zero event times.
    y.loc[y['time_to_event_enc'] == 0, 'time_to_event_enc'] = 0.01

    num_folds, num_lambdas, num_alphas = 5, 20, 3
    # Log-spaced lambda grid on [e^-4, e^6].
    lambdas = np.exp(np.linspace(-4, 6, num_lambdas))
    # Don't need to add zero because just round
    alphas = np.power(0.1, np.linspace(0, 4, num_alphas)[::-1]).round(3)

    print('\n>>> Generating {} group stratified folds...'.format(num_folds))
    group_fold_dict = generate_fold_dict(enc, num_folds, 0.01)
    # Reserve the last fold as the outer hold-out; inner CV uses the rest.
    inner_group_fold_dict = group_fold_dict.copy()
    del inner_group_fold_dict[num_folds - 1]

    x_inner = x.drop(group_fold_dict[num_folds - 1])
    y_inner = y.drop(group_fold_dict[num_folds - 1])
    time2 = time.time()
    print('Finished in {}s'.format(round(time2 - time1, 1)))

    if run_inner_fold:
        print('\n>>> Running inner CV with {} folds...'.format(num_folds - 1))
        inner_perf_arr, inner_betas_arr = inner_cv_cox(
            x=x_inner,
            y=y_inner,
            lambdas=lambdas,
            alphas=alphas,
            fold_dict=inner_group_fold_dict)
        time3 = time.time()
        print('Fit finished in {}s'.format(round(time3 - time2, 1)))

        lambda_opt, alpha_opt = get_best_hparams(inner_perf_arr, lambdas,
                                                 alphas)
    else:
        time3 = time.time()
        inner_perf_arr, inner_betas_arr = None, None
        # Defaults chosen from a previous inner-CV run.
        lambda_opt, alpha_opt = 0.25, 0.0

    print(
        '\n>>> Running outer CV with {} folds, \nlambda* = {}, alpha* = {}...'.
        format(num_folds, lambda_opt, alpha_opt))
    # Cache per-fold training means and fitted scalers, then stream the
    # preprocessed fold splits into the outer CV.
    train_means_dict, scalers_dict = cache_preprocessing_info(
        x, group_fold_dict)
    fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                            train_means_dict, scalers_dict)
    betas_arr, perf_arr = outer_cv_cox(x, y, lambda_opt, alpha_opt,
                                       group_fold_dict, fold_generator)
    time4 = time.time()
    print('Fit finished in {}s'.format(round(time4 - time3, 1)))

    if compute_perm_imp:
        print('\n>>> Computing permutation importance...')
        # BUG FIX: the original built these dicts from ``x_train``, a name
        # that is never defined in this scope (NameError at runtime when
        # compute_perm_imp=True); the column groups come from ``x``.
        mid_dict = generate_midfix_dict(x)
        mid_dict['bp'] = mid_dict['bp_sys'] + mid_dict['bp_dia']
        # Permute whole midfix groups (e.g. all bp columns) together.
        fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                                train_means_dict, scalers_dict)
        perm_group_dict = mid_dict
        perm_imp_df = permutation_importance_cv(x, y, betas_arr,
                                                group_fold_dict,
                                                fold_generator,
                                                perm_group_dict)

    else:
        perm_imp_df = None
    time5 = time.time()
    print('Fit finished in {}s'.format(round(time5 - time4, 1)))

    print('\n>>> Make predictions...')
    fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                            train_means_dict, scalers_dict)
    y_train_pred, y_test_pred = make_predictions(x, y, betas_arr,
                                                 group_fold_dict,
                                                 fold_generator)
    time6 = time.time()
    print('Predictions finished in {}s'.format(round(time6 - time5, 1)))

    betas_df = pd.DataFrame(betas_arr, columns=x.columns)

    result_list = [
        lambdas, alphas, lambda_opt, alpha_opt, group_fold_dict,
        inner_perf_arr, inner_betas_arr, betas_df, perf_arr, perm_imp_df,
        y_train_pred, y_test_pred, train_means_dict, scalers_dict
    ]
    return result_list