Ejemplo n.º 1
0
def run_prediction_from_trained_model_fct(trained_model_file, X_file, target_name, selection_criterium, df_file,
                                          data_str, regress_confounds=False):
    import os, pickle
    import numpy as np
    import pandas as pd
    from sklearn.metrics import r2_score, mean_absolute_error
    from LeiCA_LIFE.learning.learning_utils import pred_real_scatter, plot_brain_age, residualize_group_data

    data_str = target_name + '__' + selection_criterium + '__' + data_str

    df = pd.read_pickle(df_file)
    df['pred_age_test'] = np.nan

    X_test = np.load(X_file)
    y_test = df[[target_name]].values.squeeze()
    confounds = df[['mean_FD_P']].values

    # REGRESS OUT CONFOUNDS IF NEEDED
    if regress_confounds:
        X_test = residualize_group_data(X_test, confounds)

    with open(trained_model_file, 'r') as f:
        pipe = pickle.load(f)

    # RUN PREDICTION
    y_predicted = pipe.predict(X_test)

    df.ix[:, ['pred_age_test']] = y_predicted

    test_mae = mean_absolute_error(y_test, y_predicted)
    test_r2 = r2_score(y_test, y_predicted)
    test_rpear2 = np.corrcoef(y_test, y_predicted)[0, 1] ** 2

    train_r2 = np.nan
    train_mae = np.nan


    # SCATTER PLOTS
    title_str = 'r2: {:.3f} MAE:{:.3f}'.format(test_r2, test_mae)
    scatter_file = pred_real_scatter(y_test, y_predicted, title_str, data_str)

    brain_age_scatter_file = plot_brain_age(y_test, y_predicted, data_str)

    df_use_file = os.path.join(os.getcwd(), data_str + '_df_predicted.pkl')
    df.to_pickle(df_use_file)

    # performace results df
    df_res_out_file = os.path.abspath(data_str + '_df_results.pkl')
    df_res = pd.DataFrame(
        {'FD_res': regress_confounds, 'r2_train': [train_r2], 'MAE_train': [train_mae],
         'r2_test': [test_r2], 'rpear2_test': [test_rpear2],
         'MAE_test': [test_mae]},
        index=[data_str])
    df_res.to_pickle(df_res_out_file)

    return scatter_file, brain_age_scatter_file, df_use_file, df_res_out_file
Ejemplo n.º 2
0
def run_prediction_split_fct(X_file, target_name, selection_criterium, df_file, data_str, regress_confounds=False,
                             run_cv=False, n_jobs_cv=1, run_tuning=False, X_file_nki=None, df_file_nki=None,
                             reverse_split=False, random_state_nki=666, run_learning_curve=False, life_test_size=0.5):
    import os, pickle
    import numpy as np
    import pandas as pd
    from sklearn.svm import SVR
    from sklearn.cross_validation import cross_val_predict, train_test_split, StratifiedKFold
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.preprocessing import StandardScaler, Imputer
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import r2_score, mean_absolute_error
    from sklearn.utils import shuffle
    from LeiCA_LIFE.learning.learning_utils import pred_real_scatter, plot_brain_age, residualize_group_data, \
        learning_curve_plot

    if X_file_nki:
        run_2sample_training = True
    else:
        run_2sample_training = False

    empty_file = os.path.abspath('empty.txt')
    with open(empty_file, 'w') as f:
        f.write('')

    data_str = target_name + '__' + selection_criterium + '__' + data_str

    variables = ['train_mae', 'train_r2', 'cv_r2', 'cv_mae', 'cv_r2_mean', 'cv_r2_std', 'y_predicted_cv']
    for v in variables:
        try:
            exec (v)
        except NameError:
            exec ('%s = np.nan' % v)


    #########
    # LIFE
    #########
    #### LOAD DATA
    df_life = pd.read_pickle(df_file)
    # add ouput cols to df
    df_life['study'] = 'life'
    df_life['split_group'] = ''

    X_life = np.load(X_file)
    y_life = df_life[[target_name]].values.squeeze()
    confounds_life = df_life[['mean_FD_P']].values
    ind_life = range(X_life.shape[0])

    #### split with age stratification
    n_age_bins = 20
    df_life['age_bins'] = pd.cut(df_life['age'], n_age_bins, labels=range(n_age_bins))

    X_train_life, X_test_life, y_train_life, y_test_life, confounds_train_life, confounds_test_life, \
    ind_train_life, ind_test_life = train_test_split(X_life,
                                                     y_life,
                                                     confounds_life,
                                                     ind_life,
                                                     stratify=df_life['age_bins'].values,
                                                     test_size=life_test_size,
                                                     random_state=666)
    if reverse_split:
        X_train_life, X_test_life = X_test_life, X_train_life
        y_train_life, y_test_life = y_test_life, y_train_life
        confounds_train_life, confounds_test_life = confounds_test_life, confounds_train_life
        ind_train_life, ind_test_life = ind_test_life, ind_train_life

    df_life.ix[ind_train_life, 'split_group'] = 'train'
    df_life.ix[ind_test_life, 'split_group'] = 'test'
    df_train_life = df_life.ix[ind_train_life, ['age_bins', 'study']]
    df_test_life = df_life.ix[ind_test_life, ['age_bins', 'study']]
    sample_weights_train_life = np.ones_like(y_train_life)

    #########
    # NKI
    #########
    if run_2sample_training:
        #### LOAD DATA
        df_nki = pd.read_pickle(df_file_nki)
        df_nki['study'] = 'nki'
        df_nki['train_group_2samp'] = False

        X_nki = np.load(X_file_nki)
        y_nki = df_nki[[target_name]].values.squeeze()
        confounds_nki = df_nki[['mean_FD_P']].values
        ind_nki = range(X_nki.shape[0])

        #### split with age stratification
        df_nki['age_bins'] = pd.cut(df_nki['age'], n_age_bins, labels=range(n_age_bins))

        X_train_nki, X_test_nki, y_train_nki, y_test_nki, confounds_train_nki, confounds_test_nki, \
        ind_train_nki, ind_test_nki = train_test_split(X_nki,
                                                       y_nki,
                                                       confounds_nki,
                                                       ind_nki,
                                                       stratify=df_nki['age_bins'].values,
                                                       train_size=0.1,
                                                       random_state=random_state_nki)
        if reverse_split:
            X_train_nki, X_test_nki = X_test_nki, X_train_nki
            y_train_nki, y_test_nki = y_test_nki, y_train_nki
            confounds_train_nki, confounds_test_nki = confounds_test_nki, confounds_train_nki
            ind_train_nki, ind_test_nki = ind_test_nki, ind_train_nki

        df_nki['train_group_2samp'] = np.nan
        df_nki.ix[ind_train_nki, 'train_group_2samp'] = True
        df_train_nki = df_nki.ix[ind_train_nki, ['age_bins', 'study']]


    else:
        X_train_nki = np.array([])
        y_train_nki = np.array([])
        confounds_train_nki = []
        df_nki = pd.DataFrame([])
        df_train_nki = pd.DataFrame([])
        df_life['train_group_2samp'] = np.nan

    #########
    # stack life and nki
    #########
    df_big = pd.concat((df_life, df_nki))
    df_big_ind_train = ((df_big.split_group == 'train') | (df_big.train_group_2samp == True))
    df_big_ind_test_life = ((df_big.split_group == 'test') & (df_big.study == 'life'))
    df_big['run_2sample_training'] = run_2sample_training

    df_train = pd.concat((df_train_life, df_train_nki))
    df_test = df_test_life

    if run_2sample_training:
        # calc sample weights so that Nl*Wl == Nn*Wn (& (Nl*Wl + Nn*Wn) == Nl + Nn)
        Nl = y_train_life.shape[0]
        Nn = y_train_nki.shape[0]
        w_life = float(Nl + Nn) / (2. * Nl)
        w_nki = float(Nl + Nn) / (2. * Nn)

        sample_weights_train_life = np.zeros_like(y_train_life)
        sample_weights_train_life.fill(w_life)
        sample_weights_train_nki = np.zeros_like(y_train_nki)
        sample_weights_train_nki.fill(w_nki)
        sample_weights_train = np.concatenate((sample_weights_train_life, sample_weights_train_nki))

        X_train = np.concatenate((X_train_life, X_train_nki))
        y_train = np.concatenate((y_train_life, y_train_nki))
        confounds_train = np.concatenate((confounds_train_life, confounds_train_nki))
    else:
        X_train = X_train_life
        y_train = y_train_life
        confounds_train = confounds_train_life
        sample_weights_train = np.ones_like(y_train_life)
    df_train['sample_weights'] = sample_weights_train

    stacked_ind_train_life = slice(0, X_train_life.shape[0])
    stacked_ind_train_nki = slice(X_train_life.shape[0], X_train.shape[0])
    age_bins_train = df_big.ix[df_big_ind_train, 'age_bins'].values

    # test with life data only
    X_test = X_test_life
    y_test = y_test_life
    confounds_test = confounds_test_life



    #########
    # PIPELINE
    #########
    #### REGRESS OUT CONFOUNDS IF NEEDED
    if regress_confounds:
        X_train = residualize_group_data(X_train, confounds_train)
        X_test = residualize_group_data(X_test, confounds_test)

    #### PREPROCESSING
    fill_missing = Imputer()
    var_thr = VarianceThreshold()
    normalize = StandardScaler()

    #### set C to values
    if 'aseg' in data_str:
        C = 1
    else:
        C = 10 ** -3

    regression_model = SVR(kernel='linear', C=C, cache_size=1000)
    pipeline_list = [('fill_missing', fill_missing),
                     ('var_thr', var_thr),
                     ('normalize', normalize)]

    pipeline_list.append(('regression_model', regression_model))
    pipe = Pipeline(pipeline_list)




    #### FIT MODEL
    pipe.fit(X=X_train, y=y_train, **{'regression_model__sample_weight': sample_weights_train})  # (X_train, y_train)
    y_predicted_train = pipe.predict(X_train)
    y_predicted = pipe.predict(X_test)

    df_train['pred_age_train'] = y_predicted_train
    df_test['pred_age_test'] = y_predicted

    y_predicted_train_life = y_predicted_train[stacked_ind_train_life]
    y_predicted_train_nki = y_predicted_train[stacked_ind_train_nki]
    df_life.ix[ind_train_life, 'pred_age_train'] = y_predicted_train_life
    df_life.ix[ind_train_life, 'pred_age_train'] = y_predicted_train_life

    # df_life.ix[ind_train, ['pred_age_train']] = y_predicted_train
    # df_life.ix[ind_test, ['pred_age_test']] = y_predicted
    test_mae = mean_absolute_error(y_test, y_predicted)
    test_r2 = r2_score(y_test, y_predicted)
    test_rpear2 = np.corrcoef(y_test, y_predicted)[0, 1] ** 2

    train_mae = mean_absolute_error(y_train, y_predicted_train)
    train_r2 = r2_score(y_train, y_predicted_train)




    #### RUN CROSSVALIDATION
    if run_cv:
        strat_k_fold = StratifiedKFold(df_train.ix[:, 'age_bins'].values, n_folds=5, shuffle=True, random_state=0)
        # crossval predict and manually calc. cv score to get y_cv_predicted
        # cv_score_ = cross_val_score(pipe, X_train, y_train, cv=strat_k_fold, n_jobs=n_jobs_cv)  #
        y_predicted_cv = cross_val_predict(pipe, X_train, y_train, cv=strat_k_fold, n_jobs=n_jobs_cv)
        df_train['y_predicted_cv'] = y_predicted_cv

        cv_r2 = []
        cv_mae = []
        cv_test_fold = np.zeros_like(y_train)
        cv_test_fold.fill(np.nan)
        for k, (k_train, k_test) in enumerate(strat_k_fold):
            cv_r2.append(r2_score(y_train[k_test], y_predicted_cv[k_test]))
            cv_mae.append(mean_absolute_error(y_train[k_test], y_predicted_cv[k_test]))
            cv_test_fold[k_test] = k
        cv_r2_mean = np.mean(cv_r2)
        cv_r2_std = np.std(cv_r2)

        # df_big['cv_test_fold'] = np.nan
        # df_big.ix[df_big_ind_train, 'cv_test_fold'] = cv_test_fold
        df_train['cv_test_fold'] = cv_test_fold

    if run_learning_curve:
        X_full = np.vstack((X_train_life, X_test_life))
        y_full = np.hstack((y_train_life, y_test_life))
        from sklearn.learning_curve import learning_curve
        train_sizes, train_scores, test_scores = learning_curve(pipe, X_full, y_full, cv=5, n_jobs=n_jobs_cv,
                                                                train_sizes=np.linspace(0.1, 1.0, 10))

        learning_curve_plot_file, learning_curve_df_file = learning_curve_plot(train_sizes, train_scores, test_scores,
                                                                               'learning curve', data_str,
                                                                               post_str='_training_curve')
    else:
        learning_curve_plot_file, learning_curve_df_file = empty_file, empty_file


    #### SCATTER PLOTS
    title_str = 'r2: {:.3f} MAE:{:.3f}'.format(test_r2, test_mae)
    scatter_file = pred_real_scatter(y_test, y_predicted, title_str, data_str)

    if run_cv:
        title_str = 'r2: {:.3f}({:.3f}) MAE:{:.3f}({:.3f})'.format(cv_r2_mean, cv_r2_std, np.mean(cv_mae),
                                                                   np.std(cv_mae))
        scatter_file_cv = pred_real_scatter(y_train, y_predicted_cv, title_str, data_str, post_str='_cv')

    else:
        scatter_file_cv = empty_file

    brain_age_scatter_file = plot_brain_age(y_test, y_predicted, data_str)






    #### TUNING CURVES
    if run_tuning:
        from sklearn.learning_curve import validation_curve
        from sklearn.cross_validation import StratifiedKFold
        import pylab as plt
        strat_k_fold = StratifiedKFold(df_big.ix[df_big_ind_train, 'age_bins'].values, n_folds=5, shuffle=True,
                                       random_state=0)
        param_range = np.logspace(-4, 0, num=12)
        # fixme n_jobs
        train_scores, test_scores = validation_curve(pipe, X_train, y_train, param_name="regression_model__C",
                                                     param_range=param_range,
                                                     cv=strat_k_fold, n_jobs=n_jobs_cv)
        # plot
        # http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#example-model-selection-plot-validation-curve-py
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.figure()
        plt.title("Validation Curve")
        plt.xlabel("C")
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
        plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                         alpha=0.2,
                         color="r")
        plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color="g")
        plt.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2,
                         color="g")
        plt.legend(loc="best")

        tuning_curve_file = os.path.join(os.getcwd(), 'tuning_curve_' + data_str + '.pdf')
        plt.savefig(tuning_curve_file)
        plt.close()
        # #############################################
    else:
        tuning_curve_file = empty_file





    #### join df_train and df_test with df_big
    # drop duplicate columns
    df_train.drop([l for l in df_train.columns if l in df_big.columns], axis=1, inplace=True)
    df_test.drop([l for l in df_test.columns if l in df_big.columns], axis=1, inplace=True)
    df_big = df_big.join(df_train, how='left')
    df_big = df_big.join(df_test, how='left')

    #### SAVE
    df_res_out_file = os.path.abspath(data_str + '_df_results.pkl')
    df_res = pd.DataFrame(
        {'FD_res': regress_confounds, 'r2_train': [train_r2], 'MAE_train': [train_mae], 'r2_test': [test_r2],
         'rpear2_test': [test_rpear2], 'MAE_test': [test_mae], 'cv_r2': [cv_r2], 'cv_r2_mean': [cv_r2_mean],
         'cv_r2_std': [cv_r2_std]},
        index=[data_str])
    df_res.to_pickle(df_res_out_file)

    df_life_out_file = os.path.join(os.getcwd(), data_str + '_life_df_predicted.pkl')
    df_life.to_pickle(df_life_out_file)

    df_nki_out_file = os.path.join(os.getcwd(), data_str + '_nki_df_predicted.pkl')
    df_nki.to_pickle(df_nki_out_file)

    df_big_out_file = os.path.join(os.getcwd(), data_str + '_df_predicted.pkl')
    df_big.to_pickle(df_big_out_file)

    model_out_file = os.path.join(os.getcwd(), 'trained_model.pkl')
    with open(model_out_file, 'w') as f:
        pickle.dump(pipe, f)

    return scatter_file, brain_age_scatter_file, df_life_out_file, df_nki_out_file, df_big_out_file, model_out_file, df_res_out_file, \
           tuning_curve_file, scatter_file_cv, learning_curve_plot_file, learning_curve_df_file
Ejemplo n.º 3
0
def run_prediction_split_fct(X_file,
                             target_name,
                             selection_criterium,
                             df_file,
                             data_str,
                             regress_confounds=False,
                             run_cv=False,
                             n_jobs_cv=1,
                             run_tuning=False,
                             X_file_nki=None,
                             df_file_nki=None,
                             reverse_split=False,
                             random_state_nki=666,
                             run_learning_curve=False,
                             life_test_size=0.5):
    import os, pickle
    import numpy as np
    import pandas as pd
    from sklearn.svm import SVR
    from sklearn.cross_validation import cross_val_predict, train_test_split, StratifiedKFold
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.preprocessing import StandardScaler, Imputer
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import r2_score, mean_absolute_error
    from sklearn.utils import shuffle
    from LeiCA_LIFE.learning.learning_utils import pred_real_scatter, plot_brain_age, residualize_group_data, \
        learning_curve_plot

    if X_file_nki:
        run_2sample_training = True
    else:
        run_2sample_training = False

    empty_file = os.path.abspath('empty.txt')
    with open(empty_file, 'w') as f:
        f.write('')

    data_str = target_name + '__' + selection_criterium + '__' + data_str

    variables = [
        'train_mae', 'train_r2', 'cv_r2', 'cv_mae', 'cv_r2_mean', 'cv_r2_std',
        'y_predicted_cv'
    ]
    for v in variables:
        try:
            exec(v)
        except NameError:
            exec('%s = np.nan' % v)

    #########
    # LIFE
    #########
    #### LOAD DATA
    df_life = pd.read_pickle(df_file)
    # add ouput cols to df
    df_life['study'] = 'life'
    df_life['split_group'] = ''

    X_life = np.load(X_file)
    y_life = df_life[[target_name]].values.squeeze()
    confounds_life = df_life[['mean_FD_P']].values
    ind_life = range(X_life.shape[0])

    #### split with age stratification
    n_age_bins = 20
    df_life['age_bins'] = pd.cut(df_life['age'],
                                 n_age_bins,
                                 labels=range(n_age_bins))

    X_train_life, X_test_life, y_train_life, y_test_life, confounds_train_life, confounds_test_life, \
    ind_train_life, ind_test_life = train_test_split(X_life,
                                                     y_life,
                                                     confounds_life,
                                                     ind_life,
                                                     stratify=df_life['age_bins'].values,
                                                     test_size=life_test_size,
                                                     random_state=666)
    if reverse_split:
        X_train_life, X_test_life = X_test_life, X_train_life
        y_train_life, y_test_life = y_test_life, y_train_life
        confounds_train_life, confounds_test_life = confounds_test_life, confounds_train_life
        ind_train_life, ind_test_life = ind_test_life, ind_train_life

    df_life.ix[ind_train_life, 'split_group'] = 'train'
    df_life.ix[ind_test_life, 'split_group'] = 'test'
    df_train_life = df_life.ix[ind_train_life, ['age_bins', 'study']]
    df_test_life = df_life.ix[ind_test_life, ['age_bins', 'study']]
    sample_weights_train_life = np.ones_like(y_train_life)

    #########
    # NKI
    #########
    if run_2sample_training:
        #### LOAD DATA
        df_nki = pd.read_pickle(df_file_nki)
        df_nki['study'] = 'nki'
        df_nki['train_group_2samp'] = False

        X_nki = np.load(X_file_nki)
        y_nki = df_nki[[target_name]].values.squeeze()
        confounds_nki = df_nki[['mean_FD_P']].values
        ind_nki = range(X_nki.shape[0])

        #### split with age stratification
        df_nki['age_bins'] = pd.cut(df_nki['age'],
                                    n_age_bins,
                                    labels=range(n_age_bins))

        X_train_nki, X_test_nki, y_train_nki, y_test_nki, confounds_train_nki, confounds_test_nki, \
        ind_train_nki, ind_test_nki = train_test_split(X_nki,
                                                       y_nki,
                                                       confounds_nki,
                                                       ind_nki,
                                                       stratify=df_nki['age_bins'].values,
                                                       train_size=0.1,
                                                       random_state=random_state_nki)
        if reverse_split:
            X_train_nki, X_test_nki = X_test_nki, X_train_nki
            y_train_nki, y_test_nki = y_test_nki, y_train_nki
            confounds_train_nki, confounds_test_nki = confounds_test_nki, confounds_train_nki
            ind_train_nki, ind_test_nki = ind_test_nki, ind_train_nki

        df_nki['train_group_2samp'] = np.nan
        df_nki.ix[ind_train_nki, 'train_group_2samp'] = True
        df_train_nki = df_nki.ix[ind_train_nki, ['age_bins', 'study']]

    else:
        X_train_nki = np.array([])
        y_train_nki = np.array([])
        confounds_train_nki = []
        df_nki = pd.DataFrame([])
        df_train_nki = pd.DataFrame([])
        df_life['train_group_2samp'] = np.nan

    #########
    # stack life and nki
    #########
    df_big = pd.concat((df_life, df_nki))
    df_big_ind_train = ((df_big.split_group == 'train') |
                        (df_big.train_group_2samp == True))
    df_big_ind_test_life = ((df_big.split_group == 'test') &
                            (df_big.study == 'life'))
    df_big['run_2sample_training'] = run_2sample_training

    df_train = pd.concat((df_train_life, df_train_nki))
    df_test = df_test_life

    if run_2sample_training:
        # calc sample weights so that Nl*Wl == Nn*Wn (& (Nl*Wl + Nn*Wn) == Nl + Nn)
        Nl = y_train_life.shape[0]
        Nn = y_train_nki.shape[0]
        w_life = float(Nl + Nn) / (2. * Nl)
        w_nki = float(Nl + Nn) / (2. * Nn)

        sample_weights_train_life = np.zeros_like(y_train_life)
        sample_weights_train_life.fill(w_life)
        sample_weights_train_nki = np.zeros_like(y_train_nki)
        sample_weights_train_nki.fill(w_nki)
        sample_weights_train = np.concatenate(
            (sample_weights_train_life, sample_weights_train_nki))

        X_train = np.concatenate((X_train_life, X_train_nki))
        y_train = np.concatenate((y_train_life, y_train_nki))
        confounds_train = np.concatenate(
            (confounds_train_life, confounds_train_nki))
    else:
        X_train = X_train_life
        y_train = y_train_life
        confounds_train = confounds_train_life
        sample_weights_train = np.ones_like(y_train_life)
    df_train['sample_weights'] = sample_weights_train

    stacked_ind_train_life = slice(0, X_train_life.shape[0])
    stacked_ind_train_nki = slice(X_train_life.shape[0], X_train.shape[0])
    age_bins_train = df_big.ix[df_big_ind_train, 'age_bins'].values

    # test with life data only
    X_test = X_test_life
    y_test = y_test_life
    confounds_test = confounds_test_life

    #########
    # PIPELINE
    #########
    #### REGRESS OUT CONFOUNDS IF NEEDED
    if regress_confounds:
        X_train = residualize_group_data(X_train, confounds_train)
        X_test = residualize_group_data(X_test, confounds_test)

    #### PREPROCESSING
    fill_missing = Imputer()
    var_thr = VarianceThreshold()
    normalize = StandardScaler()

    #### set C to values
    if 'aseg' in data_str:
        C = 1
    else:
        C = 10**-3

    regression_model = SVR(kernel='linear', C=C, cache_size=1000)
    pipeline_list = [('fill_missing', fill_missing), ('var_thr', var_thr),
                     ('normalize', normalize)]

    pipeline_list.append(('regression_model', regression_model))
    pipe = Pipeline(pipeline_list)

    #### FIT MODEL
    pipe.fit(X=X_train,
             y=y_train,
             **{'regression_model__sample_weight':
                sample_weights_train})  # (X_train, y_train)
    y_predicted_train = pipe.predict(X_train)
    y_predicted = pipe.predict(X_test)

    df_train['pred_age_train'] = y_predicted_train
    df_test['pred_age_test'] = y_predicted

    y_predicted_train_life = y_predicted_train[stacked_ind_train_life]
    y_predicted_train_nki = y_predicted_train[stacked_ind_train_nki]
    df_life.ix[ind_train_life, 'pred_age_train'] = y_predicted_train_life
    df_life.ix[ind_train_life, 'pred_age_train'] = y_predicted_train_life

    # df_life.ix[ind_train, ['pred_age_train']] = y_predicted_train
    # df_life.ix[ind_test, ['pred_age_test']] = y_predicted
    test_mae = mean_absolute_error(y_test, y_predicted)
    test_r2 = r2_score(y_test, y_predicted)
    test_rpear2 = np.corrcoef(y_test, y_predicted)[0, 1]**2

    train_mae = mean_absolute_error(y_train, y_predicted_train)
    train_r2 = r2_score(y_train, y_predicted_train)

    #### RUN CROSSVALIDATION
    if run_cv:
        strat_k_fold = StratifiedKFold(df_train.ix[:, 'age_bins'].values,
                                       n_folds=5,
                                       shuffle=True,
                                       random_state=0)
        # crossval predict and manually calc. cv score to get y_cv_predicted
        # cv_score_ = cross_val_score(pipe, X_train, y_train, cv=strat_k_fold, n_jobs=n_jobs_cv)  #
        y_predicted_cv = cross_val_predict(pipe,
                                           X_train,
                                           y_train,
                                           cv=strat_k_fold,
                                           n_jobs=n_jobs_cv)
        df_train['y_predicted_cv'] = y_predicted_cv

        cv_r2 = []
        cv_mae = []
        cv_test_fold = np.zeros_like(y_train)
        cv_test_fold.fill(np.nan)
        for k, (k_train, k_test) in enumerate(strat_k_fold):
            cv_r2.append(r2_score(y_train[k_test], y_predicted_cv[k_test]))
            cv_mae.append(
                mean_absolute_error(y_train[k_test], y_predicted_cv[k_test]))
            cv_test_fold[k_test] = k
        cv_r2_mean = np.mean(cv_r2)
        cv_r2_std = np.std(cv_r2)

        # df_big['cv_test_fold'] = np.nan
        # df_big.ix[df_big_ind_train, 'cv_test_fold'] = cv_test_fold
        df_train['cv_test_fold'] = cv_test_fold

    if run_learning_curve:
        X_full = np.vstack((X_train_life, X_test_life))
        y_full = np.hstack((y_train_life, y_test_life))
        from sklearn.learning_curve import learning_curve
        train_sizes, train_scores, test_scores = learning_curve(
            pipe,
            X_full,
            y_full,
            cv=5,
            n_jobs=n_jobs_cv,
            train_sizes=np.linspace(0.1, 1.0, 10))

        learning_curve_plot_file, learning_curve_df_file = learning_curve_plot(
            train_sizes,
            train_scores,
            test_scores,
            'learning curve',
            data_str,
            post_str='_training_curve')
    else:
        learning_curve_plot_file, learning_curve_df_file = empty_file, empty_file

    #### SCATTER PLOTS
    title_str = 'r2: {:.3f} MAE:{:.3f}'.format(test_r2, test_mae)
    scatter_file = pred_real_scatter(y_test, y_predicted, title_str, data_str)

    if run_cv:
        title_str = 'r2: {:.3f}({:.3f}) MAE:{:.3f}({:.3f})'.format(
            cv_r2_mean, cv_r2_std, np.mean(cv_mae), np.std(cv_mae))
        scatter_file_cv = pred_real_scatter(y_train,
                                            y_predicted_cv,
                                            title_str,
                                            data_str,
                                            post_str='_cv')

    else:
        scatter_file_cv = empty_file

    brain_age_scatter_file = plot_brain_age(y_test, y_predicted, data_str)

    #### TUNING CURVES
    if run_tuning:
        from sklearn.learning_curve import validation_curve
        from sklearn.cross_validation import StratifiedKFold
        import pylab as plt
        strat_k_fold = StratifiedKFold(df_big.ix[df_big_ind_train,
                                                 'age_bins'].values,
                                       n_folds=5,
                                       shuffle=True,
                                       random_state=0)
        param_range = np.logspace(-4, 0, num=12)
        # fixme n_jobs
        train_scores, test_scores = validation_curve(
            pipe,
            X_train,
            y_train,
            param_name="regression_model__C",
            param_range=param_range,
            cv=strat_k_fold,
            n_jobs=n_jobs_cv)
        # plot
        # http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#example-model-selection-plot-validation-curve-py
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.figure()
        plt.title("Validation Curve")
        plt.xlabel("C")
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        plt.semilogx(param_range,
                     train_scores_mean,
                     label="Training score",
                     color="r")
        plt.fill_between(param_range,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.2,
                         color="r")
        plt.semilogx(param_range,
                     test_scores_mean,
                     label="Cross-validation score",
                     color="g")
        plt.fill_between(param_range,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.2,
                         color="g")
        plt.legend(loc="best")

        tuning_curve_file = os.path.join(os.getcwd(),
                                         'tuning_curve_' + data_str + '.pdf')
        plt.savefig(tuning_curve_file)
        plt.close()
        # #############################################
    else:
        tuning_curve_file = empty_file

    #### join df_train and df_test with df_big
    # drop duplicate columns
    df_train.drop([l for l in df_train.columns if l in df_big.columns],
                  axis=1,
                  inplace=True)
    df_test.drop([l for l in df_test.columns if l in df_big.columns],
                 axis=1,
                 inplace=True)
    df_big = df_big.join(df_train, how='left')
    df_big = df_big.join(df_test, how='left')

    #### SAVE
    df_res_out_file = os.path.abspath(data_str + '_df_results.pkl')
    df_res = pd.DataFrame(
        {
            'FD_res': regress_confounds,
            'r2_train': [train_r2],
            'MAE_train': [train_mae],
            'r2_test': [test_r2],
            'rpear2_test': [test_rpear2],
            'MAE_test': [test_mae],
            'cv_r2': [cv_r2],
            'cv_r2_mean': [cv_r2_mean],
            'cv_r2_std': [cv_r2_std]
        },
        index=[data_str])
    df_res.to_pickle(df_res_out_file)

    df_life_out_file = os.path.join(os.getcwd(),
                                    data_str + '_life_df_predicted.pkl')
    df_life.to_pickle(df_life_out_file)

    df_nki_out_file = os.path.join(os.getcwd(),
                                   data_str + '_nki_df_predicted.pkl')
    df_nki.to_pickle(df_nki_out_file)

    df_big_out_file = os.path.join(os.getcwd(), data_str + '_df_predicted.pkl')
    df_big.to_pickle(df_big_out_file)

    model_out_file = os.path.join(os.getcwd(), 'trained_model.pkl')
    with open(model_out_file, 'w') as f:
        pickle.dump(pipe, f)

    return scatter_file, brain_age_scatter_file, df_life_out_file, df_nki_out_file, df_big_out_file, model_out_file, df_res_out_file, \
           tuning_curve_file, scatter_file_cv, learning_curve_plot_file, learning_curve_df_file
Ejemplo n.º 4
0
def run_prediction_from_trained_model_fct(trained_model_file,
                                          X_file,
                                          target_name,
                                          selection_criterium,
                                          df_file,
                                          data_str,
                                          regress_confounds=False):
    import os, pickle
    import numpy as np
    import pandas as pd
    from sklearn.metrics import r2_score, mean_absolute_error
    from LeiCA_LIFE.learning.learning_utils import pred_real_scatter, plot_brain_age, residualize_group_data

    data_str = target_name + '__' + selection_criterium + '__' + data_str

    df = pd.read_pickle(df_file)
    df['pred_age_test'] = np.nan

    X_test = np.load(X_file)
    y_test = df[[target_name]].values.squeeze()
    confounds = df[['mean_FD_P']].values

    # REGRESS OUT CONFOUNDS IF NEEDED
    if regress_confounds:
        X_test = residualize_group_data(X_test, confounds)

    with open(trained_model_file, 'r') as f:
        pipe = pickle.load(f)

    # RUN PREDICTION
    y_predicted = pipe.predict(X_test)

    df.ix[:, ['pred_age_test']] = y_predicted

    test_mae = mean_absolute_error(y_test, y_predicted)
    test_r2 = r2_score(y_test, y_predicted)
    test_rpear2 = np.corrcoef(y_test, y_predicted)[0, 1]**2

    train_r2 = np.nan
    train_mae = np.nan

    # SCATTER PLOTS
    title_str = 'r2: {:.3f} MAE:{:.3f}'.format(test_r2, test_mae)
    scatter_file = pred_real_scatter(y_test, y_predicted, title_str, data_str)

    brain_age_scatter_file = plot_brain_age(y_test, y_predicted, data_str)

    df_use_file = os.path.join(os.getcwd(), data_str + '_df_predicted.pkl')
    df.to_pickle(df_use_file)

    # performace results df
    df_res_out_file = os.path.abspath(data_str + '_df_results.pkl')
    df_res = pd.DataFrame(
        {
            'FD_res': regress_confounds,
            'r2_train': [train_r2],
            'MAE_train': [train_mae],
            'r2_test': [test_r2],
            'rpear2_test': [test_rpear2],
            'MAE_test': [test_mae]
        },
        index=[data_str])
    df_res.to_pickle(df_res_out_file)

    return scatter_file, brain_age_scatter_file, df_use_file, df_res_out_file