elif args.model == 'mlp':
            k = GPy.kern.MLP(X.shape[1], ARD=args.ard)

        if args.bias:
            k = k + GPy.kern.Bias(X.shape[1])
        #if args.label_preproc == "warp":
        #    model = GPy.models.WarpedGP(X_train, Y_train, kernel=k)
        #    model['warp_tanh.psi'] = np.random.lognormal(0, 1, (3, 3))
        #else:
            #model = GPy.models.GPRegression(X_train, Y_train, kernel=k)
        icmk = GPy.util.multioutput.ICM(input_dim=X.shape[1], num_outputs=6, 
                                        kernel=k, W_rank=args.rank)
        model = GPy.models.GPCoregionalizedRegression(X_train_list,
                                                      Y_train_list,
                                                      kernel=icmk)
        model.optimize(messages=True, max_iters=100)
        print model
    
        # Get predictions
        info_dict = {}
        preds_list = []
        vars_list = []
        if args.model == 'ridge' or args.model == 'svr':
            preds = model.predict(X_test)
            if args.label_preproc == 'scale':
                preds = Y_scaler.inverse_transform(preds)
            elif args.label_preproc == 'warp':
                preds += 50
            info_dict['mae'] = MAE(preds, Y_test.flatten())
            info_dict['rmse'] = np.sqrt(MSE(preds, Y_test.flatten()))
            info_dict['pearsonr'] = pearsonr(preds, Y_test.flatten())
Esempio n. 2
0
            k = GPy.kern.Matern32(X.shape[1], ARD=args.ard)
        elif args.model == 'mat52':
            k = GPy.kern.Matern52(X.shape[1], ARD=args.ard)
        elif args.model == 'ratquad':
            k = GPy.kern.RatQuad(X.shape[1], ARD=args.ard)
        elif args.model == 'linear':
            k = GPy.kern.Linear(X.shape[1], ARD=args.ard)
        elif args.model == 'mlp':
            k = GPy.kern.MLP(X.shape[1], ARD=args.ard)
        k = k + GPy.kern.Bias(1)
        if args.label_preproc == "warp":
            model = GPy.models.WarpedGP(X_train, Y_train, kernel=k)
            model['warp_tanh.psi'] = np.random.lognormal(0, 1, (3, 3))
        else:
            model = GPy.models.GPRegression(X_train, Y_train, kernel=k)
        model.optimize(messages=True, max_iters=100)

    # Get predictions
    info_dict = {}
    if args.model == 'ridge' or args.model == 'svr':
        preds = model.predict(X_test)
        if args.label_preproc == 'scale':
            preds = Y_scaler.inverse_transform(preds)
        info_dict['mae'] = MAE(preds, Y_test.flatten())
        info_dict['rmse'] = np.sqrt(MSE(preds, Y_test.flatten()))
        info_dict['pearsonr'] = pearsonr(preds, Y_test.flatten())
    else:
        # TODO: check if this makes sense
        #preds, vars = model.predict(X_test)
        preds, vars = model.predict_noiseless(X_test)
        if args.label_preproc == 'scale':
def _fit_baseline_model(meth, X, y):
    """
        Fit the regressor selected by ``meth`` on the training data.

        :param meth: method name, matched case-insensitively against
            'ridgereg', 'bayesianreg', 'pls', 'lasso', 'lingpr', 'rf', 'gpr', 'gprard'
        :param X: training inputs (array or DataFrame, one row per sample)
        :param y: training targets (column array when scaled, Series otherwise)

        :returns: ``(model, kernel_description)``. ``kernel_description`` is the string form
            of the kernel for 'lingpr' and None for every other method.
            ``(None, None)`` when ``meth`` is not one of the names above.
    """
    name = meth.lower()
    str_kernel = None

    ######### 1 : Ridge Regression
    if name == 'ridgereg':
        regModel = RidgeCV(alphas=np.logspace(-3,0), fit_intercept=True,
            normalize=False, store_cv_values=False, gcv_mode='svd',
            cv=5).fit(X,y)
        # Only flatten a 2-D coef_ (column-vector y). Taking [0] of an already
        # 1-D coef_ would keep a single scalar weight and break predict().
        if np.ndim(regModel.coef_) > 1:
            regModel.coef_ = regModel.coef_[0]

    elif name == 'bayesianreg':
        regModel = sk.linear_model.BayesianRidge(n_iter=500, tol=1.e-6, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, fit_intercept=False, normalize=False).fit(X,np.ravel(y))

    elif name == 'pls':
        n = 3
        regModel = PLSRegression(n_components=n, scale=False).fit(X,y)
        regModel.coef_ = np.squeeze(np.transpose(regModel.coef_))

    elif name == 'lasso':
        # NOTE(review): np.logspace(-2,0,1) yields the single value 0.01, so the
        # cross-validation has only one candidate alpha — confirm this is intended.
        regModel = LassoCV(alphas=np.logspace(-2,0,1), n_alphas=500,
                           fit_intercept=True, max_iter=5000, cv=5).fit(X,np.ravel(y))

    elif name == 'lingpr':
        kernel = kernels.DotProduct(sigma_0 = 1, sigma_0_bounds=(1e-05, 1e05)) + \
             1.0 * kernels.WhiteKernel(noise_level=1e-3, noise_level_bounds=(1e-3, 1e+3))
        regModel = GaussianProcessRegressor(kernel=kernel, optimizer='fmin_l_bfgs_b',
                                            alpha=0, n_restarts_optimizer=5).fit(X,y)
        str_kernel = str(kernel)

    elif name == 'rf':
        import sklearn.ensemble
        regModel = sklearn.ensemble.RandomForestRegressor(n_estimators=500,
                criterion='mse', max_features='sqrt',
                max_depth=15, min_samples_split=2,
                min_samples_leaf=1).fit(X,np.ravel(y))
        # Feature importances are exposed as "weights" in the summary.
        regModel.coef_ = regModel.feature_importances_

    elif name == 'gpr':
        kernel = 1.0 * kernels.RBF(length_scale=1.0, length_scale_bounds=(1e0, 1e2)) + \
            1.0 * kernels.WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-2, 1e2))
        regModel = GaussianProcessRegressor(kernel=kernel, optimizer='fmin_l_bfgs_b',
                                            alpha=0.5,
                                            n_restarts_optimizer=5).fit(X,y)

    elif name == 'gprard':
        # Hand GPy plain arrays with y as a column, so the unscaled
        # DataFrame/Series path is accepted as well (assumes GPy wants
        # 2-D ndarrays — TODO confirm against the installed version).
        X_gp = np.asarray(X)
        y_gp = np.asarray(y).reshape(-1, 1)
        n_dims = X_gp.shape[1]

        k = (GPy.kern.RBF(n_dims, ARD=True)
             + GPy.kern.White(n_dims, 1)
             + GPy.kern.Bias(n_dims, 1))

        regModel = GPy.models.GPRegression(X_gp, y_gp, kernel=k)
        regModel.optimize('scg', max_iters=200)
        # The ARD length-scales are exposed as "weights" in the summary.
        regModel.coef_ = np.array(regModel.sum.rbf.lengthscale)

    else:
        return None, None

    return regModel, str_kernel


def _predict_baseline(meth, regModel, X, x):
    """
        Predict on the training inputs ``X`` and the test inputs ``x``.

        :param meth: method name used to fit ``regModel`` (case-insensitive)
        :param regModel: fitted model returned by ``_fit_baseline_model``
        :param X: training inputs
        :param x: test inputs

        :returns: ``(y_tr_h, y_ts_h, y_tr_std, y_ts_std)`` as flat arrays. The two std entries
            are None unless ``meth`` is 'bayesianreg' or 'gpr'.
    """
    name = meth.lower()
    y_tr_std = None
    y_ts_std = None

    if name == 'gprard':
        # GPy's predict returns a tuple; element 0 is used as the prediction.
        y_ts_h = regModel.predict(np.asarray(x))[0].reshape(-1,)
        y_tr_h = regModel.predict(np.asarray(X))[0].reshape(-1,)

    elif name in ('bayesianreg', 'gpr'):
        # Always ask for the std: with return_std=False predict returns a single
        # array and the two-value unpacking would fail.
        y_ts_h, y_ts_std = regModel.predict(x, return_std=True)
        y_ts_h, y_ts_std = y_ts_h.reshape(-1,), y_ts_std.reshape(-1,)
        y_tr_h, y_tr_std = regModel.predict(X, return_std=True)
        y_tr_h, y_tr_std = y_tr_h.reshape(-1,), y_tr_std.reshape(-1,)

    else:
        y_ts_h = regModel.predict(x).reshape(-1,)
        y_tr_h = regModel.predict(X).reshape(-1,)

    return y_tr_h, y_ts_h, y_tr_std, y_ts_std


def run_baselines_particle_size(data, options):
    """
        Run regression model(s) on the data, once per combination of separation method,
        leg, regression method and repetition, and collect scores for each run.

        :param data: input dataset (DataFrame), training and test mixed. The last column is used
            as the target y; LOG_Y transforms the column named 'parbin'
            (presumably the same last column — TODO confirm).
        :param options: Dictionary containing options.  Fields are:

        |    SEP_METHOD : list, e.g. ['interpolation','prediction'] -> Mode of learning / testing
        |    NUM_REP : N -> number of repetitions; repetition i seeds numpy's RNG with i
        |    LEG_P : list of ints -> On which leg to predict. Treated separately by the models.
        |       0 means no per-leg filtering (a 'leg' column, if present, is dropped)
        |    METHODS : ['ridgereg', 'bayesianreg', 'pls', 'lasso', 'lingpr', 'rf', 'gpr', 'gprard']
        |       -> Regression method (case-insensitive)
        |    LOG_Y : BOOL -> whether to take the log of y (e.g. concentrations)
        |    NORMALIZE_Y : BOOL -> whether to normalize outputs y
        |    NORMALIZE_X : BOOL -> whether to normalize inputs x
        |    SAVEFOLDER : STRING or path -> folder address to store results (created if missing)
        |    MODELNAME : STRING -> name of the model file and scores
        |    SPLIT_SIZE : FLOAT [0,1] -> percentage of training to test datapoints
        |    TRN_TEST_INDEX : DF -> integers telling whether each point belongs to the training
        |       set (=1) or to the test set (=2). Has to be same size of data.shape[0].
        |       When it contains no non-zero value the split is sampled instead.
        |    SUB_OPTIONS : forwarded to modeling.sample_trn_test_index; only read when the
        |       split is sampled
        |    SAVE_PRED : BOOL -> store predicted values for trn and test (denormalized if needed)

        :returns: A dictionary keyed by 'leg_<leg>_<sep_method>_<method>_<rep>'. Each value holds
            'tr_RMSE', 'tr_R2', 'ts_RMSE', 'ts_R2', 'tr_size', 'ts_size', plus 'weights' /
            'regularizer' / 'kernel' when the model provides them and, if SAVE_PRED,
            'y_tr_hat', 'y_ts_hat', 'y_tr_gt', 'y_ts_gt' (and 'y_tr_std', 'y_ts_std'
            for 'bayesianreg' and 'gpr').
    """

    SEP_METHOD = options['SEP_METHOD']
    NUM_REP = options['NUM_REP']
    LEG_P = options['LEG_P']
    METHODS = options['METHODS']
    NORMALIZE_Y = options['NORMALIZE_Y']
    NORMALIZE_X = options['NORMALIZE_X']
    SAVEFOLDER = options['SAVEFOLDER']
    MODELNAME = options['MODELNAME']
    SPLIT_SIZE = options['SPLIT_SIZE']
    TRN_TEST_INDEX = options['TRN_TEST_INDEX']
    LOG_Y = options['LOG_Y']
    SAVE_PRED = options['SAVE_PRED']

    os.makedirs(SAVEFOLDER, exist_ok=True)

    # os.path.join accepts both str and path objects; the `/` operator only
    # works when SAVEFOLDER is a pathlib path.
    if os.path.exists(os.path.join(SAVEFOLDER, MODELNAME)):
        print("file exists, overwriting")

    summ = {}
    for sep_method in SEP_METHOD:
        for leg in LEG_P:
            for meth in METHODS:
                for nre in range(NUM_REP):
                    np.random.seed(nre)

                    string_exp = 'leg_' + str(leg) + '_' + sep_method + '_' + meth + '_' + str(nre)

                    data_f = data.copy()
                    if leg != 0:
                        if 'leg' not in data.columns:
                            data_f = dataset.add_legs_index(data_f)
                        # Non-inplace drop: the .loc selection may be a view of data_f.
                        data_f = data_f.loc[data_f['leg'] == leg].drop('leg', axis=1)
                    elif 'leg' in data.columns:
                        data_f = data_f.drop('leg', axis=1)

                    leg_whole_ = data_f.dropna().copy()
                    if LOG_Y:
                        # Small offset keeps log() finite for zero values.
                        leg_whole_.loc[:,'parbin'] = leg_whole_.parbin.apply(lambda v: np.log(v + 1e-10))

                    # Too few samples to fit anything meaningful: skip this run.
                    if leg_whole_.shape[0] < 10:
                        continue

                    if not TRN_TEST_INDEX.values.any():
                        # mode = 'interpolation', 'prediction', 'temporal_subset'
                        inds = modeling.sample_trn_test_index(leg_whole_.index, split=SPLIT_SIZE,   mode=sep_method, group='all', options=options['SUB_OPTIONS'])

                        trn = leg_whole_.loc[(inds.iloc[:,0]==1),:].copy()
                        tst = leg_whole_.loc[(inds.iloc[:,0]==2),:].copy()
                    else:
                        # NOTE(review): the mask has data.shape[0] entries while leg_whole_
                        # may have lost rows (dropna / leg filter) — confirm callers only
                        # combine a manual index with unfiltered data.
                        trn = leg_whole_.loc[TRN_TEST_INDEX.values == 1,:].copy()
                        tst = leg_whole_.loc[TRN_TEST_INDEX.values == 2,:].copy()

                    inds_trn = trn.index
                    inds_tst = tst.index

                    # Standardize data to 0 mean unit variance based on training statistics (assuming stationarity)
                    if NORMALIZE_X:
                        scalerX = preprocessing.StandardScaler().fit(trn.iloc[:,:-1])
                        X = scalerX.transform(trn.iloc[:,:-1])
                    else:
                        X = trn.iloc[:,:-1]

                    if NORMALIZE_Y:
                        scalerY = preprocessing.StandardScaler().fit(trn.iloc[:,-1].values.reshape(-1, 1))
                        y = scalerY.transform(trn.iloc[:,-1].values.reshape(-1, 1))
                    else:
                        y = trn.iloc[:,-1]

                    # Fresh model and kernel description for every run, so nothing
                    # from a previous method leaks into this run's summary.
                    regModel, str_kernel = _fit_baseline_model(meth, X, y)
                    if regModel is None:
                        print('method not implemented yet. Or check the spelling')
                        break

                    if NORMALIZE_X:
                        x = scalerX.transform(tst.iloc[:,:-1])
                    else:
                        x = tst.iloc[:,:-1]

                    y_ts_gt = tst.iloc[:,-1]

                    y_tr_h, y_ts_h, y_tr_std, y_ts_std = _predict_baseline(meth, regModel, X, x)

                    if NORMALIZE_Y:
                        # The scaler works on column vectors; flatten again afterwards.
                        y_tr_h = scalerY.inverse_transform(y_tr_h.reshape(-1, 1)).ravel()
                        y_ts_h = scalerY.inverse_transform(y_ts_h.reshape(-1, 1)).ravel()
                        y_tr_gt = scalerY.inverse_transform(y).ravel()
                    else:
                        y_tr_gt = y

                    # Undo the log transform (and its offset) before scoring.
                    if LOG_Y:
                        y_ts_gt = np.exp(y_ts_gt) - 1e-10
                        y_ts_h = np.exp(y_ts_h) - 1e-10
                        y_tr_gt = np.exp(y_tr_gt) - 1e-10
                        y_tr_h = np.exp(y_tr_h) - 1e-10

                    # Compute scores
                    mse = np.sqrt(mean_squared_error(y_ts_gt, y_ts_h))
                    r2 = r2_score(y_ts_gt, y_ts_h)

                    t_mse = np.sqrt(mean_squared_error(y_tr_gt, y_tr_h))
                    t_r2 = r2_score(y_tr_gt, y_tr_h)

                    has_coef = hasattr(regModel, 'coef_')
                    has_alpha = hasattr(regModel, 'alpha_')

                    result = {}
                    if has_coef and has_alpha:
                        result['regularizer'] = regModel.alpha_
                    if has_coef:
                        result['weights'] = regModel.coef_
                    result.update({'tr_RMSE': t_mse,
                                   'tr_R2': t_r2,
                                   'ts_RMSE': mse,
                                   'ts_R2': r2,
                                   'tr_size': trn.shape[0],
                                   'ts_size': tst.shape[0]})
                    if str_kernel is not None:
                        result['kernel'] = str_kernel
                    summ[string_exp] = result

                    if SAVE_PRED:
                        # Convert to pandas, indexed like the rows they were computed from
                        result.update({'y_tr_hat': pd.Series(y_tr_h,index=inds_trn),
                                       'y_ts_hat': pd.Series(y_ts_h,index=inds_tst),
                                       'y_tr_gt': pd.Series(np.asarray(y_tr_gt).reshape(-1,),index=inds_trn),
                                       'y_ts_gt': pd.Series(np.asarray(y_ts_gt).reshape(-1,),index=inds_tst)})

                        if y_ts_std is not None:
                            # NOTE(review): the std values are stored as returned by the
                            # model, i.e. not inverse-scaled / un-logged like the
                            # predictions — confirm whether consumers expect that.
                            result.update({'y_tr_std': pd.Series(y_tr_std,index=inds_trn),
                                           'y_ts_std': pd.Series(y_ts_std,index=inds_tst)})

    # save_obj(summ, SAVEFOLDER / MODELNAME)
    return summ