Example #1
    def test_posterior(self):
        """Check the posterior over weights function returns mean and covar."""
        clf = RVR()

        x = np.array([[1, 2], [3, 4]])
        y = np.array([[5, 6], [7, 8]])

        clf.phi = clf._apply_kernel(x, y)

        clf.alpha_ = np.ones(3)
        clf.m_ = np.ones(3)
        clf.beta_ = 1
        clf.y = np.array([1, 1])

        clf._posterior()

        m_target = np.array([6.103885e-03, 3.750334e-08, 6.666294e-01])
        sigma_target = np.array([
            [9.997764e-01, -1.373791e-09, -6.103885e-03],
            [-1.373791e-09, 1.000000e+00, -3.750334e-08],
            [-6.103885e-03, -3.750334e-08, 3.333706e-01]
        ])

        np.testing.assert_allclose(clf.m_, m_target)
        np.testing.assert_allclose(clf.sigma_, sigma_target)
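For reference, a minimal NumPy sketch of the weight-posterior update this test exercises, assuming skrvm follows the standard RVM form from Tipping (2001), sigma = inv(A + beta * Phi^T Phi) and m = beta * sigma * Phi^T y; the exact kernel and bias handling inside _apply_kernel may differ:

import numpy as np

def rvm_posterior(phi, alpha, beta, y):
    # Sketch of the standard RVM weight posterior; not skrvm's actual code.
    A = np.diag(alpha)                             # prior precision matrix
    sigma = np.linalg.inv(A + beta * phi.T @ phi)  # posterior covariance
    m = beta * sigma @ phi.T @ y                   # posterior mean
    return m, sigma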
Example #2
    def test_predict(self):
        """Check the predict function works with pre-set values."""
        clf = RVR(kernel='linear', bias_used=False)

        clf.relevance_ = np.array([[1, 1]])
        clf.m_ = np.array([1])

        y = clf.predict(np.array([1, 1]))
        self.assertEqual(y, 2)
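Spelled out: with a plain linear kernel k(x, r) = x . r and bias_used=False, the prediction is presumably just the kernel row times the posterior mean, which a hand computation confirms:

import numpy as np

# Hand-computed version of the assertion above (assuming k(x, r) = x . r):
x_new = np.array([[1, 1]])
relevance = np.array([[1, 1]])
m = np.array([1])
y = (x_new @ relevance.T) @ m  # (1*1 + 1*1) * 1 = [2]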
Example #3
    def __init__(self, HI):
        if HI.shape[0] == 1:
            HI = HI.reshape(1, 1)
            timesteps = np.arange(len(HI)).reshape(len(HI), HI.shape[0])
        else:
            timesteps = np.arange(len(HI)).reshape(len(HI), HI.shape[1])
        self.rvrmodel = RVR(kernel='linear')
        self.optimize(timesteps, HI)
Example #4
    def test_fit(self):
        """Check the fit function works correctly."""
        clf = RVR(kernel='linear', threshold_alpha=1e3, verbose=True)

        X = np.array([
            [1],
            [2],
            [3],
        ])
        y = np.array([1, 2, 3])
        np.random.seed(1)
        y = y + 0.1 * np.random.randn(y.shape[0])

        clf.fit(X, y)

        m_target = np.array([0.065906, 0.131813, 0.197719, 0.159155])
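        # (4 weights for 3 training samples: skrvm appends a bias column by
        # default, bias_used=True, so the last entry is the bias weight)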

        np.testing.assert_array_equal(clf.relevance_, X)
        np.testing.assert_allclose(clf.m_, m_target, rtol=1e-3)
Example #5
    def test_regression_sinc(self):
        """Check regression works with y=sinc(x)."""
        clf = RVR()
        x = np.linspace(0, 10, 101)
        y = np.sinc(x)

        np.random.seed(1)
        y = y + 0.1 * np.random.randn(y.shape[0])

        X = x[:, np.newaxis]

        clf.fit(X, y)
        score = clf.score(X, y)

        m_target = [
            1.117655e+00, -6.334513e-01, 5.868671e-01, -4.370936e-01,
            2.320311e-01, -4.638864e-05, -7.505325e-02, 6.133291e-02
        ]

        self.assertGreater(score, 0.85)
        np.testing.assert_allclose(clf.m_, m_target, rtol=1e-3)
        self.assertEqual(clf.relevance_.shape, (8, 1))

        prediction, mse = clf.predict(np.array([[0.5]]), eval_MSE=True)
        self.assertAlmostEqual(prediction[0], 0.611, places=3)
        self.assertAlmostEqual(mse[0], 0.00930, places=5)
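For reference, eval_MSE presumably returns the standard RVM predictive variance, var(x*) = 1/beta + phi(x*) Sigma phi(x*)^T; a sketch of that quantity under the same Tipping (2001) convention as above:

import numpy as np

def rvm_predictive_variance(phi_star, sigma, beta):
    # Noise variance plus model uncertainty, one value per query row of phi_star.
    return 1.0 / beta + np.einsum('ij,jk,ik->i', phi_star, sigma, phi_star)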
Example #6
    def test_regression_linear_noise(self):
        """Check regression works with a linear function with added noise."""
        clf = RVR(kernel='linear', alpha=1e11)

        x = np.arange(1, 101)
        y = x + 5

        np.random.seed(1)
        y = y + 0.1 * np.random.randn(y.shape[0])

        X = x[:, np.newaxis]

        clf.fit(X, y)
        score = clf.score(X, y)

        m_target = np.array([1, 5])
        rel_target = np.array([[1]])

        self.assertGreater(score, 0.99)
        np.testing.assert_allclose(clf.m_, m_target, rtol=1e-2)
        np.testing.assert_allclose(clf.relevance_, rel_target)
        self.assertAlmostEqual(clf.beta_, 126.583, places=3)

        prediction, mse = clf.predict(np.array([[50]]), eval_MSE=True)
        self.assertAlmostEqual(prediction[0], 55.006, places=3)
        self.assertAlmostEqual(mse[0], 0.00798, places=5)
Example #7
class RVRDegradationModel:
    def __init__(self, HI):
        if HI.shape[0] == 1:
            HI = HI.reshape(1, 1)
            timesteps = np.arange(len(HI)).reshape(len(HI), HI.shape[0])
        else:
            timesteps = np.arange(len(HI)).reshape(len(HI), HI.shape[1])
        self.rvrmodel = RVR(kernel='linear')
        self.optimize(timesteps, HI)

    def optimize(self, X, Y):
        self.rvrmodel.fit(X, Y)

    def update(self, X, Y):
        self.optimize(X, Y)

    def predict(self, X):
        # self.rvrmodel.fit(X, X)
        Yp = self.rvrmodel.predict(X)
        print(Yp)
        return Yp
Example #8
def rvr_pipeline(x, y, pca, kernel, x_p=0, y_p=0, fold=10, seed=2019,
                 predict_data=False):
    from sklearn.preprocessing import StandardScaler

    rvr = RVR(kernel=kernel)
    kf = KFold(n_splits=fold, shuffle=True, random_state=seed)
    scaler = StandardScaler()  # assumed: the original relied on an undefined global `scaler`
    score = np.zeros((fold,))
    i = 0
    for train, test in kf.split(x, y):
        t1 = time.time()
        x_train, y_train = x[train], y[train]
        x_test, y_test = x[test], y[test]
        pca.fit(x_train)
        new_train = pca.transform(x_train)
        new_test = pca.transform(x_test)
        scaler.fit(new_train)
        new_train = scaler.transform(new_train)
        new_test = scaler.transform(new_test)
        rvr.fit(new_train, y_train)
        pred = rvr.predict(new_test)
        abs_err = np.abs(pred - y_test)  # absolute errors (misnamed `mse` in the original)
        score[i] = abs_err.mean()
        i += 1
        t2 = time.time()
        print('fold ' + str(i) + ':', t2 - t1, 'sec')
    print('=' * 40)
    print('MAE:', np.mean(score))
    
    if predict_data:
        pca.fit(x)
        new_train = pca.transform(x)
        new_test = pca.transform(x_p)
        scaler.fit(new_train)
        new_train = scaler.transform(new_train)
        new_test = scaler.transform(new_test)
        rvr.fit(new_train, y)
        pred = rvr.predict(new_test)
        error = np.abs(pred - y_p)
        print('Test MAE:', error.mean())

    # Note: if predict_data is False, this returns the last CV fold's predictions.
    return pred
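The function assumes np, time, KFold, and RVR are in scope; a purely hypothetical call with placeholder data (names invented for illustration) might look like:

import time
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from skrvm import RVR

x_demo = np.random.randn(50, 20)   # placeholder features
y_demo = np.random.randn(50)       # placeholder targets
pred = rvr_pipeline(x_demo, y_demo, PCA(n_components=5), kernel='linear', fold=5)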
Example #9
    def test_regression_linear(self):
        """Check regression works with a linear function."""
        clf = RVR(kernel='linear', alpha=1e11)

        x = np.arange(1, 100)
        y = x + 5

        X = x[:, np.newaxis]

        clf.fit(X, y)

        score = clf.score(X, y)

        m_target = np.array([1, 5])
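        # With a linear kernel plus a bias term, m_ presumably recovers
        # [slope, intercept] = [1, 5] for y = x + 5.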

        self.assertGreater(score, 0.99)
        np.testing.assert_allclose(clf.m_, m_target)

        prediction, mse = clf.predict(np.array([[50]]), eval_MSE=True)
        self.assertAlmostEqual(prediction[0], 55, places=3)
        self.assertAlmostEqual(mse[0], 6.18e-6, places=3)
Example #10
from skrvm import RVR
from skrvm import RVC
from sklearn.datasets import load_iris

X = [[0, 0], [2, 2]]
y = [0.5, 2.5]
clf = RVR(kernel='linear')
# clf = RVR(kernel='rbf')
# clf = RVR(kernel='poly')
clf.fit(X, y)
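# fit() returns the estimator itself; the bare RVR(...) block below appears to
# be the repr echoed by an interactive session, not a second model: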

RVR(alpha=1e-06,
    beta=1e-06,
    beta_fixed=False,
    bias_used=True,
    coef0=0.0,
    coef1=None,
    degree=3,
    kernel='linear',
    n_iter=3000,
    threshold_alpha=1000000000.0,
    tol=0.001,
    verbose=True)

print(clf.predict([[1, 1]]))

# clf = RVC()
# clf.fit(load_iris().data, load_iris().target)
# RVC(alpha=1e-06, beta=1e-06, beta_fixed=False, bias_used=True, coef0=0.0,
#     coef1=None, degree=3, kernel='rbf', n_iter=3000, n_iter_posterior=50,
#     threshold_alpha=1000000000.0, tol=0.001, verbose=False)
Example #11
def benchmark():
    # Any integer value between 1 and 3 to select the number of subplots to show:
    num_figures = 2

    # Parameters to generate training data
    num_samples = 100
    noise_level = 0.1
    training_data_range = 10

    # Training data
    X, y = generate_training_data(num_samples, noise_level, training_data_range)

    # Fit
    gpr = GaussianProcessRegressor(kernel=RBF() + WhiteKernel())
    gpr.fit(X, y)

    ## Implementation of RVR by skrvm
    rvr = RVR(kernel='rbf')
    rvr.fit(X, y)

    ## Implementation of RVR by sklearn_rvm
    # Caveat: Since sklearn v.0.22, the default value of gamma changed from ‘auto’ to ‘scale’.
    # Reference: https://github.com/Mind-the-Pineapple/sklearn-rvm/issues/9
    emrvr = EMRVR(kernel='rbf',
                  gamma='auto')
    emrvr.fit(X, y)
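    # Assumption for context: gamma='scale' sets the RBF width to
    # 1 / (n_features * X.var()), while gamma='auto' uses 1 / n_features;
    # 'auto' is pinned here so results stay comparable across sklearn versions.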

    # Predict
    plot_params = get_plot_params()
    X_plot = np.linspace(plot_params['x_low'], plot_params['x_high'], 10000)[:, None]

    # Caveat:
    # generating the variance of the predictive distribution takes considerably longer than just predicting the mean.
    # Reference:
    # https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_compare_gpr_krr.html
    y_gpr, y_gpr_std = gpr.predict(X_plot, return_std=True)

    ## Implementation of RVR by skrvm
    y_rvr = rvr.predict(X_plot)
    y_rvr_std = None

    ## Implementation of RVR by sklearn_rvm
    y_emrvr, y_emrvr_std = emrvr.predict(X_plot, return_std=True)

    # Plot
    fig, axs = plt.subplots(num_figures, 1, figsize=(15, 7))

    try:
        # In case there is strictly more than one subplot, there is no issue.
        num_sub_plots = len(axs)
    except TypeError:
        # In case there is exactly 1 subplot, we have to ensure that axs is a list, for code compatibility.
        axs = [axs]
        num_sub_plots = len(axs)

    print('Plotting {} subplots.'.format(num_sub_plots))

    plot_results(X, y, emrvr, gpr, X_plot, y_emrvr, y_gpr,
                 "sklearn_rvm", "GPR", y_emrvr_std, y_gpr_std,
                 rvr_color='navy', gpr_color='darkorange',
                 training_data_range=training_data_range,
                 ax=axs[0])

    if len(axs) > 1:
        plot_results(X, y, emrvr, rvr, X_plot, y_emrvr, y_rvr,
                     "sklearn_rvm", "skrvm", y_emrvr_std, y_rvr_std,
                     rvr_color='navy', gpr_color='purple',
                     training_data_range=training_data_range,
                     ax=axs[1])

    if len(axs) > 2:
        plot_results(X, y, rvr, gpr, X_plot, y_rvr, y_gpr,
                     "skrvm", "GPR", y_rvr_std, y_gpr_std,
                     rvr_color='purple', gpr_color='darkorange',
                     training_data_range=training_data_range,
                     ax=axs[2])

    plt.show()

    return
Example #12
def final_train(x, y, x_test, y_test, out_list, mn, age_group_all):
    model = []
    best_score = []

    if mn == 'LAD':
        print(out_list)
        [C_list,
         score_list] = zip(*[(item[6]['C'], item[5]) for item in out_list])
        C_final = np.median(C_list)
        best_score = np.mean(score_list)
        print('in final LAD')
        print('para', C_list, C_final, 'score', score_list, best_score)
        model = LAD(epsilon=0.0,
                    tol=0.0001,
                    C=C_final,
                    loss='epsilon_insensitive',
                    fit_intercept=True,
                    intercept_scaling=1.0,
                    dual=True,
                    verbose=0,
                    random_state=None,
                    max_iter=10000)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RFR':
        [n_est_list, score_list] = zip(*[(item[6]['n_estimators'], item[5])
                                         for item in out_list])
        n_est = int(np.median(n_est_list))
        best_score = np.mean(score_list)
        print('in final RFR')
        print('n_est_list', n_est_list, n_est, 'score', score_list, best_score)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [n_est]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'PLSR':
        [n_comp_list, score_list] = zip(*[(item[6]['n_components'], item[5])
                                          for item in out_list])
        n_comp = int(np.median(n_comp_list))
        best_score = np.mean(score_list)
        print('in final PLSR')
        print('n_comp_list', n_comp_list, n_comp, 'score', score_list,
              best_score)
        pls_reg = PLSRegression()
        params = {'n_components': [n_comp]}
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RR':
        from sklearn.linear_model import Ridge, RidgeCV
        [n_comp_list,
         score_list] = zip(*[(item[6]['alpha'], item[5]) for item in out_list])
        n_comp = int(np.median(n_comp_list))
        best_score = np.mean(score_list)
        print('in final RR')
        print('n_comp_list', n_comp_list, n_comp, 'score', score_list,
              best_score)
        ridge = Ridge()
        params = {'alpha': [n_comp]}
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RVM':
        from skrvm import RVR
        print('in final RVM')
        model = RVR(kernel='linear')
        model.fit(x, y)
        best_score = 0
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'COMB':
        print('IN COMB')
        group_lad = dict()
        from mord import LAD
        from sklearn.ensemble import RandomForestRegressor

        print('shapes', x.shape, y.shape)

        lad1 = LAD(epsilon=0.0,
                   tol=0.0001,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1,
                                 param_grid=params,
                                 cv=5,
                                 scoring='neg_mean_absolute_error',
                                 verbose=0)
        broad_lad.fit(x, y)

        for ages in age_group_all:
            # print('ages', ages)
            idx_grp = list()
            for item in ages:  # for every age in the age group collect the training data by getting the indices
                for idx, val in enumerate(y):
                    if val == item:
                        idx_grp.append(idx)

            key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
            x_samples_train = x[idx_grp]
            y_samples_train = y[idx_grp]
            # print('y_samples_train', y_samples_train)

            lad2 = LAD(epsilon=0.0,
                       tol=0.0001,
                       loss='epsilon_insensitive',
                       fit_intercept=True,
                       intercept_scaling=1.0,
                       dual=True,
                       verbose=0,
                       random_state=None,
                       max_iter=10000)
            params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
            specific_lad = GridSearchCV(lad2,
                                        param_grid=params2,
                                        cv=5,
                                        scoring='neg_mean_absolute_error',
                                        verbose=0)
            specific_lad.fit(x_samples_train, y_samples_train)
            group_lad[key_age_grp] = specific_lad

        pred_all = make_predictions(x, broad_lad, group_lad)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(pred_all, y)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))

        best_score = model.best_score_
        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(mn, model, pred_all_test, y_test)

    return model, best_score, pred_var
Example #13
    def __init__(self):
        from skrvm import RVR
        self.model = RVR(
            verbose=False,
            kernel="rbf",
        )
Example #14
def train(m, x_train, y_train, x_test, y_test):
    print('training', m)
    model = []
    pred_var = {}

    if m == 'LAD':
        from mord import LAD
        lad = LAD(epsilon=0.0,
                  tol=0.0001,
                  loss='epsilon_insensitive',
                  fit_intercept=True,
                  intercept_scaling=1.0,
                  dual=True,
                  verbose=0,
                  random_state=None,
                  max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(lad,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)

        y_train = y_train.astype(float).round()
        y_train = y_train.astype(int)

        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LAD grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'MCLog':  # this class is not available
        from sklearn.linear_model import LogisticRegression
        mcl = LogisticRegression(multi_class='multinomial',
                                 max_iter=10000,
                                 solver='newton-cg',
                                 fit_intercept=True)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(mcl,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] MCLog grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'LogAT':  # takes quite some time
        from mord import LogisticAT
        lat = LogisticAT()
        params = {"alpha": np.linspace(0, 1, 5)}
        model = GridSearchCV(lat,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LogAT grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'LinearSVC':
        from sklearn.svm import LinearSVC
        svm = LinearSVC()
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(svm, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LinearSVC grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        params = {"n_estimators": [10, 100, 500, 1000]}
        model = GridSearchCV(rfc, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFC grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'Lasso':
        from sklearn.linear_model import Lasso
        from sklearn.linear_model import LassoCV
        svm = Lasso()
        params = {"alpha": [10]}
        model = GridSearchCV(svm, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))
        # model = LassoCV(n_alphas=10, cv=5, verbose=3)
        # model.fit(x_train, y_train)
        # print("[INFO] Lasso path search best parameter: {}".format(model.alpha_))

    elif m == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'RR':
        from sklearn.linear_model import Ridge, RidgeCV
        ridge = Ridge()
        params = {
            'alpha':
            [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        }
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        print("[INFO] Ridge Regression grid search best parameters: {}".format(
            model.best_params_))
        # model = RidgeCV(alphas=(0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), cv=5)
        # model.fit(x_train, y_train)
        # print("[INFO] Ridge Regression grid search best parameters: {}".format(model.alpha_))
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'PLSR':
        from sklearn.cross_decomposition import PLSRegression
        pls_reg = PLSRegression()
        params = {
            'n_components': [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20
            ]
        }
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        # pdb.set_trace()
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        print("[INFO] PLS Regression grid search best parameters: {}".format(
            model.best_params_))
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'RVM':
        from skrvm import RVR
        print('in RVM')
        model = RVR(kernel='linear')
        # avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(model, x_train, y_train, x_test, y_test, loss='mse',
        #                                                             num_rounds=3, random_seed=123)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

        # print('Average expected loss: %.3f' % avg_expected_loss)
        # print('Average bias: %.3f' % avg_bias)
        # print('Average variance: %.3f' % avg_var)

    elif m == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        model = DecisionTreeRegressor()
        # params = {"criterion": ["mse", "mae"], "min_samples_split": [10, 20, 40], "max_depth": [2],
        #           "min_samples_leaf": [20, 40, 100], "max_leaf_nodes": [5, 20, 100]}
        # params = {"max_depth": [2,4,6]}
        # model = GridSearchCV(dtr, param_grid=params, cv=5, verbose=0)

        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'COMB':
        from sklearn.ensemble import RandomForestRegressor
        from mord import LAD
        from group_pred import create_age_groups
        print('IN COMB')
        group_lad = dict()

        print('shapes', x_train.shape, y_train.shape)

        lad1 = LAD(epsilon=0.0,
                   tol=0.0001,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1,
                                 param_grid=params,
                                 cv=5,
                                 scoring='neg_mean_absolute_error',
                                 verbose=0)

        y_train_r = y_train.astype(float).round()
        y_train_r = y_train_r.astype(int)

        broad_lad.fit(x_train, y_train_r)

        age_group_all = create_age_groups(y_train_r, 10, 5)

        for ages in age_group_all:
            # print('ages', ages)
            idx_grp = list()
            for item in ages:  # for every age in the age group collect the training data by getting the indices
                for idx, val in enumerate(y_train_r):
                    if val == item:
                        idx_grp.append(idx)

            print('group info', ages, len(idx_grp))
            if len(idx_grp) > 5:
                key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
                x_samples_train = x_train[idx_grp]
                y_samples_train = y_train_r[idx_grp]
                # print('y_samples_train', y_samples_train)

                lad2 = LAD(epsilon=0.0,
                           tol=0.0001,
                           loss='epsilon_insensitive',
                           fit_intercept=True,
                           intercept_scaling=1.0,
                           dual=True,
                           verbose=0,
                           random_state=None,
                           max_iter=10000)
                params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
                specific_lad = GridSearchCV(lad2,
                                            param_grid=params2,
                                            cv=5,
                                            scoring='neg_mean_absolute_error',
                                            verbose=0)
                specific_lad.fit(x_samples_train, y_samples_train)
                group_lad[key_age_grp] = specific_lad

        print('len_groups', len(group_lad))
        pred_all = make_predictions(x_train, broad_lad, group_lad)

        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model_2 = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model_2.fit(pred_all, y_train)

        # lad = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive', fit_intercept=True,
        #            intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=10000)
        # params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        # model_2 = GridSearchCV(lad, param_grid=params, cv=5, scoring='neg_mean_absolute_error', verbose=0)
        # model_2.fit(pred_all, y_train_r)

        train_var = predict(m, model_2, pred_all, y_train)
        print("[INFO] RFR grid search best parameters: {}".format(
            model_2.best_params_))

        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(m, model_2, pred_all_test, y_test)
        model = [broad_lad, group_lad, model_2]
    else:
        print('unknown model')

    if m in ('RVM', 'DTR'):  # the original `m == 'RVM' or 'DTR'` was always truthy
        return model, 0, 0, pred_var, train_var
    elif m == 'COMB':
        return model, model_2.best_score_, model_2.best_params_, pred_var, train_var
    else:
        return model, model.best_score_, model.best_params_, pred_var, train_var
Example #15
# Scale the entire input dataset
Coulomb_df = scaler.transform(Coulomb_df)

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
        Coulomb_df, Output_df, test_size=.2, random_state=None)

reports_df = pd.DataFrame(
    columns=['Name', 'MARE', 'MSE', 'R2'])

for regr_choice in range(5):

    regr_names = ['RF', 'SVM', 'RVM', 'Huber',
                  'XGBOOST']
    regr_objects = [RandomForestRegressor(n_estimators=400, max_depth=1000, random_state=0),
                    svm.SVR(kernel='rbf', epsilon=0.1, verbose=True),
                    RVR(kernel='rbf', n_iter=10000, tol=0.0001, verbose=True),
                    linear_model.HuberRegressor(
                        epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05),
                    XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1,
                                 max_depth=400, alpha=10, n_estimators=400)
                    ]

    regr = regr_objects[regr_choice]
    regr_name = regr_names[regr_choice]

    if reusingModels:
        regr = joblib.load('SavedModels_'+regr_name+'.pkl')
    else:
        regr.fit(X_train_scaled, y_train)

    if 'XGB' in regr_name:
Example #16
def train(x, y):
    model = RVR(kernel='rbf')
    model.fit(x, y)
    return model
X = data[[Label.Rain.value, Label.Wind.value]]
y = data[Label.PM2_5.value]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2)

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Machine learning with Random Forest, RVR, and Lasso
model1 = RandomForestRegressor(bootstrap=True, criterion="mse")
model2 = RVR(kernel="rbf")
model3 = Lasso(alpha=0.1)

model1.fit(X_train_std, y_train)
model2.fit(X_train_std, y_train)
model3.fit(X_train_std, y_train)

y_train_pred = model1.predict(X_train_std)
y_test_pred = model1.predict(X_test_std)

print("Random Forest MSE train: {0}, test: {1}".format(
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred)))

y_train_pred = model2.predict(X_train_std)
y_test_pred = model2.predict(X_test_std)
y = boston.target[:20]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2
)

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

model = LinearRegression()
model2 = SVR(kernel="rbf", C=1000.0, epsilon=6.5)
model3 = RVR(kernel="rbf")

model.fit(X_train_std, y_train)
model2.fit(X_train_std, y_train)
model3.fit(X_train_std, y_train)

y_train_pred = model2.predict(X_train_std)
y_test_pred = model2.predict(X_test_std)

print("SVR MSE train: {0}, test: {1}".format(
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred)
))

y_train_pred = model3.predict(X_train_std)
y_test_pred = model3.predict(X_test_std)
def rvr_analysis(random_seed, save_path, n_folds, analysis):
    save_path = save_path / ('random_seed_%03d' % random_seed)
    print('Random seed: %03d' % random_seed)
    # Load the saved validation dataset
    project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, dataset)
    with open(save_path / ('splitted_dataset_%s.pickle' % dataset),
              'rb') as handle:
        splitted_dataset = pickle.load(handle)

    # shuffle=True added: newer sklearn raises if random_state is set without it
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    mae_cv = np.zeros((n_folds, 1))
    pearsons_corr = np.zeros((n_folds, 1))
    pearsons_pval = np.zeros((n_folds, 1))

    # Set target and features
    x = splitted_dataset['Xtest_scaled']
    y = splitted_dataset['Ytest']

    t_time_train = []
    t_time_test = []

    for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)):
        x_train, x_test = x[train_idx, :], x[test_idx, :]
        y_train, y_test = y[train_idx], y[test_idx]

        print('CV iteration: %d' % (i_fold + 1))
        print('Shape of the training and test dataset')
        print(y_train.shape, y_test.shape)

        # train the model
        model = RVR(kernel='linear')
        cv_time_train = time.process_time()
        model.fit(x_train, y_train)
        elapsed_time = time.process_time() - cv_time_train
        print('CV - Elapsed time in seconds to train:')
        t_time_train.append(elapsed_time)
        print('%.03f' % elapsed_time)

        # test the model
        cv_time_test = time.process_time()
        y_predicted = model.predict(x_test)
        elapsed_time = time.process_time() - cv_time_test
        t_time_test.append(elapsed_time)
        print('CV - Elapsed time in seconds to test:')
        print('%.03f' % elapsed_time)

        mae_kfold = mean_absolute_error(y_test, y_predicted)
        mae_cv[i_fold, :] = mae_kfold
        # now look at the pearson's correlation
        r_test, r_p_value_test = pearsonr(y_test, y_predicted)
        pearsons_corr[i_fold, :] = r_test
        pearsons_pval[i_fold, :] = r_p_value_test

    print('CV results')
    print('MAE: Mean(SD) = %.3f(%.3f)' % (mae_cv.mean(), mae_cv.std()))
    print('Pearson\'s Correlation: Mean(SD) = %.3f(%.3f)' %
          (pearsons_corr.mean(), pearsons_corr.std()))
    print('Mean CV train time: %.3f s' % np.mean(t_time_train))
    print('SD CV train time: %.3f s' % np.std(t_time_train))
    print('Mean CV test time: %.3f s' % np.mean(t_time_test))
    print('SD CV test time: %.3f s' % np.std(t_time_test))
    print('')

    if analysis == 'vanilla_combi':
        # Train the entire dataset
        x_train_all, x_test_all, y_train_all, y_test_all = \
                train_test_split(x, y, test_size=.85, random_state=random_seed)
        print('All: Shape of the training and test dataset')
        print(y_train_all.shape, y_test_all.shape)
    elif analysis == 'uniform_dist':
        # Train the entire dataset
        x_train_all, x_test_all, y_train_all, y_test_all = \
                train_test_split(x, y, test_size=.20,  random_state=random_seed)
        print('ALL: Shape of the training and test dataset')
        print(y_train_all.shape, y_test_all.shape)
    print('Training RVR model:')
    model_all = RVR(kernel='linear')
    model_all.fit(x_train_all, y_train_all)
    # plot predicted vs true for the test (Entire sample)
    print('Plotting Predicted Vs True Age for all the sample')
    y_predicted_test = model_all.predict(x_test_all)  # model_all, not the last CV fold's model
    output_path_test = save_path / (
        'rvr_test_predicted_true_age_rnd_seed%d.eps' % random_seed)
    plot_predicted_vs_true(y_test_all, y_predicted_test, output_path_test,
                           'Age')

    return mae_cv, r_test, t_time_train, t_time_test

normal_data_all = preprocessing.scale(full_data_matrix)  # normalize
pca = PCA(10, svd_solver='auto')
pca.fit(normal_data_all)
normal_data_pca = pca.transform(normal_data_all)  # project data onto the 10 PCA components


#####################################################################################
## We use the image semantics from each subject for training afterwards
n_subject_to_use = 1
n_observations = n_subject_to_use * 690
X_train, X_test, y_train_index, y_test_index = train_test_split(
    normal_data_pca[range(n_observations), :], range(n_observations),
    test_size=0.2)

mean_err = np.zeros((2048, 1))
for i in range(2048):
    n_semantic_as_y = i  # the i-th semantic is used as the output
    clf1 = RVR(kernel='rbf')
    clf1.fit(X_train, full_semantics_matrix[y_train_index, n_semantic_as_y])
    predicted_out = clf1.predict(X_test)

    # calc error against the test output
    err = predicted_out - full_semantics_matrix[y_test_index, n_semantic_as_y]
    mean_err[i] = np.mean(err)
    print(i)



plt.figure()
plt.plot(mean_err)
plt.show()