Example #1
    def test_fit_and_predict_numpy(self):
        m = MERF(max_iterations=5)
        # Train
        m.fit(np.array(self.X_train), np.array(self.Z_train), self.clusters_train, self.y_train)
        # Predict Known Clusters
        yhat_known = m.predict(np.array(self.X_known), np.array(self.Z_known), self.clusters_known)
        self.assertEqual(len(yhat_known), 5)
        # Predict New Clusters
        yhat_new = m.predict(np.array(self.X_new), np.array(self.Z_new), self.clusters_new)
        self.assertEqual(len(yhat_new), 2)
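These tests reference fixtures (self.X_train, self.clusters_known, and so on) created in a setUp method that is not shown. Below is a minimal sketch consistent with the assertions above (5 "known" rows from seen clusters, 2 "new" rows from unseen clusters); all names, shapes, and values here are assumptions, not the package's actual test fixture.

import unittest
import numpy as np
import pandas as pd
from merf import MERF

class MERFTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixture: 10 training rows over 5 clusters.
        rng = np.random.RandomState(0)
        self.X_train = pd.DataFrame(rng.rand(10, 3))
        self.Z_train = pd.DataFrame(np.ones((10, 1)))
        self.clusters_train = pd.Series([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])
        self.y_train = pd.Series(rng.rand(10))
        self.X_known = pd.DataFrame(rng.rand(5, 3))
        self.Z_known = pd.DataFrame(np.ones((5, 1)))
        self.clusters_known = pd.Series([0, 1, 2, 3, 4])  # seen in training
        self.X_new = pd.DataFrame(rng.rand(2, 3))
        self.Z_new = pd.DataFrame(np.ones((2, 1)))
        self.clusters_new = pd.Series([8, 9])  # never seen in training
        self.y_new = pd.Series(rng.rand(2))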
Example #2
    def test_fit_and_predict_pandas(self):
        m = MERF(max_iterations=10)
        # Train
        m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
        self.assertEqual(len(m.gll_history), 10)
        # Predict Known Clusters
        yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
        self.assertEqual(len(yhat_known), 5)
        # Predict New Clusters
        yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
        self.assertEqual(len(yhat_new), 2)
Example #3
    def test_user_defined_fe_model(self):
        lgbm = LGBMRegressor()
        m = MERF(fixed_effects_model=lgbm, max_iterations=5)
        # Train
        m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
        self.assertEqual(len(m.gll_history), 5)
        # Predict Known Clusters
        yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
        self.assertEqual(len(yhat_known), 5)
        # Predict New Clusters
        yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
        self.assertEqual(len(yhat_new), 2)
Example #4
    def test_validation_numpy(self):
        m = MERF(max_iterations=3)
        # Train, passing a validation set as the trailing arguments
        m.fit(
            np.array(self.X_train),
            np.array(self.Z_train),
            self.clusters_train,
            self.y_train,
            np.array(self.X_new),
            np.array(self.Z_new),
            self.clusters_new,
            self.y_new,
        )
        self.assertEqual(len(m.val_loss_history), 3)
        # Predict Known Clusters
        yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
        self.assertEqual(len(yhat_known), 5)
        # Predict New Clusters
        yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
        self.assertEqual(len(yhat_new), 2)
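The gll_history and val_loss_history attributes asserted on above hold one value per EM iteration, so they can be plotted to check convergence. A minimal sketch, assuming m is a MERF model fitted with a validation set as in test_validation_numpy:

import matplotlib.pyplot as plt

# One GLL value and one validation-loss value per EM iteration.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(m.gll_history)
ax1.set_title("Generalized log-likelihood")
ax1.set_xlabel("EM iteration")
ax2.plot(m.val_loss_history)
ax2.set_title("Validation loss")
ax2.set_xlabel("EM iteration")
plt.tight_layout()
plt.show()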
Example #5
    def test_pickle(self):
        m = MERF(max_iterations=5)
        # Train
        m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)

        # Write to pickle file
        with open("model.pkl", "wb") as fout:
            pickle.dump(m, fout)

        # Read back from pickle file
        with open("model.pkl", "rb") as fin:
            m_pkl = pickle.load(fin)

        # Check that m is not the same object as m_pkl
        self.assertIsNot(m_pkl, m)
        # Predict Known Clusters
        yhat_known_pkl = m_pkl.predict(self.X_known, self.Z_known, self.clusters_known)
        yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
        assert_almost_equal(yhat_known_pkl, yhat_known)
        # Predict New Clusters
        yhat_new_pkl = m_pkl.predict(self.X_new, self.Z_new, self.clusters_new)
        yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
        assert_almost_equal(yhat_new_pkl, yhat_new)
Example #6
results.loc["Boosting_Ign","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))

# 4. Gradient tree-boosting including the grouping variable as a categorical variable ('Boosting_Cat')
X_train_cat = np.column_stack((group_train,X_train))
X_test_cat = np.column_stack((group_test,X_test))
data_train_cat = gpb.Dataset(X_train_cat, y_train, categorical_feature=[0])
cvbst = gpb.cv(params=params, train_set=data_train_cat,
               num_boost_round=1000, early_stopping_rounds=5,
               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
# Best number of iterations: 49
start_time = time.time() # measure time
bst = gpb.train(params=params, train_set=data_train_cat, num_boost_round=best_iter)
results.loc["Boosting_Cat","Time"] = time.time() - start_time
y_pred = bst.predict(data=X_test_cat)
results.loc["Boosting_Cat","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))

# 5. Mixed-effects random forest ('MERF')
from merf import MERF
rf_params={'max_depth': 6, 'n_estimators': 300}
merf_model = MERF(max_iterations=100, rf_params=rf_params)
print("Warning: the following takes a lot of time")
start_time = time.time() # measure time
merf_model.fit(pd.DataFrame(X_train), np.ones(shape=(ntrain,1)), pd.Series(group_train), y_train)
results.loc["MERF","Time"] = time.time() - start_time
y_pred = merf_model.predict(pd.DataFrame(X_test), np.ones(shape=(X_test.shape[0],1)), pd.Series(group_test))
results.loc["MERF","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))

print(results.apply(pd.to_numeric).round(3))
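The results DataFrame written to above is created earlier in the script. A plausible setup (an assumption, mirroring the row labels and columns used here) would be:

import pandas as pd

# Hypothetical construction of the comparison table used above.
results = pd.DataFrame(index=["Boosting_Ign", "Boosting_Cat", "MERF"],
                       columns=["RMSE", "Time"])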
Example #7
def mae(pred, true):
    return np.mean(np.abs(pred - true))


# Split data into train and test sets
indices = np.arange(len(y_samples))
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_samples, y_samples, indices, test_size=0.2, random_state=301)

from merf import MERF
merf = MERF()
clusters_train = [ids[e] for e in idx_train]
clusters_train = pd.Series(clusters_train)
clusters_test = [ids[e] for e in idx_test]
clusters_test = pd.Series(clusters_test)

Z_train = np.ones(shape=(X_train.shape[0], 1))
merf.fit(X_train, Z_train, clusters_train, y_train)
train_preds = merf.predict(X_train, Z_train, clusters_train)
train_preds = np.maximum(train_preds, 0)  # Don't predict negative cases
print('Train MAE:', mae(train_preds, y_train))

Z_test = np.ones(shape=(X_test.shape[0], 1))
test_preds = merf.predict(X_test, Z_test, clusters_test)
test_preds = np.maximum(test_preds, 0)  # Don't predict negative cases
print('Test MAE:', mae(test_preds, y_test))
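Here Z is a single column of ones, which fits a random intercept per cluster. MERF accepts any per-row design matrix for the random effects; below is a sketch of a random intercept plus a random slope on the first feature. The slope column is purely illustrative, not part of the original notebook, and assumes X_train and X_test are NumPy arrays.

# Random intercept plus a random slope on feature 0 (illustrative).
Z_train = np.column_stack([np.ones(X_train.shape[0]), X_train[:, 0]])
Z_test = np.column_stack([np.ones(X_test.shape[0]), X_test[:, 0]])
merf.fit(X_train, Z_train, clusters_train, y_train)
test_preds = merf.predict(X_test, Z_test, clusters_test)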
Example #8
    if eq_train_ratio:
        train = equalize_num_case_control(train, data['eq_cases_train_cols'])

    if visit_type == "all":
        if i > 0:
            val_aurocs2 = []
            for max_depth in [5, 10, 15]:
                model = MERF(n_estimators=100,
                             gll_early_stop_threshold=0.001,
                             max_iterations=2,
                             max_depth=max_depth)
                model.fit(train[cols],
                          pandas.DataFrame(np.ones((train.shape[0], 1))),
                          train.TRR_ID, train.diab_in_1_year)
                test_y_hat = model.predict(
                    test[cols], pandas.DataFrame(np.ones((test.shape[0], 1))),
                    test.TRR_ID)
                test_auroc = roc_auc_score(test.diab_in_1_year, test_y_hat)
                val_aurocs2.append(test_auroc)
            max_depth = [5, 10, 15][np.argmax(val_aurocs2)]
        else:
            max_depth = max_depths[np.argmax(val_aurocs)]

        model = MERF(n_estimators=100,
                     gll_early_stop_threshold=0.001,
                     max_iterations=2,
                     max_depth=max_depth)
        model.fit(train[cols], pandas.DataFrame(np.ones((train.shape[0], 1))),
                  train.TRR_ID, train.diab_in_1_year)
        train_y_hat = model.predict(
            train[cols], pandas.DataFrame(np.ones((train.shape[0], 1))),
Example #9
    def test_not_fitted_error(self):
        m = MERF()
        with self.assertRaises(NotFittedError):
            m.predict(self.X_known, self.Z_known, self.clusters_known)
Example #10
def merf(normalise = False):
    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['l1', 'rmse'],
        'learning_rate': 0.001,
        'feature_fraction': 0.8,
        "max_depth": 6,
        "max_bin": 512,
        "num_leaves": 40,
        "num_iterations": 100000,
        "n_estimators": 300,
        "verbose": -1
    }
    # Alternative settings (unused): 'bagging_fraction': 0.7, 'bagging_freq': 10, "num_leaves": 12

    gbm = lgb.LGBMRegressor(**hyper_params)

    ap2 = ap.fillna(method="pad")  # forward-fill missing values
    print(ap2.isna().sum().sum())  # remaining missing values after padding
    X_train, Y_train, X_test, Y_test = preprocessing(ap2, hour2int = True, onehotencode = False)
      
    Z_train = np.ones((len(X_train), 1))

    # The grouping variable ('hours') serves as the cluster, not as a predictor.
    clusters_train = X_train['hours']
    clusters_test = X_test['hours']
    X_train1 = X_train.drop(["hours"], axis=1)
    X_test1 = X_test.drop(["hours"], axis=1)

    # Normalising is commonly unnecessary for boosting, but for mixed-effects
    # models we may want to normalise. Only X (the predictors) is normalised;
    # the response Y should not be normalised.
    if normalise:
        X_train1 = (X_train1 - X_train1.mean()) / X_train1.std()
        X_test1 = (X_test1 - X_test1.mean()) / X_test1.std()
    Y_train1 = Y_train

    # Optional imputation (unused):
    # my_imputer = SimpleImputer()
    # X_train1 = my_imputer.fit_transform(X_train1)
    # X_test1 = my_imputer.transform(X_test1)

    # Check for missing values
    print(Y_train1.isnull().any().any(), X_train1.isnull().any().any(), X_test1.isnull().any().any())
    
    merf = MERF(gbm, max_iterations=4)
    merf.fit(X_train1, Z_train, clusters_train, Y_train1)

    Z_test = np.ones((len(X_test1), 1))
    y_pred = merf.predict(X_test1, Z_test, clusters_test)
    # If the response had been normalised, the prediction would need to be
    # rescaled here, e.g. y_pred = y_pred * Y_train.std() + Y_train.mean()

    mae = abs(y_pred - Y_test).mean()
    rmse = math.sqrt(((y_pred - Y_test) ** 2).mean())
    rrmse = rmse / Y_test.median()
    r2 = get_r2_numpy_corrcoef(Y_test, y_pred)
    return mae, rmse, rrmse, r2
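A call might look like this (assuming the global ap DataFrame and the preprocessing and get_r2_numpy_corrcoef helpers defined elsewhere in the notebook):

mae, rmse, rrmse, r2 = merf(normalise=True)
print("MAE %.3f  RMSE %.3f  rRMSE %.3f  R2 %.3f" % (mae, rmse, rrmse, r2))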
Example #11
Z_train = np.ones((len(X_train), 1))

clusters_train = X_train['hours']
clusters_test= X_test['hours']
my_imputer = SimpleImputer()
X_train = my_imputer.fit_transform(X_train)  # fit on train, impute missing values
X_test = my_imputer.transform(X_test)        # reuse train statistics to avoid leakage
merf.fit(X_train, Z_train, clusters_train, Y_train)


    

Z_test = np.ones((len(X_test), 1))
y_hat = merf.predict(X_test, Z_test, clusters_test)
print(y_hat)

# sklearn metrics expect (y_true, y_pred)
print(metrics.explained_variance_score(Y_test, y_hat))
print(metrics.r2_score(Y_test, y_hat))

from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
# Train the model on training data
rf.fit(X_train, Y_train)
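The snippet stops after fitting; a natural continuation that evaluates the random-forest baseline with the same metrics as above (a sketch) would be:

y_hat_rf = rf.predict(X_test)
print(metrics.explained_variance_score(Y_test, y_hat_rf))
print(metrics.r2_score(Y_test, y_hat_rf))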