def FI_xgb_sklearn():
    X, y = load_traindata(encodetype='le')
    cols = list(X.columns)

    # append a random "probe" column; any feature ranking below it
    # carries no real signal for this model
    rndcol = np.random.randn(X.shape[0])
    X = np.column_stack((X, rndcol))
    cols.append('random')

    # NOTE: 'reg:linear', 'seed' and 'silent' are deprecated aliases in
    # newer xgboost ('reg:squarederror', 'random_state', 'verbosity')
    xgb1 = XGBRegressor(learning_rate=0.01,
                        n_estimators=3320,
                        max_depth=3,
                        min_child_weight=4,
                        colsample_bytree=0.8,
                        subsample=0.8,
                        importance_type='total_gain',
                        objective='reg:linear',
                        n_jobs=-1,
                        random_state=0,
                        seed=27,
                        silent=True)

    xgb1.fit(X, y)

    imp = sorted(list(zip(cols, xgb1.feature_importances_)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    # print everything down to and including the random probe
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
def FI_reg():
    """
    Add a random column to the feature data and fit the optimized lasso model.
    Any feature whose coefficient ranks below the random column's has no
    significance for prediction with this model.
    """
    X, y = load_traindata()
    cols = list(X.columns)

    lasso = Lasso(max_iter=10000, alpha=0.004498433)
    scaler = RobustScaler()
    X = scaler.fit_transform(X)

    # add random probe column
    rndcol = np.random.randn(X.shape[0])
    X = np.column_stack((X, rndcol))
    cols.append('random')
    lasso.fit(X, y)

    coefs = sorted(list(zip(cols, lasso.coef_)),
                   key=lambda t: abs(t[1]),
                   reverse=True)
    coefs = pd.DataFrame(coefs, columns=['Feature', 'Coef'])
    rnd_idx = np.argwhere(coefs['Feature'] == 'random')[0][0]
    print("random column coefficient is %.4f, ranked %d of %d" %
          (coefs.iloc[rnd_idx, 1], rnd_idx + 1, len(coefs)))
    return coefs
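# The same random-probe check can be reproduced with scikit-learn's
# built-in permutation importance (sklearn.inspection, available in
# sklearn >= 0.22). This is a minimal sketch, not part of the original
# pipeline: `lasso` must already be fitted on the scaled X.
from sklearn.inspection import permutation_importance

def FI_reg_probe_sketch(lasso, X, y):
    result = permutation_importance(lasso, X, y, n_repeats=10,
                                    random_state=0)
    # importances_mean is the average score drop when each column is permuted
    return result.importances_mean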
def initialise_game(model, budget, niter, feature_number, method):
    # load one train/test split per model; note that `niter` is accepted
    # but not used here
    train_x_all = []
    test_x_all = []
    dev_x_all = []
    for i in range(len(model)):
        train_x, train_y = helper.load_traindata(feature_number[i],
                                                 model[i],
                                                 seed=0)
        train_x_all.append(train_x)
        # targets are assumed identical across models; only the last
        # copy is kept
        train_y_all = train_y
        test_x, test_y = helper.load_testdata(feature_number[i],
                                              model[i],
                                              seed=0)
        test_x_all.append(test_x)
        test_y_all = test_y
        #dev_x, dev_y = helper.load_testdata(feature_number[i], model[i], seed=3)
        #dev_x_all.append(dev_x)
        #dev_y_all = dev_y
    # no separate dev split is loaded; the test split doubles as dev
    dev_x_all = test_x_all
    dev_y_all = test_y_all

    story = [train_x_all, train_y_all]
    dev = [dev_x_all, dev_y_all]
    test = [test_x_all, test_y_all]

    # load game environment (MODEL_VER, CUM and EXPNUM are module-level globals)
    game = Env(story, test, dev, budget, MODEL_VER, model, feature_number, CUM,
               EXPNUM, 0, method)
    return game
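# Hypothetical usage sketch; the argument values are illustrative only,
# and MODEL_VER, CUM and EXPNUM must already be defined at module level:
# game = initialise_game(model=['rf', 'xgb'], budget=100, niter=200,
#                        feature_number=[10, 20], method='random')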
Example #4
def FI_RF_permuation_blog():
    X, y = load_traindata(encodetype='le')

    rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=14,
                                        max_features=26, random_state=42,
                                        oob_score=True)
    rf.fit(X, y)

    drop_in_mse = permutation_importances(rf, X, y, oob_regression_mse_score)
    # flip the sign of the score drop so that larger values mean a bigger
    # MSE increase under permutation (i.e. a more important feature)
    imp = [-x for x in drop_in_mse]
    imp = sorted(list(zip(X.columns, imp)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    return imp
def FI_RF_permuation(rf):
    X, y = load_traindata(encodetype='le')
    rndcol = np.random.randn(X.shape[0])
    X['random'] = rndcol

    rf.fit(X, y)

    imp = permutation_importances(rf, X, y)
    imp = sorted(list(zip(X.columns, imp)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
Example #6
def FI_RF_permuation(metric=oob_regression_r2_score):
    X, y = load_traindata(encodetype='le')
    rndcol = np.random.randn(X.shape[0])
    X['random'] = rndcol

    rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=14,
                                        max_features=26, random_state=42,
                                        oob_score=True)
    rf.fit(X, y)

    # score each feature with the supplied metric
    imp = permutation_importances(rf, X, y, metric)
    #imp = permutation_importances(rf, X, y, oob_regression_mse_score)
    imp = sorted(list(zip(X.columns, imp)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
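# The permutation_importances helper is not shown in this listing. A
# plausible sketch (after Parr et al.'s rfpimp recipe) is given below: it
# permutes one column at a time and records the drop in the supplied
# metric. The real helper may instead recompute a true OOB score.
def permutation_importances_sketch(rf, X, y, metric):
    baseline = metric(rf, X, y)
    imp = []
    for col in X.columns:
        save = X[col].copy()
        X[col] = np.random.permutation(X[col])  # break the column's signal
        imp.append(baseline - metric(rf, X, y))
        X[col] = save                           # restore the column
    return np.array(imp)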
Example #7
def FI_RF_sklearn():
    X, y = load_traindata(encodetype='le')
    cols = list(X.columns)
#    scaler = RobustScaler()
#    X = scaler.fit_transform(X)
    rndcol = np.random.randn(X.shape[0])
    X = np.column_stack((X, rndcol))
    cols.append('random')

    rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=14,
                                        max_features=26, random_state=42)
    rf.fit(X, y)

    imp = sorted(list(zip(cols, rf.feature_importances_)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
Example #8
def FI_reg_blog():
    """
    Fit the optimized lasso model on robust-scaled features and return
    all coefficients ranked by absolute magnitude.
    """
    X, y = load_traindata()
    cols = list(X.columns)

    lasso = Lasso(max_iter=10000, alpha=0.000308884359647748)
    scaler = RobustScaler()
    X = scaler.fit_transform(X)

    lasso.fit(X, y)

    coefs = sorted(list(zip(cols, lasso.coef_)),
                   key=lambda t: abs(t[1]),
                   reverse=True)
    coefs = pd.DataFrame(coefs, columns=['Feature', 'Coef'])
    return coefs
Example #9
def FI_xgb_blog():
    X, y = load_traindata(encodetype='le')
    cols = list(X.columns)

    # NOTE: 'reg:linear', 'seed' and 'silent' are deprecated aliases in
    # newer xgboost ('reg:squarederror', 'random_state', 'verbosity')
    xgb1 = XGBRegressor(learning_rate=0.01, n_estimators=3320,
                        max_depth=3, min_child_weight=4,
                        colsample_bytree=0.8, subsample=0.8,
                        importance_type='total_gain', objective='reg:linear',
                        n_jobs=-1, random_state=0,
                        seed=27, silent=True)

    xgb1.fit(X, y)

    imp = sorted(list(zip(cols, xgb1.feature_importances_)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    return imp
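# For a fitted XGBRegressor, the underlying booster can also report the
# other importance flavours directly; 'weight', 'gain', 'cover',
# 'total_gain' and 'total_cover' are all valid importance_type values:
# score_by_gain = xgb1.get_booster().get_score(importance_type='gain')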
Example #10
    def reboot(self, model):
        # reset everything to its initial state; first log the finished
        # episode to the csv file
        helper.write_csv_game(self, model)
        # reshuffle the sample order deterministically
        random.seed(self.cvit)
        random.shuffle(self.order)
        self.queried_times = 0
        self.terminal = False
        if not self.accum:
            self.queried_set_x = []
            self.queried_set_y = []
        self.current_frame = 0
        self.count_action0 = 0
        self.count_action1 = 0
        self.rAll = []
        self.episode += 1
        self.train_x_all = []
        self.train_y_all = []
        self.test_x_all = []
        self.test_y_all = []
        self.rounds = 0
        for i in range(len(self.feature)):
            train_x, train_y = helper.load_traindata(self.feature[i],
                                                     model[i],
                                                     seed=self.episode)
            self.train_x_all.append(train_x)
            self.train_y_all = train_y
            test_x, test_y = helper.load_testdata(self.feature[i],
                                                  model[i],
                                                  seed=self.episode)
            self.test_x_all.append(test_x)
            self.test_y_all = test_y
        self.dev_x_all = self.test_x_all
        self.dev_y_all = self.test_y_all

        # reset the per-feature cross-count matrices
        shape = (len(self.feature) + 1, len(self.feature) + 2)
        self.cross_counts_correct_train = np.zeros(shape, dtype=int)
        self.cross_counts_incorrect_train = np.zeros(shape, dtype=int)
        self.cross_counts_correct_test = np.zeros(shape, dtype=int)
        self.cross_counts_incorrect_test = np.zeros(shape, dtype=int)
Example #11
def data_generation(model, feature_number):

    train_x_all = []
    test_x_all = []

    for i in range(len(model)):
        print("Loading data for feature {0}..".format(feature_number[i]))
        train_x, train_y = helper.load_traindata(feature_number[i], model[i],
                                                 seed=0)
        train_x_all.append(train_x)
        train_y_all = train_y
        test_x, test_y = helper.load_testdata(feature_number[i], model[i],
                                              seed=0)
        test_x_all.append(test_x)
        test_y_all = test_y

    train = [train_x_all, train_y_all]
    test = [test_x_all, test_y_all]

    return train, test
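# Illustrative call (the names and counts below are placeholders, one
# entry per base learner):
# train, test = data_generation(model=['rf', 'xgb'], feature_number=[10, 20])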
import os

proj_path = 'C:\\Users\\yanqi\\Documents\\NYCDSA\\Project 3 - Machine Learning\\Housing Price Prediction\\house_price_prediction\\code\\basecase'
os.chdir(proj_path)

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
import pickle
from helper import plot_cv_traintestscores, make_prediction_dummy, load_traindata
from sklearn import tree
import time

X, y = load_traindata()
cols = X.columns

# robust-scale the training features
scaler = RobustScaler()
X = scaler.fit_transform(X)

n_folds_i = 5
n_folds_o = 5
rs = 1
inner_cv = KFold(n_splits=n_folds_i, shuffle=True, random_state=rs)
outer_cv = KFold(n_splits=n_folds_o, shuffle=True, random_state=rs)


def simpleDT(X, y):
    # fit a plain decision tree as a baseline
    tree_model = tree.DecisionTreeRegressor()
    tree_model.fit(X, y)
    return tree_model
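# A minimal sketch of how inner_cv/outer_cv are typically combined for
# nested cross-validation. The grid below is an illustrative placeholder
# (the original snippet is truncated), not the author's actual grid.
from sklearn.model_selection import cross_val_score

def nestedCV_sketch(X, y):
    param_grid = {'max_depth': [4, 8, 12], 'min_samples_leaf': [1, 5, 10]}
    gs = GridSearchCV(tree.DecisionTreeRegressor(random_state=rs),
                      param_grid, cv=inner_cv,
                      scoring='neg_mean_squared_error')
    # each outer fold scores a model tuned only on its inner folds
    return cross_val_score(gs, X, y, cv=outer_cv,
                           scoring='neg_mean_squared_error')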
import os
proj_path = 'C:\\Users\\yanqi\\Documents\\NYCDSA\\Project 3 - Machine Learning\\Housing Price Prediction\\house_price_prediction\\code\\basecase'
os.chdir(proj_path)

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from helper import plot_cv_traintestscores, make_prediction_dummy, load_traindata
from sklearn import tree
import time

X, y = load_traindata(encodetype='le')
cols = X.columns

# robust-scale the training features
scaler = RobustScaler()
X = scaler.fit_transform(X)

n_folds_i = 5
n_folds_o = 5
rs = 1
inner_cv = KFold(n_splits=n_folds_i, shuffle=True, random_state=rs)
outer_cv = KFold(n_splits=n_folds_o, shuffle=True, random_state=rs)


def simpleDT(X, y):
    # fit a plain decision tree as a baseline
    tree_model = tree.DecisionTreeRegressor()
    tree_model.fit(X, y)
    return tree_model
# NOTE: the opening of this XGBRegressor call was cut off in the source;
# 'xgb1' is a placeholder name and any parameters that preceded
# min_child_weight are not recoverable.
xgb1 = XGBRegressor(min_child_weight=3,
                    missing=None,
                    n_estimators=1996,
                    n_jobs=-1,
                    nthread=None,
                    objective='reg:linear',
                    random_state=0,
                    reg_alpha=0,
                    reg_lambda=0.9,
                    scale_pos_weight=1,
                    seed=27,
                    silent=True,
                    subsample=0.55)

# load training data
X, y = load_traindata()
cols = X.columns

# label-encoded variant of the features (presumably for the tree-based models)
X_le, tmp = load_traindata(encodetype='le')
cols_le = X_le.columns

# scale each design matrix with its own scaler
scaler1 = RobustScaler()
X = scaler1.fit_transform(X)
scaler2 = RobustScaler()
X_le = scaler2.fit_transform(X_le)

# create 5-fold CV scheme to fit base models, and make out-of-fold predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
oof_pred = np.zeros((X.shape[0], 4))  # one column of out-of-fold predictions per base model

for train_idx, test_idx in cv.split(X):
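    # The loop body is truncated in the source. A minimal sketch of a
    # typical out-of-fold step follows; `base_models` is a hypothetical
    # list of four regressors matching the four columns of oof_pred, and
    # this ignores that some models may want X_le instead of X.
    y_arr = np.asarray(y)
    for j, bm in enumerate(base_models):
        bm.fit(X[train_idx], y_arr[train_idx])
        oof_pred[test_idx, j] = bm.predict(X[test_idx])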