# NOTE(review): this is an entire training script collapsed onto one physical
# line, and it is truncated at BOTH ends: the opening quotes of the module
# docstring are missing, and the line cuts off inside the `params` dict
# ('count:poisson' objective), so it cannot run as-is.  What the visible part
# does: read train/hold/test CSVs, label-encode all three via
# preprocessing.to_labels, take `y = train["Hazard"]`, and build X / X_hold /
# X_test by dropping the id, the target, and four low-value T*_V* columns.
# NOTE(review): `sklearn.cross_validation` is the pre-0.18 module name and was
# removed in scikit-learn 0.20 (replaced by sklearn.model_selection) — confirm
# the pinned sklearn version before modernizing this import.
# NOTE(review): `# params = {` appears commented out while the dict entries
# that follow are not — presumably a flattening artifact; verify against the
# original multi-line file.
Here I will try to use xgb. """ import pandas as pd import xgboost as xgb from sklearn.cross_validation import ShuffleSplit import numpy as np from gini_normalized import normalized_gini from preprocessing.to_labels import to_labels import math train = pd.read_csv("../data/train_new.csv") hold = pd.read_csv("../data/hold_new.csv") test = pd.read_csv("../data/test.csv") par = (train, hold, test) train, hold, test = to_labels(par) y = train["Hazard"] # X = train.drop(['Hazard', 'Id'], 1) # X = train.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10', 'tp_59', 'tp_84', 'global_mean', 'global_median', 'global_std'], 1) # X_test = test.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10', 'tp_59', 'tp_84', 'global_mean', 'global_median', 'global_std'], 1) X = train.drop(["Hazard", "Id", "T2_V10", "T2_V7", "T1_V13", "T1_V10"], 1) X_hold = hold.drop(["Hazard", "Id", "T2_V10", "T2_V7", "T1_V13", "T1_V10"], 1) X_test = test.drop(["Id", "T2_V10", "T2_V7", "T1_V13", "T1_V10"], 1) # params = { # 'objective': 'reg:linear', "objective": "count:poisson",
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from preprocessing.to_labels import to_labels
from gini_normalized import normalized_gini

# Random-forest variant of the Hazard model: read the pre-split train/hold
# frames plus the raw test frame, label-encode them, and build the feature
# matrices by dropping the id, the target, and four low-value T*_V* columns.

train = pd.read_csv('../data/train_new.csv')
hold = pd.read_csv('../data/hold_new.csv')
test = pd.read_csv('../data/test.csv')

# BUG FIX: `test` was loaded but never label-encoded (only (train, hold) was
# passed to to_labels) and X_test was built from `hold` instead of `test`,
# which also left the 'Hazard' column inside the submission features.  The
# sibling xgb scripts encode all three frames together — do the same here.
train, hold, test = to_labels((train, hold, test))

y = train['Hazard']

X = train.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], 1)
X_hold = hold.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], 1)
# BUG FIX: drop from `test` (which has no 'Hazard'), not from `hold`.
X_test = test.drop(['Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], 1)

random_state = 42

# ind == 1 selects the shuffle-split CV path (truncated here: the loop that
# consumes `rs` continues past this chunk).
ind = 1
if ind == 1:
    rs = ShuffleSplit(len(y), n_iter=10, test_size=0.5, random_state=random_state)
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from preprocessing.to_labels import to_labels
from gini_normalized import normalized_gini

# Random-forest variant of the Hazard model (double-quoted twin of the
# previous script): read train/hold/test, label-encode, and build feature
# matrices by dropping id, target, and four low-value T*_V* columns.

train = pd.read_csv("../data/train_new.csv")
hold = pd.read_csv("../data/hold_new.csv")
test = pd.read_csv("../data/test.csv")

# BUG FIX: `test` was loaded but never label-encoded (only (train, hold) was
# passed to to_labels) and X_test was built from `hold` instead of `test`,
# which also left the "Hazard" column inside the submission features.  The
# sibling xgb scripts encode all three frames together — do the same here.
train, hold, test = to_labels((train, hold, test))

y = train["Hazard"]

X = train.drop(["Hazard", "Id", "T2_V10", "T2_V7", "T1_V13", "T1_V10"], 1)
X_hold = hold.drop(["Hazard", "Id", "T2_V10", "T2_V7", "T1_V13", "T1_V10"], 1)
# BUG FIX: drop from `test` (which has no "Hazard"), not from `hold`.
X_test = test.drop(["Id", "T2_V10", "T2_V7", "T1_V13", "T1_V10"], 1)

random_state = 42

# ind == 1 selects the shuffle-split CV path (truncated here: the loop that
# fills `result` continues past this chunk).
ind = 1
if ind == 1:
    rs = ShuffleSplit(len(y), n_iter=10, test_size=0.5, random_state=random_state)
    result = []
# NOTE(review): this is an entire training script collapsed onto one physical
# line, truncated at BOTH ends: the opening quotes of the module docstring are
# missing, and the line cuts off inside the `params` dict (the uncommented
# 'count:poisson' objective), so it cannot run as-is.  What the visible part
# does: read train/hold/test CSVs, label-encode all three via
# preprocessing.to_labels, set the target to log(1 + Hazard), and build
# X / X_hold / X_test by dropping the id, the target, and four T*_V* columns.
# NOTE(review): `train['Hazard'].apply(lambda x: math.log(1 + x), 1)` passes
# `1` as Series.apply's second positional argument — presumably a leftover
# from DataFrame-style `axis=1`; confirm it is ignored by the pinned pandas
# version before cleaning up.
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20
# (use sklearn.model_selection) — confirm the pinned sklearn version.
Here I will try to use xgb, trying to predict log(1+x) ''' import pandas as pd import xgboost as xgb from sklearn.cross_validation import ShuffleSplit import numpy as np from gini_normalized import normalized_gini from preprocessing.to_labels import to_labels import math train = pd.read_csv('../data/train_new.csv') hold = pd.read_csv('../data/hold_new.csv') test = pd.read_csv('../data/test.csv') par = (train, hold, test) train, hold, test = to_labels(par) y = train['Hazard'].apply(lambda x: math.log(1 + x), 1) # X = train.drop(['Hazard', 'Id'], 1) # X = train.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10', 'tp_59', 'tp_84', 'global_mean', 'global_median', 'global_std'], 1) # X_test = test.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10', 'tp_59', 'tp_84', 'global_mean', 'global_median', 'global_std'], 1) X = train.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], 1) X_hold = hold.drop(['Hazard', 'Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], 1) X_test = test.drop(['Id', 'T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], 1) params = { # 'objective': 'reg:linear', 'objective': 'count:poisson', # 'eta': 0.005, # 'min_child_weight': 6,