__author__ = 'p_cohen'
############## Import packages ########################
# Workaround for a PyCharm bug
from __builtin__ import list, range, len, str, set, any, int
import pandas as pd
import numpy as np
import math
import sys
sys.path.append('C:/Users/p_cohen/Desktop/xgboost try 2/')
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, RidgeCV
from sklearn.tree import DecisionTreeRegressor
############### Define Globals ########################
DATA_PATH = 'C:/Git_repos/catepiller/Original/'
CLN_PATH = 'C:/Git_repos/catepiller/Clean/'
# Submission output path (assumed location; the export step below uses it)
SUBM_PATH = 'C:/Git_repos/catepiller/Submissions/'
############### Define Functions ########################
def create_val_and_train(df, seed, ids, split_rt=.20):
    """
    Creates two samples (generally used to create
    train and validation samples)
    Parameters
    ----------------------------------------------------
    df: dataframe to split
    seed: seed for the random split
    ids: this is the level of randomization, so if you want to
         randomize countries, rather than cities, you would
         set this to 'countries'
    split_rt: pct of data to assign as validation
    Output
    ----------------------------------------------------
    trn_for_mods (1-split_rt of df), trn_for_val (split_rt of df)
    """
np.random.seed(seed)
# Get vector of de-dupped values of ids
id_dat = pd.DataFrame(df[ids].drop_duplicates())
# Create random vector to split train val on
vect_len = len(id_dat.ix[:, 0])
id_dat['rand_vals'] = (np.array(np.random.rand(vect_len, 1)))
df = pd.merge(df, id_dat, on=ids)
# split data into two dfs
trn_for_mods = df[df.rand_vals > split_rt]
trn_for_val = df[df.rand_vals <= split_rt]
# drop rand_vals
trn_for_val = trn_for_val.drop('rand_vals', axis=1)
trn_for_mods = trn_for_mods.drop('rand_vals', axis=1)
return trn_for_mods, trn_for_val
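# A minimal usage sketch (illustrative; names are placeholders):
#   trn, val = create_val_and_train(my_df, seed=42,
#                                   ids='tube_assembly_id', split_rt=.2)
# Splitting at the tube_assembly_id level keeps every row of a given
# assembly in the same sample, which prevents leakage across quantities.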
def rmsle(actual, predicted):
"""
original author = 'benhamner'
Computes the root mean squared log error.
This function computes the root mean squared log error between two lists
of numbers.
Parameters
----------
actual : list of numbers, numpy array
The ground truth value
predicted : same type as actual
The predicted value
Returns
-------
score : double
The root mean squared log error between actual and predicted
"""
sle_val = (np.power(np.log(np.array(actual)+1) -
np.log(np.array(predicted)+1), 2))
msle_val = np.mean(sle_val)
return np.sqrt(msle_val)
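# Quick illustrative checks: a perfect prediction scores zero, and the
# log transform makes the penalty (approximately) scale-free:
#   rmsle([10, 100], [10, 100])  # -> 0.0
#   rmsle([100], [200])   # ~0.69, close to rmsle([1000], [2000]) ~0.69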
def write_preds(df, mod, name, scale, features):
    """
    Writes predictions from a model into a dataframe and rescales them
    by raising each prediction to the power `scale`
    Parameters
    -----------
    df: data to build predictions into and from
    mod: a predictive model
    name: name of predictions
    scale: (float) power to raise predictions to (pred**scale)
    features: features used in model
    Output
    ----------
    dataframe with predictions labeled as 'preds'+name
    """
nm = 'preds'+str(name)
df[nm] = mod.predict(df[features])
df[nm] = np.power(df[nm], scale)
return df
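# Illustrative use: for a model fit on cost**(1/16), scale=16 undoes the
# transform, e.g. write_preds(val, frst, 'rf', 16, feats) adds a
# 'predsrf' column of rescaled predictions (the 'rf' name is arbitrary).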
def write_xgb_preds(df, xgb_data, mod, pred_nm, scale, is_test=0):
    """
    This writes predictions from an XGBOOST model into the data
    Parameters
    --------------
    df: pandas dataframe to predict into
    xgb_data: XGB DMatrix (built from same data as df,
              with features used by mod)
    mod: XGB model used for predictions
    pred_nm: prediction naming convention
    scale: (float) this is the power to raise predictions to (pred**scale)
    is_test: if 1, also accumulate an average prediction across CV folds
             into df['cost'] (relies on the global num_loops)
    Output
    --------------
    data frame with predictions
    """
    # Create name for predictions column
    nm = 'preds'+str(pred_nm)
    # Predict and rescale (raises predictions to the power `scale`)
    df[nm] = mod.predict(xgb_data)
    # df[nm] = df[nm].apply(lambda x: math.exp(x)-1)
    df[nm] = np.power(df[nm], scale)
    # Create an average prediction across folds for actual submission
    if is_test == 1:
        df['cost'] += df[nm]/num_loops
    return df
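# Design note: accumulating df[nm]/num_loops into df['cost'] bags the
# per-fold models, so the exported submission is an average over folds
# rather than a single model's predictions.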
def gen_weights(df):
    """
    This creates weights based on the number of rows per tube assembly
    :param df: data frame to add weights to
    :return: dataframe with weights
    """
    df['one'] = 1
    # Count rows per tube assembly and use the count as the weight
    grouped = df.groupby('tube_assembly_id')
    counts = grouped.one.sum().reset_index()
    counts = counts.rename(columns={'one': 'ob_weight'})
    df = df.merge(counts, on='tube_assembly_id')
    df = df.drop('one', axis=1)
    return df
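# Illustrative example: if a tube assembly appears on 3 rows (say, three
# quoted quantities), each of those rows gets ob_weight = 3, so
# assemblies with more quotes carry proportionally more training weight.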
def outcome_transfactor(num):
    """
    This creates slightly different functional forms for the outcome
    :param num: integer used to control the functional form (think of it
                as a random number)
    :return: transformation factors (powers for rescaling predictions
             and for transforming the outcome, respectively)
    """
    power_up, power_down = 16, 1.0/16
    if num % 2 == 0:
        power_up, power_down = 17, 1.0/17
    if num % 3 == 0:
        power_up, power_down = 15, 1.0/15
    return power_up, power_down
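# Worked example: outcome_transfactor(50) -> (17, 1.0/17) since 50 is
# even; outcome_transfactor(51) -> (15, 1.0/15) since 51 is divisible
# by 3 (the % 3 test runs last, so multiples of 6 also map to 15);
# outcome_transfactor(53) -> (16, 1.0/16).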
def create_firststage_preds(train, valid, testing):
    """
    This handles the first stage of a true stacking procedure, using
    random forests (and ridge regressions) to create first stage
    predictions in train, validation, and test. It splits train into two
    halves, fits a model on each half, and predicts from each half into
    the other. A model is then fit on the full train set and used to
    predict into both validation and test. Relies on the module-level
    feature list `feats`.
    """
np.random.seed(42)
# Get vector of de-dupped values of ids
id_dat = pd.DataFrame(train['tube_assembly_id'].drop_duplicates())
# Create random vector to split train val on
vect_len = len(id_dat.ix[:, 0])
id_dat['rand_vals'] = (np.array(np.random.rand(vect_len, 1)))
df = pd.merge(train, id_dat, on='tube_assembly_id')
# Create model for both halves of df
frst1 = RandomForestRegressor(n_estimators=300, n_jobs=7)
is_first_half = df.rand_vals > .5
is_scnd_half = df.rand_vals < .5
frst1.fit(df.ix[is_first_half, feats], df.ix[is_first_half, 'target'])
frst2 = RandomForestRegressor(n_estimators=300, n_jobs=7)
frst2.fit(df.ix[is_scnd_half, feats], df.ix[is_scnd_half, 'target'])
    # Predict frst1 onto frst2's half and vice versa
train['forest'] = 0
train['forest'][is_scnd_half] = frst1.predict(df.ix[is_scnd_half, feats])
train['forest'][is_first_half] = frst2.predict(df.ix[is_first_half, feats])
# Create forest in full data for validation and test
frst = RandomForestRegressor(n_estimators=300, n_jobs=7)
frst.fit(df[feats], df.target)
valid['forest'] = frst.predict(valid[feats])
testing['forest'] = frst.predict(testing[feats])
# Create model for both halves of df
rdg1 = RidgeCV(alphas=[.5, .75, 1, 1.25])
rdg2 = RidgeCV(alphas=[.5, .75, 1, 1.25])
rdg1.fit(df.ix[is_first_half, feats], df.ix[is_first_half, 'target'])
rdg2.fit(df.ix[is_scnd_half, feats], df.ix[is_scnd_half, 'target'])
    # Predict rdg1 onto rdg2's half and vice versa
train['ridge'] = 0
train['ridge'][is_scnd_half] = rdg1.predict(df.ix[is_scnd_half, feats])
train['ridge'][is_first_half] = rdg2.predict(df.ix[is_first_half, feats])
    # Fit ridge on the full train data for validation and test
rdg = RidgeCV(alphas=[.5, .75, 1, 1.25])
rdg.fit(df[feats], df.target)
valid['ridge'] = rdg.predict(valid[feats])
testing['ridge'] = rdg.predict(testing[feats])
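# Note on the stacking setup: the half-vs-half scheme keeps the
# first-stage 'forest' and 'ridge' columns out-of-sample within train,
# so a second-stage model can use them without leaking the target; the
# validation and test columns come from models fit on all of train.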
############### Load data ######################
all_data = pd.read_csv(CLN_PATH + "full_data.csv")
non_test = all_data[all_data.is_test == 0]
test = all_data[all_data.is_test != 0]
# Create list of features
feats = list(all_data.columns.values)
non_feats = ['id', 'is_test', 'tube_assembly_id', 'cost']
for var in non_feats:
feats.remove(var)
# ########### Run unrebalanced xgb ################
# Set parameters
avg_score = 0
num_loops = 8
start_num = 50
test['cost'] = 0
loop = 1
current_sum = 0.0
param = {'max_depth': 8, 'eta': .0264, 'silent': 1, 'subsample': .75,
'colsample_bytree': .75, 'gamma': .00025}
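# (eta is the learning rate; subsample and colsample_bytree sample rows
# and columns for each tree; gamma is the minimum loss reduction
# required to make a further split)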
# Run models (looping through different train/val splits)
for cv_fold in range(start_num, start_num+num_loops):
# Create trn val samples
trn, val = create_val_and_train(non_test, cv_fold, 'tube_assembly_id', .2)
power_up, power_down = outcome_transfactor(cv_fold)
trn['target'] = np.power(trn['cost'], power_down)
trn = gen_weights(trn)
# Gradient boosting
xgb_trn = xgb.DMatrix(np.array(trn[feats]), label=np.array(trn['target']),
weight=np.array(trn.ob_weight))
xgb_val = xgb.DMatrix(np.array(val[feats]))
xgb_test = xgb.DMatrix(np.array(test[feats]))
xboost = xgb.train(param.items(), xgb_trn, 2500)
# Predict and rescale predictions
cv_str = str(cv_fold)
val = write_xgb_preds(val, xgb_val, xboost, cv_str, power_up, is_test=0)
test = write_xgb_preds(test, xgb_test, xboost, cv_str, power_up, is_test=1)
# Save score
score = rmsle(val['cost'], val['preds'+cv_str])
avg_score += score/num_loops
current_sum += score
print "Loop %s score is : %s" % (loop, score)
print "Current average score is %s" % (current_sum/loop)
loop += 1
# Export test preds
test['id'] = test['id'].astype(int)
test[['id', 'cost']].to_csv(SUBM_PATH+'2500 trees with 15 folds and new vars part 2.csv', index=False)
# ########### Browse feature importances ################
# Code for browsing feature importances
cv_fold = 12
# Create trn val samples
trn, val = create_val_and_train(non_test, cv_fold, 'tube_assembly_id', .2)
# Recode target variable to cost^(1/16)
trn['target'] = np.power(trn['cost'], .0625)
val['target'] = np.power(val['cost'], .0625)
# Random forest
frst = RandomForestRegressor(n_estimators=300, n_jobs=4)
frst.fit(trn[feats], trn['target'])
outputs = pd.DataFrame({'feats': feats,
'weight': frst.feature_importances_})
outputs = outputs.sort(columns='weight', ascending=False)
val = write_preds(val, frst, cv_fold, 16, feats)
# Score this fold
score = rmsle(val['cost'], val['preds'+str(cv_fold)])
print "Score for fold %s is: %s" % (cv_fold, score)
print outputs
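# Reading the output: 'weight' is sklearn's impurity-based feature
# importance, normalized to sum to 1 across feats; higher values mean
# the forest relied on that feature more when splitting.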