def build(self):
    train, y, test, _ = data.get()
    #
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    # derive squished copies of the source columns, then drop the raw ones
    for c in ['ap_hi', 'ap_lo', 'height', 'weight']:
        squish(df, c, c + 'S', 10)
    for c in ['gluc', 'cholesterol']:
        squish(df, c, c + 'S', 1)
    df = df.drop(to_drop, axis=1)
    train = df[:ntrain]
    test = df[ntrain:].copy()
    #
    train = pd.concat([train, y], axis=1)
    global_avg = y.mean()
    # map each test column to per-value 'cardio' averages computed on the full train set
    for c in test.columns:
        d = wa_dict(train, c, 'cardio', global_avg, 10)
        test[c] = test[c].map(d).astype('float32')
    test.fillna(global_avg, inplace=True)
    # out-of-fold encoding for train (.iloc/.loc replace the deprecated .ix)
    kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
    for itrain, iset in kf.split(train, y):
        for c in test.columns:
            d = wa_dict(train.iloc[itrain], c, 'cardio', global_avg, 10)
            idx = train.index[iset]
            train.loc[idx, c] = train.loc[idx, c].map(d)
    train.drop('cardio', axis=1, inplace=True)
    train.fillna(global_avg, inplace=True)
    return train.astype('float32'), test.astype('float32'), None
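squish and wa_dict are helpers defined elsewhere in the project. A minimal sketch of what they might look like, judging only from how they are called above: squish would derive a coarser copy of a column, and wa_dict would build a per-value "weighted average" of the target, shrunk towards the global mean with weight k. The bodies below are assumptions, not the original implementations:

def squish(df, src, dst, step):
    # hypothetical: bin a numeric column by integer division (step=1 keeps the value as-is)
    df[dst] = (df[src] // step).astype('int32')

def wa_dict(frame, col, target, global_avg, k):
    # hypothetical: per-value mean of `target`, shrunk towards the global average with weight k
    g = frame.groupby(col)[target].agg(['mean', 'count'])
    smoothed = (g['mean'] * g['count'] + global_avg * k) / (g['count'] + k)
    return smoothed.to_dict()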
def build(self):
    train, _, test, _ = data.get()
    #
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    h = df.height / 100
    df['bwi'] = df['weight'] / (h * h)        # body mass index
    df['ap_p'] = df['ap_hi'] - df['ap_lo']    # pulse pressure
    df['ap_m4'] = (df['ap_hi'] + 3 * df['ap_lo']) / 4
    df['ap_m3'] = (df['ap_hi'] + 2 * df['ap_lo']) / 3
    df['ap_m2'] = (df['ap_hi'] + df['ap_lo']) / 2
    # expected blood pressure from age (in days) and weight, by gender (.loc replaces the deprecated .ix)
    male = df['gender'] == 2
    df.loc[male, 'ap_hi_e'] = 109 + 0.5 * df.loc[male, 'age'] / 365.25 + 0.1 * df.loc[male, 'weight']
    df.loc[male, 'ap_lo_e'] = 74 + 0.1 * df.loc[male, 'age'] / 365.25 + 0.15 * df.loc[male, 'weight']
    df.loc[~male, 'ap_hi_e'] = 102 + 0.7 * df.loc[~male, 'age'] / 365.25 + 0.15 * df.loc[~male, 'weight']
    df.loc[~male, 'ap_lo_e'] = 78 + 0.17 * df.loc[~male, 'age'] / 365.25 + 0.1 * df.loc[~male, 'weight']
    df = df.drop(to_drop, axis=1)
    ftrain = df[:ntrain]
    ftest = df[ntrain:]
    return ftrain, ftest, None
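To make the expected-pressure formulas concrete, here is the arithmetic for a hypothetical male subject aged 50 years (age is stored in days, hence the division by 365.25) who weighs 80 kg; the numbers are invented purely for illustration:

# hypothetical subject: male, 50 years old, 80 kg
age_days = 50 * 365.25
ap_hi_e = 109 + 0.5 * age_days / 365.25 + 0.1 * 80    # 109 + 25 + 8  = 142.0
ap_lo_e = 74 + 0.1 * age_days / 365.25 + 0.15 * 80    # 74  + 5  + 12 = 91.0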
def build(self):
    train, _, test, _ = data.get()
    #
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    h = df.height / 100
    df['bwi'] = df['weight'] / (h * h)
    df['ap_p'] = df['ap_hi'] - df['ap_lo']
    df['ap_m4'] = (df['ap_hi'] + 3 * df['ap_lo']) / 4
    df['ap_m3'] = (df['ap_hi'] + 2 * df['ap_lo']) / 3
    df['ap_m2'] = (df['ap_hi'] + df['ap_lo']) / 2
    male = df['gender'] == 2
    # expected blood pressure from age (in days) and weight, by gender (.loc replaces the deprecated .ix)
    df.loc[male, 'ap_hi_e'] = 109 + 0.5 * df.loc[male, 'age'] / 365.25 + 0.1 * df.loc[male, 'weight']
    df.loc[male, 'ap_lo_e'] = 74 + 0.1 * df.loc[male, 'age'] / 365.25 + 0.15 * df.loc[male, 'weight']
    df.loc[~male, 'ap_hi_e'] = 102 + 0.7 * df.loc[~male, 'age'] / 365.25 + 0.15 * df.loc[~male, 'weight']
    df.loc[~male, 'ap_lo_e'] = 78 + 0.17 * df.loc[~male, 'age'] / 365.25 + 0.1 * df.loc[~male, 'weight']
    # the same formulas solved for weight, given the observed pressures
    df.loc[male, 'weight_ah'] = (df.loc[male, 'ap_hi'] - 109 - 0.5 * df.loc[male, 'age'] / 365.25) / 0.1
    df.loc[male, 'weight_al'] = (df.loc[male, 'ap_lo'] - 74 - 0.1 * df.loc[male, 'age'] / 365.25) / 0.15
    df.loc[~male, 'weight_ah'] = (df.loc[~male, 'ap_hi'] - 102 - 0.7 * df.loc[~male, 'age'] / 365.25) / 0.15
    df.loc[~male, 'weight_al'] = (df.loc[~male, 'ap_lo'] - 78 - 0.17 * df.loc[~male, 'age'] / 365.25) / 0.1
    # residuals between observed and expected values
    df['ap_hi_ed'] = df['ap_hi'] - df['ap_hi_e']
    df['ap_lo_ed'] = df['ap_lo'] - df['ap_lo_e']
    df['dw_ah'] = df['weight'] - df['weight_ah']
    df['dw_al'] = df['weight'] - df['weight_al']
    df = df.drop(to_drop, axis=1)
    ftrain = df[:ntrain]
    ftest = df[ntrain:]
    return ftrain, ftest, None
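Continuing that hypothetical example: if the same subject's measured ap_hi is 140, the weight implied by the pressure formula and the gap to his actual weight come out as follows:

# same hypothetical subject, observed ap_hi = 140
weight_ah = (140 - 109 - 0.5 * 50) / 0.1    # (140 - 134) / 0.1 = 60.0 kg implied by the pressure
dw_ah = 80 - weight_ah                      # 20.0 kg heavier than the formula predicts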
def predict():
    # reuse a cached prediction if one has already been saved
    saved = state.load('model')
    #saved = None
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()      # z: test ids with placeholder predictions
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()      # v: train ids with the true target
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)   # run() presumably fills v and z in place
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
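state.load and state.save are not part of this snippet. A minimal sketch of the pickle-based cache they could correspond to, assuming state is a small module writing into a state/ directory; only the call signatures are taken from the code above, the rest is an assumption:

import os
import pickle

def load(name, path='state'):
    # hypothetical: return the cached object, or None if nothing has been saved yet
    fname = os.path.join(path, name + '.pkl')
    if not os.path.exists(fname):
        return None
    with open(fname, 'rb') as f:
        return pickle.load(f)

def save(name, obj, path='state'):
    # hypothetical: pickle the object so a later predict() call can skip retraining
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)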
def build(self):
    train, _, test, _ = data.get()
    cset = []
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    for sc in ['height', 'weight', 'ap_hi', 'ap_lo']:
        tc = df[sc].apply(str)
        maxc = tc.apply(len).max()
        for n in range(maxc):
            df['ft_l_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[n]) if n < len(s) else -1)
            df['ft_r_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[-n]) if n < len(s) else -1)
            cset.append('ft_l_' + sc + '_' + str(n))
            cset.append('ft_r_' + sc + '_' + str(n))
    df = df.drop(to_drop, axis=1)
    self.train_ = df[:ntrain]
    self.test_ = df[ntrain:]
    return self.train_, self.test_, None
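To see what these columns hold, take the value 120 for ap_hi: read from the left it yields the character codes of '1', '2', '0', and read from the right the codes of '0' and '2'; each feature is simply Python's ord applied to one character, with -1 used when the string is shorter than the longest one in the column:

s = '120'
print([ord(ch) for ch in s])    # [49, 50, 48] -> ft_l_ap_hi_0..2
print(ord(s[-1]), ord(s[-2]))   # 48 50        -> ft_r_ap_hi_1, ft_r_ap_hi_2
print(ord(s[-0]))               # 49: s[-0] is s[0], so ft_r_..._0 repeats ft_l_..._0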
def build(self):
    train, y, test, _ = data.get()
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    # cluster labels at several granularities, one-hot encoded afterwards
    dcn = []
    for n in [2, 5, 10, 15, 25]:
        cname = 'kmeans_' + str(n)
        dcn.append(cname)
        df[cname] = cluster.KMeans(n_clusters=n).fit_predict(df)
    df = pd.get_dummies(df, columns=dcn)
    df = df.drop(to_drop, axis=1)
    train = df[:ntrain]
    test = df[ntrain:].copy()
    return train.astype('int32'), test.astype('int32'), None
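For reference, the same cluster-then-one-hot pattern on a toy frame; the data and the choice of n_clusters=2 are made up purely for illustration:

import numpy as np
import pandas as pd
from sklearn import cluster

toy = pd.DataFrame({'a': np.r_[np.zeros(5), np.ones(5)],
                    'b': np.r_[np.zeros(5), np.ones(5)]})
toy['kmeans_2'] = cluster.KMeans(n_clusters=2, random_state=0).fit_predict(toy)
print(pd.get_dummies(toy, columns=['kmeans_2']))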
def build(self):
    train, _, test, _ = data.get()
    cset = []
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    for sc in ['height', 'weight', 'ap_hi', 'ap_lo']:
        tc = df[sc].apply(str)
        maxc = tc.apply(len).max()
        for n in range(maxc):
            df['ft_l_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[n]) if n < len(s) else -1)
            df['ft_r_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[-n]) if n < len(s) else -1)
            cset.append('ft_l_' + sc + '_' + str(n))
            cset.append('ft_r_' + sc + '_' + str(n))
    # same character features as above, but one-hot encoded before the raw columns are dropped
    df = pd.get_dummies(df, columns=cset).drop(to_drop, axis=1)
    self.train_ = df[:ntrain]
    self.test_ = df[ntrain:]
    return self.train_, self.test_, None
def build(self):
    train, y, test, _ = data_src.get()
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.005,
        subsample=0.7,
        gamma=5,
        alpha=0.01,
        #colsample_bytree = 0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1)
    # rows where the test 'smoke' value is strictly between 0 and 1 and has to be restored to a 0/1 label
    idx = (test.smoke > 0).values * (test.smoke < 1).values
    print('values to restore:', np.sum(idx))
    xtrain = pd.concat([train, test[~idx]])
    ytrain = xtrain['smoke']
    xtrain.drop('smoke', axis=1, inplace=True)
    print(xtrain.shape, ytrain.shape, test[idx].shape)
    dtrain = xgb.DMatrix(xtrain.values, ytrain.values)
    dpred = xgb.DMatrix(test[idx].drop('smoke', axis=1).values)
    cv = xgb.cv(params=xgb_params, dtrain=dtrain, num_boost_round=10000,
                early_stopping_rounds=50, nfold=10, seed=1,
                metrics='error', stratified=True)
    print('smoke num_boost_rounds =', len(cv))
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    test.loc[idx, 'smoke'] = bst.predict(dpred)   # .loc replaces the deprecated .ix
    test['smoke'] = (test['smoke'] > 0.5) * 1
    return train, y, test, None
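Multiplying the two boolean arrays acts as an element-wise AND, so idx picks exactly the rows whose smoke value lies strictly between 0 and 1, i.e. the fractional values that need to be re-predicted. A small self-contained check of that behaviour:

import numpy as np
import pandas as pd

smoke = pd.Series([0.0, 1.0, 0.3, 0.7])
idx = (smoke > 0).values * (smoke < 1).values
print(idx)   # [False False  True  True]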