def lgb_gs(set_params, dtrn_X, trn_y, dval_X, val_X, val_y):
    """Grid-search LightGBM hyper-parameters by validation RMSE.

    Parameters
    ----------
    set_params : dict
        Grid specification consumed by sklearn's ``ParameterGrid``.
    dtrn_X : lgb.Dataset
        Training data (labels carried inside the Dataset).
    trn_y : array-like
        Training labels (not used here; kept for interface compatibility).
    dval_X : lgb.Dataset
        Validation Dataset used for early stopping.
    val_X, val_y : array-like
        Raw validation features / labels used for scoring.

    Returns
    -------
    dict
        The parameter combination with the lowest validation RMSE
        (``None`` only if the grid is empty).
    """
    # float('inf') instead of the old arbitrary cap of 100: it guarantees the
    # first candidate always wins, so min_params can never be left unbound
    # (previously a NameError if every candidate scored >= 100).
    min_score = float('inf')
    min_params = None
    for params in tqdm(list(ParameterGrid(set_params))):
        logger.debug('params:\n {}'.format(params))
        model = lgb.train(params, dtrn_X, num_boost_round=1000,
                          valid_sets=[dval_X], early_stopping_rounds=100,
                          verbose_eval=50)
        # Score at the best early-stopped iteration, not the last one.
        pred = model.predict(val_X, num_iteration=model.best_iteration)
        sc_rmse = rmse(val_y, pred)
        if min_score > sc_rmse:
            min_score = sc_rmse
            min_params = params
        logger.debug('rmse: {}'.format(sc_rmse))
        logger.info('current min rmse: {}'.format(min_score))
    logger.info('')
    logger.info('Top min params:\n {}'.format(min_params))
    logger.info('Top min rmse: {}'.format(min_score))
    return min_params
def run_lgb(train_X, train_y):
    """Train a LightGBM regressor on a random 80/20 split and log metrics.

    Parameters
    ----------
    train_X : DataFrame-like
        Full training features (split internally into train/valid).
    train_y : array-like
        Full training labels.

    Returns
    -------
    lgb.Booster
        The trained model (early-stopped on the internal validation set).
    """
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        "num_leaves": 128,        # tried: [32, 48, 64, 128]
        "learning_rate": 0.07,    # tried: [0.05, 0.07, 0.1, 0.2]
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_fraction": 0.7,
        "bagging_seed": 2018,
        "verbosity": -1,
    }
    trn_X, val_X, trn_y, val_y = train_test_split(
        train_X, train_y, test_size=0.20, shuffle=True, random_state=0)
    lg_trn = lgb.Dataset(trn_X, label=trn_y)
    lg_val = lgb.Dataset(val_X, label=val_y)
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)

    model = lgb.train(lg_params, lg_trn, num_boost_round=5000,
                      valid_sets=[lg_val], early_stopping_rounds=100,
                      verbose_eval=50)

    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance: build one frame directly. (The original created two
    # frames with a *set* passed as `columns` -- order-unreliable -- and
    # joined them; a single dict constructor is deterministic.)
    logger.debug('Feature Importances')
    df_tmp = pd.DataFrame({
        'feat_n': model.feature_name(),
        'feat_i': list(model.feature_importance()),
    })
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    # Normalize importances so they sum to 1.
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    # .ix was removed in pandas 1.0; .iloc is the positional equivalent.
    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
            df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model
def run_lgb(trn_X, trn_y, val_X, val_y, tfvocab, cat_vars):
    """Train a LightGBM regressor on a pre-made split with named features.

    NOTE(review): this redefines the earlier two-argument ``run_lgb`` in this
    module; the later definition wins at import time.

    Parameters
    ----------
    trn_X, trn_y : training features / labels.
    val_X, val_y : validation features / labels (used for early stopping).
    tfvocab : list of str
        Feature names (e.g. TF-IDF vocabulary plus engineered columns).
    cat_vars : list
        Names of categorical features for LightGBM's native handling.

    Returns
    -------
    lgb.Booster
        The trained, early-stopped model.
    """
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        # "max_depth": 15,        # tried: [15]
        "num_leaves": 128,        # tried: [256]
        "learning_rate": 0.07,    # tried: [0.018]
        "feature_fraction": 0.7,  # tried: [0.5]
        "bagging_freq": 5,
        "bagging_fraction": 0.7,  # tried: [0.75]
        "bagging_seed": 2018,
        "verbosity": -1,
        # "verbose": 0
    }
    lg_trn = lgb.Dataset(trn_X, label=trn_y,
                         feature_name=tfvocab, categorical_feature=cat_vars)
    lg_val = lgb.Dataset(val_X, label=val_y,
                         feature_name=tfvocab, categorical_feature=cat_vars)
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)

    # Train Start
    model = lgb.train(lg_params, lg_trn, num_boost_round=16000,
                      valid_sets=[lg_val], early_stopping_rounds=200,
                      verbose_eval=100)

    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance: one deterministic frame instead of joining two
    # frames built with a *set* for `columns`.
    logger.debug('Feature Importances')
    df_tmp = pd.DataFrame({
        'feat_n': model.feature_name(),
        'feat_i': list(model.feature_importance()),
    })
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    # Log only the top 50 features; clamp so a smaller frame does not raise
    # IndexError. (.ix was removed in pandas 1.0; .iloc is positional.)
    for i in range(min(50, len(df_tmp.index))):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
            df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model
def run_xgb(train_X, train_y):
    """Train an XGBoost regressor on a random 80/20 split and log metrics.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Full training features (columns[2:] are used for the fmap dump).
    train_y : array-like
        Full training labels.

    Returns
    -------
    xgb.Booster
        The trained, early-stopped model.
    """
    xg_params = {
        "max_depth": 8,           # tried: [4, 6, 8]
        "min_child_weight": 6,    # tried: [4, 6, 8]
        "learning_rate": 0.1,     # tried: [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": 0.8,
        "colsample_bylevel": 0.8,
        "reg_alpha": 0,
    }
    trn_X, val_X, trn_y, val_y = train_test_split(
        train_X, train_y, test_size=0.20, random_state=0)
    xg_trn = xgb.DMatrix(trn_X, label=trn_y)
    xg_val = xgb.DMatrix(val_X, label=val_y)
    watchlist = [(xg_trn, 'train'), (xg_val, 'eval')]
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist)

    model = xgb.train(xg_params, xg_trn, num_boost_round=5000,
                      evals=watchlist, early_stopping_rounds=100,
                      verbose_eval=50)

    # Predict at the early-stopped best tree count.
    pred_trn = model.predict(xg_trn, ntree_limit=model.best_ntree_limit)
    pred_val = model.predict(xg_val, ntree_limit=model.best_ntree_limit)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance via an fmap file (skips the first two columns --
    # presumably ID-like; confirm against the caller's frame layout).
    create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)
    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    # .ix was removed in pandas 1.0; .iloc is the positional equivalent.
    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
            df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    """Produce out-of-fold train and averaged test predictions from a dense NN.

    Parameters
    ----------
    trn_X : sparse matrix / ndarray of shape (trn_rows, n_features)
    trn_y : array-like of shape (trn_rows,)
    trn_rows, test_rows : int
        Row counts used to size the prediction buffers.
    test_X : sparse matrix / ndarray of shape (test_rows, n_features)

    Returns
    -------
    (ndarray, ndarray)
        Column vectors (-1, 1): out-of-fold train predictions and
        fold-averaged test predictions, suitable as stacking features.
    """
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50

    def _build_model():
        # One-line purpose: construct and compile a fresh 192-64-64-1 MLP.
        op = ks.optimizers.Adam(lr=lr)
        model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32',
                            sparse=True)
        out = ks.layers.Dense(192, activation='relu')(model_in)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(1)(out)
        model = ks.Model(model_in, out)
        model.compile(loss='mean_squared_error', optimizer=op)
        return model

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        # BUG FIX: the original compiled ONE model before the loop and kept
        # training it across folds, so weights seen in fold k already encoded
        # fold k+1's held-out rows -- leaky OOF features. Rebuild per fold.
        model = _build_model()
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]
        for j in range(ep):
            # One epoch at a time so per-epoch train RMSE can be logged.
            model.fit(x=x_trn, y=y_trn, batch_size=bz, epochs=1, verbose=0)
            pred_trn = model.predict(x_trn, batch_size=bz)[:, 0]
            rmse_trn = rmse(y_trn, pred_trn)
            logger.debug('epochs {0}: rmse - Train:{1:.6f}'.format(
                j + 1, rmse_trn))
        pred_trn_X[test_i] = model.predict(x_test, batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(test_X, batch_size=bz)[:, 0]

    # Average the per-fold test predictions.
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - NN Train: {}'.format(rmse_trn))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
# ElasticNet Feature Processing '''en_train, en_test = ens_en(ready_df[:train_row], train_y, train_row, ready_df[train_row:], test_row) en_preds = np.concatenate([en_train, en_test]) df['en_preds'] = en_preds del en_preds, en_train, en_test gc.collect()''' # NN Feature Processing - TFIDF # nn_train, nn_test = ens_nn(ready_df[:train_row], # train_y, train_row, # ready_df[train_row:], test_row) # NN Feature Processing - Features logger.debug(df.isnull().sum()) nn_train, nn_test = ens_nn(df[:train_row], train_y, train_row, df[train_row:], test_row) nn_preds = np.concatenate([nn_train, nn_test]) df['nn_preds'] = nn_preds del nn_preds, nn_train, nn_test gc.collect() # XGB Feature Processing '''xgb_train, xgb_test = ens_xgb(ready_df[:train_row], train_y, train_row, ready_df[train_row:], test_row) xgb_preds = np.concatenate([xgb_train, xgb_test]) df['xgb_preds'] = xgb_preds del xgb_preds, xgb_train, xgb_test gc.collect()'''
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    """Out-of-fold / test predictions from a multi-input (embedding-style) NN.

    NOTE(review): this redefines the earlier single-input ``ens_nn`` in this
    module; the later definition wins at import time.

    Parameters
    ----------
    trn_X, test_X : pandas.DataFrame
        Feature frames; only positional columns [1, 2, 3, 4, 8, 11, 5] are
        used, and 'param_feat' is tokenized in place.
    trn_y : array-like of shape (trn_rows,)
    trn_rows, test_rows : int
        Row counts used to size the prediction buffers.

    Returns
    -------
    (ndarray, ndarray)
        Column vectors (-1, 1): out-of-fold train predictions and
        fold-averaged test predictions.
    """
    # Keep only the seven columns the model consumes, positionally.
    # (.ix was removed in pandas 1.0; .iloc is the positional replacement.)
    trn_X = trn_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]
    test_X = test_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]

    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    lr = 0.005
    bz = 100000
    ep = 500
    op = Adam(lr=lr)
    # early = EarlyStopping(monitor='val_loss', patience=500, mode='min')

    logger.info('NN Train Shape: {}'.format(trn_X.shape))
    logger.info('NN Test Shape : {}'.format(test_X.shape))

    # NOTE(review): a single compiled model is shared across all CV folds, so
    # weights trained on earlier folds carry into later ones and the OOF
    # estimates are optimistic -- confirm this is intended (cf. the per-fold
    # rebuild pattern used elsewhere).
    model = make_model_nn(trn_X, test_X)
    model.compile(loss='mean_squared_error', optimizer=op)

    # Fit the tokenizer on the text column, then switch to plain ndarrays so
    # the integer-position slicing below works uniformly.
    trn_X['param_feat'], test_X['param_feat'], tknzr_pf = tknzr_fit(
        'param_feat', trn_X, test_X)
    trn_X = np.array(trn_X)
    test_X = np.array(test_X)
    logger.debug(model.summary())

    def _inputs(m):
        # One-line purpose: split the first six columns into the list of
        # separate arrays the multi-input model expects.
        return [m[:, k] for k in range(6)]

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        # Hold out 10% of this fold's training slice for validation-loss
        # monitoring (shuffle=False keeps the split deterministic).
        x_trn, x_val, y_trn, y_val = train_test_split(
            x_trn, y_trn, test_size=0.10, shuffle=False, random_state=23)
        x_test = trn_X[test_i]

        model.fit(x=_inputs(x_trn), y=y_trn,
                  validation_data=(_inputs(x_val), y_val),
                  batch_size=bz, epochs=ep, verbose=1)
        pred_trn_X[test_i] = model.predict(_inputs(x_test),
                                           batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(_inputs(test_X),
                                            batch_size=bz)[:, 0]

    # Average the per-fold test predictions.
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('Rmse - NN Train: {}, lr: {}, bz: {}, ep: {}'.format(
        rmse_trn, lr, bz, ep))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)