def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, retrain=True):
    """Cross-validate an ExtraTreesRegressor and save its predictions.

    Validation predictions are produced fold by fold with a shuffled
    ``N_FOLD``-way ``KFold`` split. Test predictions are either the average of
    the fold models (``retrain=False``) or come from a single model refit on
    all of the training data (``retrain=True``).

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        n_est: number of trees (``n_estimators``).
        depth: maximum tree depth (``max_depth``).
        retrain: when true, refit on the full training set for the test
            predictions instead of averaging the fold models.
    """

    def build_model():
        # Every fit in this function uses an identically configured model.
        return ExtraTreesRegressor(n_estimators=n_est,
                                   max_depth=depth,
                                   random_state=SEED)

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    valid_pred = np.zeros(X.shape[0])
    test_pred = np.zeros(X_tst.shape[0])

    fold_no = 0
    for trn_idx, val_idx in folds.split(y):
        fold_no += 1
        logging.info('Training model #{}'.format(fold_no))

        model = build_model()
        model.fit(X[trn_idx], y[trn_idx])

        valid_pred[val_idx] = model.predict(X[val_idx])
        fold_score = kappa(y[val_idx], valid_pred[val_idx])
        logging.info('CV #{}: {:.6f}'.format(fold_no, fold_score))

        # Fold models only contribute to the test prediction when no final
        # refit is requested.
        if not retrain:
            test_pred += model.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, valid_pred)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, valid_pred, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        model = build_model()
        model.fit(X, y)
        test_pred = model.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, test_pred, fmt='%.6f', delimiter=',')
def fit(self, X, y):
    """Find linear weights whose projection ``X.dot(w)`` maximises ``kappa``.

    The optimiser minimises the negated ``kappa`` score, starting from a
    random weight vector drawn from ``self.random_state``, using the method
    ``self.algo`` and tolerance ``self.tol``.

    Args:
        X: feature matrix; converted with ``np.asarray``.
        y: target values passed as the first argument to ``kappa``.

    Returns:
        ``self``, with the optimised weights stored in ``self.coef_``.
    """
    features = np.asarray(X)

    def negative_kappa(weights):
        # ``minimize`` minimises, so flip the sign to maximise kappa.
        return -kappa(y, features.dot(weights))

    start = self.random_state.rand(features.shape[1])
    result = minimize(negative_kappa,
                      x0=start,
                      method=self.algo,
                      tol=self.tol)

    self.coef_ = result.x
    return self
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_importance_file, retrain=True):
    """Cross-validate a LinearRegression model and save its predictions.

    The coefficients of the first fold's model are written, sorted in
    descending order, to ``feature_importance_file``. Test predictions are the
    fold average when ``retrain`` is false, otherwise they come from a model
    refit on all of the training data.

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        feature_map_file: tab-separated file read as columns
            ``id``, ``name``, ``type``.
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        feature_importance_file: output CSV path for the coefficients.
        retrain: when true, refit on the full training set for the test
            predictions instead of averaging the fold models.
    """
    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    valid_pred = np.zeros(X.shape[0])
    test_pred = np.zeros(X_tst.shape[0])

    for fold_no, (trn_idx, val_idx) in enumerate(folds.split(y), start=1):
        logging.info('Training model #{}'.format(fold_no))

        model = LinearRegression()
        model.fit(X[trn_idx], y[trn_idx])

        valid_pred[val_idx] = model.predict(X[val_idx])
        fold_score = kappa(y[val_idx], valid_pred[val_idx])
        logging.info('CV #{}: {:.6f}'.format(fold_no, fold_score))

        # Coefficients are reported once, from the first fold only.
        if fold_no == 1:
            coef_table = pd.read_csv(feature_map_file,
                                     sep='\t',
                                     names=['id', 'name', 'type'])
            coef_table['coef'] = model.coef_
            coef_table = coef_table.sort_values('coef', ascending=False)
            coef_table.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(feature_importance_file))

        if not retrain:
            test_pred += model.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, valid_pred)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, valid_pred, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        model = LinearRegression()
        model.fit(X, y)
        test_pred = model.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, test_pred, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_importance_file, n_est=100,
                  depth=4, lrate=.1, l2_leaf_reg=1):
    """Cross-validate a CatBoostRegressor and save fold-averaged predictions.

    The first fold trains up to ``n_est`` iterations with
    ``use_best_model=True``; the resulting tree count is reused as the
    iteration budget for the remaining folds. Normalised feature importances
    from the first fold are written to ``feature_importance_file``. Test
    predictions are the average over all fold models.

    If a file named ``<base>.trn<i><ext>`` exists for fold ``i`` (where
    ``<base>`` is ``train_file`` with two extensions stripped and ``<ext>`` is
    its last extension), its columns and those of ``<base>.tst<i><ext>`` are
    appended to the training/validation and test matrices for that fold.

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        feature_map_file: tab-separated feature map; rows whose ``type`` is
            not ``'q'`` are treated as categorical columns.
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        feature_importance_file: output CSV path for the feature importances.
        n_est: maximum number of boosting iterations for the first fold.
        depth: tree depth.
        lrate: learning rate.
        l2_leaf_reg: L2 leaf regularisation coefficient.
    """
    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info(('n_est={}, depth={}, lrate={}, '
                  'l2_leaf_reg={}').format(n_est, depth, lrate, l2_leaf_reg))
    logging.info('{}'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    if sparse.issparse(X):
        X = X.todense()
        X_tst = X_tst.todense()

    features = pd.read_csv(feature_map_file, sep='\t', header=None,
                           names=['idx', 'name', 'type'])
    cat_cols = features.idx[features.type != 'q'].tolist()
    # Only forward ``cat_features`` when there is something to declare, as the
    # original code did with two separate fit calls.
    fit_kwargs = {'cat_features': cat_cols} if cat_cols else {}

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])

    feature_name, feature_ext = os.path.splitext(train_file)
    feature_name = os.path.splitext(feature_name)[0]

    n_best = None  # set by the first fold, reused by the later ones
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        cv_train_file = '{}.trn{}{}'.format(feature_name, i, feature_ext)
        cv_test_file = '{}.tst{}{}'.format(feature_name, i, feature_ext)

        if os.path.exists(cv_train_file):
            X_cv, _ = load_data(cv_train_file)
            X_tst_cv, _ = load_data(cv_test_file)

            X_trn = np.hstack((X[i_trn], X_cv[i_trn]))
            X_val = np.hstack((X[i_val], X_cv[i_val]))
            X_tst_ = np.hstack((X_tst, X_tst_cv))
        else:
            X_trn = X[i_trn]
            X_val = X[i_val]
            X_tst_ = X_tst

        first_fold = i == 1
        if first_fold:
            logging.info('Training with early stopping')

        clf = cbt.CatBoostRegressor(learning_rate=lrate,
                                    depth=depth,
                                    l2_leaf_reg=l2_leaf_reg,
                                    iterations=n_est if first_fold else n_best,
                                    loss_function='RMSE',
                                    random_seed=SEED,
                                    thread_count=N_JOB)
        # eval_set is passed as an (X, y) tuple on every fold; the first fold
        # previously passed a two-element list, which is not the documented
        # form for a single validation set.
        clf = clf.fit(X_trn, y[i_trn],
                      eval_set=(X_val, y[i_val]),
                      use_best_model=first_fold,
                      **fit_kwargs)

        if first_fold:
            n_best = clf.tree_count_
            logging.info('best iteration={}'.format(n_best))

            # NOTE(review): when fold-specific columns were stacked above, the
            # model has more features than rows in the feature map, so this
            # assignment presumably fails on a length mismatch - confirm.
            df = pd.read_csv(feature_map_file, sep='\t',
                             names=['id', 'name', 'type'])
            df['gain'] = clf.feature_importances_
            df.loc[:, 'gain'] = df.gain / df.gain.sum()
            df.sort_values('gain', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(
                feature_importance_file))

        p_val[i_val] = clf.predict(X_val)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        p_tst += clf.predict(X_tst_) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, model_file, predict_valid_file,
                  predict_test_file, nn='nn2', n_est=100, lrate=.001,
                  n_stop=100, batch_size=1024):
    """Cross-validate a Keras model and save fold-averaged predictions.

    The first fold trains for up to ``n_est`` epochs with early stopping on
    ``val_loss`` and checkpoints the best model to ``model_file``; the epoch
    with the lowest validation loss is then used as the fixed epoch count for
    the remaining folds. Test predictions are the average over all folds.

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        model_file: checkpoint path used during the first fold.
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        nn: model identifier forwarded to ``get_model``.
        n_est: maximum number of epochs for the first fold.
        lrate: learning rate forwarded to ``get_model``.
        n_stop: early-stopping patience, in epochs.
        batch_size: batch size used by ``generator`` and for step counts.
    """
    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info('{}'.format(model_name))
    logging.info(('{}, n_est={}, lrate={}, n_stop={}, batch_size={}').format(
        nn, n_est, lrate, n_stop, batch_size))

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    if sparse.issparse(X):
        X = X.todense()
        X_tst = X_tst.todense()

    logging.debug('Training ({}), and test ({}) data loaded'.format(
        X.shape, X_tst.shape))

    p = np.zeros_like(y, dtype=float)
    p_tst = np.zeros((X_tst.shape[0], ))
    input_dim = X.shape[1]

    n_best = None  # set by the first fold, reused by the later ones
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = get_model(nn, input_dim, None, lrate)

        trn_steps = int(np.ceil(len(i_trn) / batch_size))
        val_steps = int(np.ceil(len(i_val) / batch_size))
        trn_gen = generator(X[i_trn], y[i_trn], batch_size)
        val_gen = generator(X[i_val], y[i_val], batch_size)

        if i == 1:
            # ``summary()`` prints and returns None, so logging its return
            # value only records "None". Capture the lines instead. Extra
            # keyword arguments some Keras versions pass are ignored.
            summary_lines = []
            clf.summary(
                print_fn=lambda line, **_: summary_lines.append(line))
            logging.info('\n'.join(summary_lines))

            es = EarlyStopping(monitor='val_loss', patience=n_stop)
            mcp = ModelCheckpoint(model_file, monitor='val_loss',
                                  save_best_only=True,
                                  save_weights_only=False)
            h = clf.fit_generator(trn_gen,
                                  steps_per_epoch=trn_steps,
                                  epochs=n_est,
                                  validation_data=val_gen,
                                  validation_steps=val_steps,
                                  callbacks=[es, mcp])

            # Epochs are 1-based: index of the lowest validation loss + 1.
            val_losss = h.history['val_loss']
            n_best = val_losss.index(min(val_losss)) + 1

            # Restore the checkpointed best weights before predicting.
            clf.load_weights(model_file)
            logging.info('best epoch={}'.format(n_best))
        else:
            clf.fit_generator(trn_gen,
                              steps_per_epoch=trn_steps,
                              epochs=n_best,
                              validation_data=val_gen,
                              validation_steps=val_steps)

        p[i_val] = clf.predict(X[i_val]).flatten()
        logging.info('CV {} kappa={:.6f}, best iteration={}'.format(
            i, kappa(y[i_val], p[i_val]), n_best))

        p_tst += clf.predict(X_tst).flatten() / N_FOLD

    logging.info('CV kappa: {:.6f}'.format(kappa(y, p)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_importance_file, n_est=100,
                  depth=4, lrate=.1, subcol=.5, subrow=.5, sublev=1, weight=1,
                  n_stop=100, retrain=True):
    """Cross-validate an XGBoost regressor and save its predictions.

    The first fold trains with early stopping; the number of rounds up to and
    including its best iteration is reused for the remaining folds and for
    the optional final refit. Normalised f-scores from the first fold are
    written to ``feature_importance_file``. Test predictions are the fold
    average when ``retrain`` is false, otherwise they come from a model refit
    on all of the training data.

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        feature_map_file: feature map passed to ``Booster.get_fscore``.
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        feature_importance_file: output CSV path for the feature importances.
        n_est: maximum number of boosting rounds for the first fold.
        depth: ``max_depth``.
        lrate: ``eta``.
        subcol: ``colsample_bytree``.
        subrow: ``subsample``.
        sublev: ``colsample_bylevel``.
        weight: ``min_child_weight``.
        n_stop: early-stopping rounds for the first fold.
        retrain: when true, refit on the full training set for the test
            predictions instead of averaging the fold models.
    """
    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info(
        ('n_est={}, depth={}, lrate={}, '
         'subcol={}, subrow={}, sublev={},'
         'weight={}, n_stop={}').format(n_est, depth, lrate, subcol, subrow,
                                        sublev, weight, n_stop))
    logging.info('{}'.format(model_name))

    # set xgb parameters
    params = {
        'objective': "reg:linear",
        'max_depth': depth,
        'eta': lrate,
        'subsample': subrow,
        'colsample_bytree': subcol,
        'colsample_bylevel': sublev,
        'min_child_weight': weight,
        'eval_metric': 'rmse',
        'silent': 1,
        'nthread': N_JOB,
        'seed': SEED
    }

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    xgtst = xgb.DMatrix(X_tst)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])

    n_best = None  # number of boosting rounds, set by the first fold
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        xgtrn = xgb.DMatrix(X[i_trn], label=y[i_trn])
        xgval = xgb.DMatrix(X[i_val], label=y[i_val])

        logging.info('Training model #{}'.format(i))
        watchlist = [(xgtrn, 'train'), (xgval, 'val')]

        if i == 1:
            logging.info('Training with early stopping')
            clf = xgb.train(params, xgtrn, n_est, watchlist,
                            early_stopping_rounds=n_stop)

            # ``best_iteration`` is a 0-based round index, whereas
            # ``num_boost_round`` and ``ntree_limit`` are counts. Using the
            # index directly drops the best round, and an index of 0 would
            # mean "all trees" for ntree_limit and zero rounds for training.
            best_iteration = clf.best_iteration
            n_best = best_iteration + 1
            logging.info('best iteration={}'.format(best_iteration))

            importance = clf.get_fscore(feature_map_file)
            df = pd.DataFrame.from_dict(importance, 'index')
            df.index.name = 'name'
            df.columns = ['fscore']
            df.loc[:, 'fscore'] = df.fscore / df.fscore.sum()
            df.sort_values('fscore', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=True)
            logging.info('feature importance is saved in {}'.format(
                feature_importance_file))
        else:
            clf = xgb.train(params, xgtrn, n_best, watchlist)

        p_val[i_val] = clf.predict(xgval, ntree_limit=n_best)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict(xgtst, ntree_limit=n_best) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        xgtrn = xgb.DMatrix(X, label=y)
        watchlist = [(xgtrn, 'train')]
        clf = xgb.train(params, xgtrn, n_best, watchlist)
        p_tst = clf.predict(xgtst, ntree_limit=n_best)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, n_leaf=200, lrate=.1, n_min=8, subcol=.3,
                  subrow=.8, subrow_freq=100, n_stop=100, retrain=True,
                  log_file=None):
    """Cross-validate an LGBMRegressor and save its predictions.

    Logging is configured here to write to ``log_file`` (default:
    ``<model name>.log``, derived from ``predict_test_file``). The first fold
    trains with early stopping; its best iteration is reused as
    ``n_estimators`` for the remaining folds and the optional final refit.
    Test predictions are the fold average when ``retrain`` is false,
    otherwise they come from a model refit on all of the training data.

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        n_est: maximum number of trees for the first fold.
        n_leaf: ``num_leaves``.
        lrate: ``learning_rate``.
        n_min: ``min_child_samples``.
        subcol: ``colsample_bytree``.
        subrow: ``subsample``.
        subrow_freq: ``subsample_freq``.
        n_stop: early-stopping rounds for the first fold.
        retrain: when true, refit on the full training set for the test
            predictions instead of averaging the fold models.
        log_file: log file path; derived from the model name when ``None``.
    """
    base = os.path.basename(predict_test_file)
    model_name = os.path.splitext(os.path.splitext(base)[0])[0]

    if log_file is None:
        log_file = '{}.log'.format(model_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=log_file,
                        datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('{}'.format(model_name))
    logging.info(('n_est={}, n_leaf={}, lrate={}, '
                  'n_min={}, subcol={}, subrow={},'
                  'subrow_freq={}, n_stop={}').format(n_est, n_leaf, lrate,
                                                      n_min, subcol, subrow,
                                                      subrow_freq, n_stop))

    def build_model(n_rounds):
        # All models share every hyper-parameter except the tree count.
        return lgb.LGBMRegressor(n_estimators=n_rounds,
                                 num_leaves=n_leaf,
                                 learning_rate=lrate,
                                 min_child_samples=n_min,
                                 subsample=subrow,
                                 subsample_freq=subrow_freq,
                                 colsample_bytree=subcol,
                                 objective='regression',
                                 n_jobs=N_JOB,
                                 random_state=SEED)

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    valid_pred = np.zeros(X.shape[0])
    test_pred = np.zeros(X_tst.shape[0])

    for fold_no, (trn_idx, val_idx) in enumerate(folds.split(y), start=1):
        logging.info('Training model #{}'.format(fold_no))

        eval_pairs = [(X[val_idx], y[val_idx])]
        fit_opts = dict(eval_set=eval_pairs, eval_metric='rmse', verbose=10)

        if fold_no == 1:
            logging.info('Training with early stopping')
            model = build_model(n_est)
            model = model.fit(X[trn_idx], y[trn_idx],
                              early_stopping_rounds=n_stop,
                              **fit_opts)
            n_best = model.best_iteration_
            logging.info('best iteration={}'.format(n_best))
        else:
            model = build_model(n_best)
            model = model.fit(X[trn_idx], y[trn_idx], **fit_opts)

        valid_pred[val_idx] = model.predict(X[val_idx])
        fold_score = kappa(y[val_idx], valid_pred[val_idx])
        logging.info('CV #{}: {:.6f}'.format(fold_no, fold_score))

        if not retrain:
            test_pred += model.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, valid_pred)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, valid_pred, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        model = build_model(n_best)
        model = model.fit(X, y)
        test_pred = model.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, test_pred, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, n_leaf=200, lrate=.1, n_min=8, subcol=.3,
                  subrow=.8, subrow_freq=100, n_stop=100, log_file=None):
    """Cross-validate a LightGBM multiclass model and save class predictions.

    Logging is configured here to write to ``log_file`` (default:
    ``<model name>.log``, derived from ``predict_test_file``). The first fold
    trains with early stopping; its best iteration is reused as the number of
    boosting rounds for the remaining folds. Validation predictions are the
    arg-max class per row; test predictions are the arg-max of the class
    probabilities averaged over all fold models.

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        n_est: maximum number of boosting rounds for the first fold.
        n_leaf: ``num_leaves``.
        lrate: ``learning_rate``.
        n_min: ``min_data_in_leaf``.
        subcol: ``feature_fraction``.
        subrow: ``bagging_fraction``.
        subrow_freq: ``bagging_freq``.
        n_stop: early-stopping rounds for the first fold.
        log_file: log file path; derived from the model name when ``None``.
    """
    # Number of target classes; sizes both the model output and the
    # accumulated test probability matrix.
    n_class = 21

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    if log_file is None:
        log_file = '{}.log'.format(model_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=log_file,
                        datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('{}'.format(model_name))
    logging.info(('n_est={}, n_leaf={}, lrate={}, '
                  'n_min={}, subcol={}, subrow={},'
                  'subrow_freq={}, n_stop={}').format(n_est, n_leaf, lrate,
                                                      n_min, subcol, subrow,
                                                      subrow_freq, n_stop))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    # Heuristic: a column is treated as categorical when its value in the
    # FIRST row is integral. Other rows are not inspected.
    cat_cols = [i for i in range(X.shape[1])
                if int(X[0, i]) == X[0, i]]

    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': n_class,
        'num_leaves': n_leaf,
        'learning_rate': lrate,
        'feature_fraction': subcol,
        'bagging_fraction': subrow,
        'bagging_freq': subrow_freq,
        'min_data_in_leaf': n_min,
        'metric_freq': 10,
        'is_training_metric': True,
        'verbose': 0,
        'num_threads': N_JOB
    }

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    P_tst = np.zeros((X_tst.shape[0], n_class))

    n_best = None  # set by the first fold, reused by the later ones
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))

        lgb_trn = lgb.Dataset(X[i_trn], y[i_trn])
        lgb_val = lgb.Dataset(X[i_val], y[i_val])

        if i == 1:
            logging.info('Training with early stopping')
            clf = lgb.train(params,
                            lgb_trn,
                            num_boost_round=n_est,
                            early_stopping_rounds=n_stop,
                            valid_sets=lgb_val,
                            categorical_feature=cat_cols)
            n_best = clf.best_iteration
            logging.info('best iteration={}'.format(n_best))
        else:
            clf = lgb.train(params,
                            lgb_trn,
                            num_boost_round=n_best,
                            valid_sets=lgb_val,
                            categorical_feature=cat_cols)

        # Predictions are per-class probabilities; keep the arg-max class.
        p_val[i_val] = np.argmax(clf.predict(X[i_val]), axis=1)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        P_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, np.argmax(P_tst, axis=1), fmt='%.6f',
               delimiter=',')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_importance_file, n_est=100,
                  n_leaf=200, lrate=.1, n_min=8, subcol=.3, subrow=.8,
                  subrow_freq=100, n_stop=100, retrain=True):
    """Cross-validate a LightGBM regressor and save its predictions.

    The first fold trains with early stopping; its best iteration is reused
    as the number of boosting rounds for the remaining folds and the optional
    final refit. Normalised gain importances from the first fold are written
    to ``feature_importance_file``.

    If a file named ``<base>.trn<i><ext>`` exists for fold ``i`` (where
    ``<base>`` is ``train_file`` with two extensions stripped and ``<ext>`` is
    its last extension), its columns and those of ``<base>.tst<i><ext>`` are
    appended to that fold's matrices. The final refit uses only the base
    features.

    Args:
        train_file: training data path, read with ``load_data``.
        test_file: test data path, read with ``load_data`` (labels ignored).
        feature_map_file: tab-separated file read as columns
            ``id``, ``name``, ``type``.
        predict_valid_file: output path for the validation predictions.
        predict_test_file: output path for the test predictions.
        feature_importance_file: output CSV path for the feature importances.
        n_est: maximum number of boosting rounds for the first fold.
        n_leaf: ``num_leaves``.
        lrate: ``learning_rate``.
        n_min: ``min_data_in_leaf``.
        subcol: ``feature_fraction``.
        subrow: ``bagging_fraction``.
        subrow_freq: ``bagging_freq``.
        n_stop: early-stopping rounds for the first fold.
        retrain: when true, refit on the full training set for the test
            predictions instead of averaging the fold models.
    """
    base = os.path.basename(predict_test_file)
    model_name = os.path.splitext(os.path.splitext(base)[0])[0]

    logging.info('{}'.format(model_name))
    logging.info(('n_est={}, n_leaf={}, lrate={}, '
                  'n_min={}, subcol={}, subrow={},'
                  'subrow_freq={}, n_stop={}').format(n_est, n_leaf, lrate,
                                                      n_min, subcol, subrow,
                                                      subrow_freq, n_stop))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': n_leaf,
        'learning_rate': lrate,
        'feature_fraction': subcol,
        'bagging_fraction': subrow,
        'bagging_freq': subrow_freq,
        'min_data_in_leaf': n_min,
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'data_random_seed': SEED,
        'metric': 'rmse',
        'verbose': 0,
        'num_threads': N_JOB
    }

    logging.info('Loading CV Ids')
    folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    valid_pred = np.zeros(X.shape[0])
    test_pred = np.zeros(X_tst.shape[0])

    stem, feature_ext = os.path.splitext(train_file)
    feature_name = os.path.splitext(stem)[0]

    for fold_no, (trn_idx, val_idx) in enumerate(folds.split(y), start=1):
        logging.info('Training model #{}'.format(fold_no))

        cv_train_file = '{}.trn{}{}'.format(feature_name, fold_no, feature_ext)
        cv_test_file = '{}.tst{}{}'.format(feature_name, fold_no, feature_ext)

        # Build this fold's matrices once; they are reused for both the
        # LightGBM datasets and the prediction calls below.
        has_fold_features = os.path.exists(cv_train_file)
        if has_fold_features:
            X_cv, _ = load_data(cv_train_file)
            X_tst_cv, _ = load_data(cv_test_file)
            trn_matrix = np.hstack((X[trn_idx], X_cv[trn_idx]))
            val_matrix = np.hstack((X[val_idx], X_cv[val_idx]))
        else:
            trn_matrix = X[trn_idx]
            val_matrix = X[val_idx]

        lgb_trn = lgb.Dataset(trn_matrix, y[trn_idx])
        lgb_val = lgb.Dataset(val_matrix, y[val_idx])

        if fold_no == 1:
            logging.info('Training with early stopping')
            booster = lgb.train(params,
                                lgb_trn,
                                num_boost_round=n_est,
                                early_stopping_rounds=n_stop,
                                valid_sets=lgb_val,
                                verbose_eval=100)
            n_best = booster.best_iteration
            logging.info('best iteration={}'.format(n_best))

            gain_table = pd.read_csv(feature_map_file,
                                     sep='\t',
                                     names=['id', 'name', 'type'])
            gain_table['gain'] = booster.feature_importance(
                importance_type='gain', iteration=n_best)
            gain_table.loc[:, 'gain'] = gain_table.gain / gain_table.gain.sum()
            gain_table = gain_table.sort_values('gain', ascending=False)
            gain_table.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(
                feature_importance_file))
        else:
            booster = lgb.train(params,
                                lgb_trn,
                                num_boost_round=n_best,
                                valid_sets=lgb_val,
                                verbose_eval=100)

        valid_pred[val_idx] = booster.predict(val_matrix)
        fold_score = kappa(y[val_idx], valid_pred[val_idx])
        logging.info('CV #{}: {:.6f}'.format(fold_no, fold_score))

        if not retrain:
            if has_fold_features:
                tst_matrix = np.hstack((X_tst, X_tst_cv))
            else:
                tst_matrix = X_tst
            test_pred += booster.predict(tst_matrix) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, valid_pred)))

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, valid_pred, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        lgb_trn = lgb.Dataset(X, y)
        booster = lgb.train(params,
                            lgb_trn,
                            num_boost_round=n_best,
                            verbose_eval=100)
        test_pred = booster.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, test_pred, fmt='%.6f', delimiter=',')