def train_predict(train_file, test_file, predict_valid_file, predict_test_file, n_stop=100, retrain=True):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    model = AutoLGB(objective='binary', metric='auc', n_random_col=0)
    model.tune(pd.DataFrame(X), pd.Series(y))
    params = model.params
    n_est = model.n_best
    logging.info(f'params: {params}')
    logging.info(f'n_best: {n_est}')

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        trn_lgb = lgb.Dataset(X[i_trn], label=y[i_trn])
        val_lgb = lgb.Dataset(X[i_val], label=y[i_val])

        logging.info('Training with early stopping')
        clf = lgb.train(params, trn_lgb, n_est, val_lgb,
                        early_stopping_rounds=n_stop, verbose_eval=100)

        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
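# -----------------------------------------------------------------------------
# The train_predict() functions in this module take file paths rather than
# in-memory arrays, so they are presumably driven from command-line entry
# points. A minimal sketch of such a wrapper follows; the argparse flag names
# are assumptions for illustration, not part of the original code.
# -----------------------------------------------------------------------------
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a model and write out-of-fold and test predictions.')
    parser.add_argument('--train-file', required=True)
    parser.add_argument('--test-file', required=True)
    parser.add_argument('--predict-valid-file', required=True)
    parser.add_argument('--predict-test-file', required=True)
    parser.add_argument('--n-stop', type=int, default=100)
    args = parser.parse_args()

    train_predict(args.train_file, args.test_file,
                  args.predict_valid_file, args.predict_test_file,
                  n_stop=args.n_stop)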
def train_predict(train_file, test_file, predict_valid_file, predict_test_file, n_est, depth, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file, C, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='lr_{}_{}.log'.format(C, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = SVC(C=C, class_weight='auto', random_state=2015, probability=True)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training CV #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.4f}'.format(AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.4f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_imp_file, n_est=100, subrow=.5, n_min=1):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename=f'{model_name}.log')

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    with open(feature_map_file) as f:
        feature_name = [x.strip() for x in f.readlines()]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros((X.shape[0], N_CLASS))
    p_tst = np.zeros((X_tst.shape[0], N_CLASS))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info(f'Training model #{i}')
        clf = RandomForestClassifier(n_estimators=n_est,
                                     min_samples_leaf=n_min,
                                     max_features='auto',
                                     max_samples=subrow,
                                     random_state=SEED,
                                     n_jobs=-1)
        clf.fit(X[i_trn], y[i_trn])
        p[i_val, :] = clf.predict_proba(X[i_val])
        p_tst += clf.predict_proba(X_tst) / N_FOLD
        logging.info(f'CV #{i}: {accuracy_score(y[i_val], np.argmax(p[i_val], axis=1)) * 100:.4f}%')

    imp = pd.DataFrame({'feature': feature_name,
                        'importance': clf.feature_importances_})
    imp = imp.sort_values('importance').set_index('feature')
    imp.to_csv(feature_imp_file)

    logging.info(f'CV: {accuracy_score(y, np.argmax(p, axis=1)) * 100:.4f}%')
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def generate_feature(train_file, test_file, train_feature_file, test_feature_file,
                     encoder_name, encoder_dim, lrate, dropout, model_file,
                     feature_map_file, n_est, n_stop, batch_size):
    logging.info('loading base feature files')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    n_trn = X.shape[0]

    logging.info('combining training and test features')
    X = sparse.vstack((X, X_tst))

    autoencoder, encoder = get_model(model_name=encoder_name,
                                     input_dim=X.shape[1],
                                     encoder_dim=encoder_dim,
                                     learning_rate=lrate,
                                     dropout=dropout)

    logging.info('training an autoencoder')
    logging.info(autoencoder.summary())

    i_trn, i_val = train_test_split(np.arange(X.shape[0]), test_size=.2,
                                    random_state=SEED, shuffle=True)
    es = EarlyStopping(monitor='val_loss', patience=n_stop)
    mcp = ModelCheckpoint(model_file, monitor='val_loss',
                          save_best_only=True, save_weights_only=False)
    h = autoencoder.fit_generator(
        generator(X[i_trn], X[i_trn], batch_size),
        steps_per_epoch=int(np.ceil(len(i_trn) / batch_size)),
        epochs=n_est,
        validation_data=generator(X[i_val], X[i_val], batch_size),
        validation_steps=int(np.ceil(len(i_val) / batch_size)),
        callbacks=[es, mcp])

    val_losses = h.history['val_loss']
    n_best = val_losses.index(min(val_losses)) + 1
    autoencoder.load_weights(model_file)
    logging.info('best epoch={}'.format(n_best))

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(range(encoder_dim)):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    P = encoder.predict_generator(generator(X[:n_trn], None, batch_size),
                                  steps=int(np.ceil(n_trn / batch_size)))
    save_data(sparse.csr_matrix(P), y, train_feature_file)
    P = encoder.predict_generator(generator(X[n_trn:], None, batch_size),
                                  steps=int(np.ceil((X.shape[0] - n_trn) / batch_size)))
    save_data(sparse.csr_matrix(P), None, test_feature_file)
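# -----------------------------------------------------------------------------
# generate_feature() above relies on a project-specific generator() helper that
# is not shown here. A minimal sketch follows, under the assumption that it
# yields dense mini-batches from a sparse matrix indefinitely, as the Keras
# *_generator APIs expect; this is not the original implementation.
# -----------------------------------------------------------------------------
def generator(X, y, batch_size):
    n = X.shape[0]
    while True:  # Keras generators must loop forever
        for start in range(0, n, batch_size):
            sl = slice(start, min(start + batch_size, n))
            X_batch = X[sl].toarray()  # densify only one mini-batch at a time
            if y is None:
                yield X_batch  # prediction mode: features only
            else:
                y_batch = y[sl].toarray() if sparse.issparse(y) else y[sl]
                yield X_batch, y_batch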
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, lrate=.1, l1=.0, l2=.0, n_fold=5):
    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-4]
    algo_name = 'xgl_{}_{}_{}_{}'.format(n_est, lrate, l1, l2)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    param = {'eta': lrate,
             'objective': 'binary:logistic',
             'colsample_bytree': .7,
             'subsample': .5,
             'eval_metric': 'auc',
             'seed': 2015,
             'booster': 'gblinear',
             'alpha': l1,
             'lambda': l2}

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        dtrain = xgb.DMatrix(X[i_trn], label=y[i_trn])
        dvalid = xgb.DMatrix(X[i_val], label=y[i_val])
        watchlist = [(dvalid, 'eval'), (dtrain, 'train')]

        clf = xgb.train(param, dtrain, n_est, watchlist)
        p_val[i_val] = clf.predict(dvalid)
        logging.info('AUC TRN = {:.6f}'.format(AUC(y[i_trn], clf.predict(dtrain))))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    dtrain = xgb.DMatrix(X, label=y)
    # NOTE: the original built the test DMatrix from the raw file path; use the
    # already-loaded feature matrix so it matches the training features.
    dtest = xgb.DMatrix(X_tst)
    watchlist = [(dtrain, 'train')]

    clf = xgb.train(param, dtrain, n_est, watchlist)
    p_tst = clf.predict(dtest)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, batch_size=1024, retrain=True):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0],))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X[i_trn], y[i_trn], batch_size, True),
                          nb_epoch=n_est,
                          samples_per_epoch=X[i_trn].shape[0],
                          verbose=0)
        p[i_val] = clf.predict_generator(generator=batch_generatorp(X[i_val], batch_size, False),
                                         val_samples=X[i_val].shape[0])[:, 0]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                           val_samples=X_tst.shape[0])[:, 0] / N_FOLD

    logging.info('Saving validation predictions...')
    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = nn_model(dims)
        # NOTE: the original passed an undefined `Y` and omitted samples_per_epoch;
        # use the loaded labels `y` and the full training size.
        clf.fit_generator(generator=batch_generator(X, y, batch_size, True),
                          nb_epoch=n_est,
                          samples_per_epoch=X.shape[0])
        p_tst = clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                      val_samples=X_tst.shape[0])[:, 0]

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, depth=4, lrate=.1, n_fold=5, n_bag=50, subrow=.5, subcol=.8):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='xg_bag{}_{}_{}_{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, lrate, subrow, subcol, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(max_depth=depth, learning_rate=lrate, n_estimators=n_est,
                           colsample_bytree=.8, subsample=.5, nthread=4)
    clf = BG(xg, n_estimators=n_bag, max_samples=subrow, max_features=subcol)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file, retrain=True):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        glm = linear_model.LogisticRegression(solver='lbfgs', max_iter=2020,
                                              fit_intercept=True, penalty='none', verbose=0)
        glm.fit(X[i_trn], y[i_trn])
        p[i_val] = glm.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += glm.predict_proba(X_tst)[:, 1] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        glm = linear_model.LogisticRegression(random_state=1, solver='lbfgs', max_iter=2020,
                                              fit_intercept=True, penalty='none', verbose=0)
        # NOTE: the original referenced an undefined `glb`; fit the model defined above
        # and keep only the positive-class probability.
        glm = glm.fit(X, y)
        p_tst = glm.predict_proba(X_tst)[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def calculate_distance(vec_files):
    print(vec_files)
    q1_vec, _ = load_data(vec_files[0])
    q2_vec, _ = load_data(vec_files[1])

    distances = []
    # e.g. 'euclidean', 'cosine', 'l2', 'l1', 'cityblock', 'manhattan'
    for d in sklearn.metrics.pairwise.PAIRED_DISTANCES.keys():
        distances.append(sklearn.metrics.pairwise.paired_distances(q1_vec, q2_vec, metric=d))

    return np.transpose(np.vstack(distances))
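# -----------------------------------------------------------------------------
# calculate_distance() stacks one paired-distance column per metric registered
# in sklearn.metrics.pairwise.PAIRED_DISTANCES, so the result has one row per
# question pair and one column per metric. A hypothetical usage (the .sps file
# names below are made up for illustration):
# -----------------------------------------------------------------------------
# D = calculate_distance(['q1_vec.trn.sps', 'q2_vec.trn.sps'])
# D.shape == (n_pairs, len(sklearn.metrics.pairwise.PAIRED_DISTANCES))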
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cv_id_file, n_est, depth, n_fold=5, retrain=False):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    clf = RF(n_estimators=n_est, max_depth=depth, n_jobs=20, random_state=2016)

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    logging.info('Cross validation...')
    P_val = np.zeros(X.shape[0])
    P_tst = np.zeros(X_tst.shape[0])
    for i in range(1, n_fold + 1):
        logging.info("cv %d" % i)
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]
        logging.debug('train: {}'.format(X[i_trn].shape))
        logging.debug('valid: {}'.format(X[i_val].shape))
        logging.debug(len(set(y[i_trn])))

        clf.fit(X[i_trn], y[i_trn])
        P_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        if not retrain:
            P_tst += clf.predict_proba(X_tst)[:, 1] / n_fold

    if retrain:
        logging.info('Retraining with 100% data...')
        clf.fit(X, y)
        P_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5, n_bag=50):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='et_bag{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    et = ET(n_estimators=n_est, max_depth=depth, random_state=2015,
            class_weight='auto', bootstrap=True)
    clf = BG(et, n_estimators=n_bag, max_samples=.8, max_features=.9)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_iter=100, dim=4, lrate=.1, n_fold=5):
    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'libfm_{}_{}_{}'.format(n_iter, dim, lrate)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training data')
    X, y = load_data(train_file)
    n_tst = sum(1 for line in open(test_file))

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        valid_train_file = os.path.join(dir_feature, '{}.trn{}.sps'.format(feature_name, i))
        valid_test_file = os.path.join(dir_feature, '{}.val{}.sps'.format(feature_name, i))
        valid_predict_file = os.path.join(dir_val, '{}.val{}.yht'.format(model_name, i))

        # If the CV training or validation file does not exist yet, generate them first.
        if (not os.path.isfile(valid_train_file) or
                not os.path.isfile(valid_test_file)):
            dump_svmlight_file(X[i_trn], y[i_trn], valid_train_file, zero_based=False)
            dump_svmlight_file(X[i_val], y[i_val], valid_test_file, zero_based=False)

        subprocess.call(["libFM",
                         "-task", "c",
                         '-dim', '1,1,{}'.format(dim),
                         '-init_stdev', str(lrate),
                         '-iter', str(n_iter),
                         '-train', valid_train_file,
                         '-test', valid_test_file,
                         '-out', valid_predict_file])

        p_val[i_val] = np.loadtxt(valid_predict_file)
        os.remove(valid_predict_file)

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    subprocess.call(["libFM",
                     "-task", "c",
                     '-dim', '1,1,{}'.format(dim),
                     '-init_stdev', str(lrate),
                     '-iter', str(n_iter),
                     '-train', train_file,
                     '-test', test_file,
                     '-out', predict_test_file])
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, retrain=True):
    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        clf = ExtraTreesRegressor(n_estimators=n_est, max_depth=depth, random_state=SEED)
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = ExtraTreesRegressor(n_estimators=n_est, max_depth=depth, random_state=SEED)
        clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_importance_file, retrain=True):
    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        clf = LinearRegression()
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if i == 1:
            df = pd.read_csv(feature_map_file, sep='\t', names=['id', 'name', 'type'])
            df['coef'] = clf.coef_
            df.sort_values('coef', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(feature_importance_file))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = LinearRegression()
        clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def merge_sub_features(train_file, test_file, train_sub_features, test_sub_features,
                       train_feature_file, test_feature_file, lowest):
    trn_subfeat = []
    tst_subfeat = []
    for f_trn, f_tst in zip([x for x in train_sub_features.split(' ') if x],
                            [x for x in test_sub_features.split(' ') if x]):
        logging.info('Reading trn {0} tst {1}'.format(f_trn, f_tst))
        X_sub_trn, _ = load_data(f_trn)
        X_sub_tst, _ = load_data(f_tst)
        if not ssp.issparse(X_sub_trn):
            X_sub_trn = ssp.csr_matrix(X_sub_trn)
            X_sub_tst = ssp.csr_matrix(X_sub_tst)
        trn_subfeat.append(X_sub_trn)
        tst_subfeat.append(X_sub_tst)
        logging.info('Size trn {0} tst {1}'.format(X_sub_trn.shape, X_sub_tst.shape))

    df_train = pd.read_csv(train_file)
    y_train = df_train[TARGET].values

    logging.info('Merge sub features')
    X_trn = ssp.hstack(trn_subfeat).tocsr()
    X_tst = ssp.hstack(tst_subfeat).tocsr()
    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    drop = feature_selection.DropInactive(lowest)
    drop.fit(X_trn)
    X_trn = drop.transform(X_trn)
    X_tst = drop.transform(X_tst)
    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    logging.info('saving features')
    save_data(X_trn, y_train, train_feature_file)
    save_data(X_tst, None, test_feature_file)
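# -----------------------------------------------------------------------------
# feature_selection.DropInactive is project code that is not included here.
# The sketch below is an assumption based on its name and the `lowest`
# argument: drop sparse columns with fewer than `lowest` nonzero entries. It is
# not the original implementation.
# -----------------------------------------------------------------------------
class DropInactive(object):
    """Keep only sparse columns with at least `lowest` nonzero entries (assumed behavior)."""

    def __init__(self, lowest):
        self.lowest = lowest

    def fit(self, X):
        X = ssp.csc_matrix(X)
        nnz_per_col = np.diff(X.indptr)  # nonzero count of each column
        self.cols_ = np.where(nnz_per_col >= self.lowest)[0]
        return self

    def transform(self, X):
        return ssp.csr_matrix(X)[:, self.cols_]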
def train_predict(train_file, test_file, predict_valid_file, predict_test_file):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='avg_{}.log'.format(feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    P_val = X.mean(axis=1)
    P_tst = X_tst.mean(axis=1)

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_test_file, cid_train_file,
                  cid_test_file, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='xg_cid_grid_{}.log'.format(feature_name))

    logging.info('Loading course IDs for training and test data')
    cid_trn = np.loadtxt(cid_train_file, dtype=int)
    cid_tst = np.loadtxt(cid_test_file, dtype=int)

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, y_tst = load_data(test_file)

    xg = xgb.XGBClassifier(subsample=0.4, colsample_bytree=.4, nthread=6)
    param = {'learning_rate': [0.005, .01, .02],
             'max_depth': [4, 5, 6],
             'n_estimators': [200, 400, 800, 1000]}

    p_tst = np.zeros_like(y_tst)
    for j in range(39):
        idx_trn = np.where(cid_trn == j)[0]
        idx_tst = np.where(cid_tst == j)[0]

        cv = StratifiedKFold(y[idx_trn], n_folds=n_fold, shuffle=True, random_state=2015)
        clf = GridSearchCV(xg, param, scoring='roc_auc', verbose=1, cv=cv)
        clf.fit(X[idx_trn], y[idx_trn])
        logging.info('CID #{}: {:.4f} {}'.format(j, clf.best_score_, clf.best_params_))

        logging.info('Retraining with 100% data...')
        clf.best_estimator_.fit(X[idx_trn], y[idx_trn])
        p_tst[idx_tst] = clf.best_estimator_.predict_proba(X_tst[idx_tst])[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='xg_grid_{}.log'.format(feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(subsample=0.5, colsample_bytree=0.8, nthread=4)
    param = {'learning_rate': [.005, .01, .02],
             'max_depth': [4, 5, 6],
             'n_estimators': [100, 200, 400]}
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)
    clf = GridSearchCV(xg, param, scoring='roc_auc', verbose=1, cv=cv)

    logging.info('Cross validation for grid search...')
    clf.fit(X, y)
    p = clf.predict_proba(X)[:, 1]
    logging.info('best model = {}'.format(clf.best_estimator_))
    logging.info('best score = {:.6f}'.format(clf.best_score_))

    logging.info('Retraining with 100% data...')
    clf.best_estimator_.fit(X, y)
    p_tst = clf.best_estimator_.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cv_id_file, n_est, depth, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='et_{}_{}_{}.log'.format(n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    clf = ET(n_estimators=n_est, max_depth=depth, random_state=SEED)

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    logging.info('Cross validation...')
    P_val = np.zeros((X.shape[0], N_CLASS))
    for i in range(1, n_fold + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]
        clf.fit(X[i_trn], y[i_trn])
        P_val[i_val] = clf.predict_proba(X[i_val])

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    P_tst = clf.predict_proba(X_tst)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_imp_file, num_threads, metric,
                  learning_rate, boosting, objective, verbosity, num_boost_round,
                  early_stopping_rounds, verbose_eval, device_type=None, gpu_use_dp=None):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename=f'{model_name}.log')

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    with open(feature_map_file) as f:
        feature_name = [x.strip() for x in f.readlines()]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    if device_type is None or gpu_use_dp is None:
        device_type = 'cpu'
        gpu_use_dp = False

    lgbm_params = {
        'num_threads': num_threads,
        'metric': metric,
        'learning_rate': learning_rate,
        'boosting': boosting,
        'objective': objective,
        'num_class': N_CLASS,
        'random_state': SEED,
        'device_type': device_type,
        'gpu_use_dp': gpu_use_dp,
        'verbosity': verbosity,
    }

    oof_pred = np.zeros((X.shape[0], N_CLASS))
    test_pred = np.zeros((X_tst.shape[0], N_CLASS))
    n_bests = []
    for fold, (trn_idx, val_idx) in enumerate(cv.split(X, y), 1):
        logging.info(f'Training model #{fold}')
        X_trn, X_val = X[trn_idx], X[val_idx]
        y_trn, y_val = y[trn_idx], y[val_idx]
        dtrn = lgbm.Dataset(X_trn, label=y_trn)
        dval = lgbm.Dataset(X_val, label=y_val)

        logging.info('Training with early stopping')
        lgbm_clf = lgbm.train(params=lgbm_params,
                              train_set=dtrn,
                              num_boost_round=num_boost_round,
                              valid_sets=[dtrn, dval],
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose_eval)

        n_best = lgbm_clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        test_pred += lgbm_clf.predict(X_tst) / N_FOLD
        oof_pred[val_idx] += lgbm_clf.predict(X_val)
        logging.info(f'CV #{fold}: {accuracy_score(y_val, np.argmax(oof_pred[val_idx], axis=1)) * 100:.4f}%')

    imp = pd.DataFrame({'feature': feature_name,
                        'importance': lgbm_clf.feature_importance(importance_type='gain',
                                                                  iteration=n_best)})
    imp = imp.sort_values('importance').set_index('feature')
    imp.to_csv(feature_imp_file)

    logging.info(f'CV: {accuracy_score(y, np.argmax(oof_pred, axis=1)) * 100:.4f}%')
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, oof_pred, fmt='%.18f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, test_pred, fmt='%.18f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cv_id_file, n_est=100, hiddens=2, neurons=512, dropout=0.5,
                  batch=16, n_stop=2, retrain=True, n_fold=5):
    feature_name = os.path.basename(train_file).split('.')[0]
    model_name = 'keras_{}_{}_{}_{}_{}_{}_{}'.format(n_est, hiddens, neurons, dropout,
                                                     batch, n_stop, feature_name)

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file, dense=True)
    Y = np_utils.to_categorical(y)
    X_tst, _ = load_data(test_file, dense=True)

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_tst = scaler.transform(X_tst)

    nb_classes = Y.shape[1]
    dims = X.shape[1]
    logging.info('{} classes, {} dims'.format(nb_classes, dims))

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    P_val = np.zeros((Y.shape[0],))
    P_tst = np.zeros((X_tst.shape[0],))
    for i in range(1, n_fold + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]

        logging.info('Training model #{}'.format(i))
        clf = get_model(nb_classes, dims, hiddens, neurons, dropout)
        if i == 1:
            early_stopping = EarlyStopping(monitor='val_loss', patience=n_stop)
            h = clf.fit(X[i_trn], Y[i_trn],
                        validation_data=(X[i_val], Y[i_val]),
                        nb_epoch=n_est,
                        batch_size=batch,
                        callbacks=[early_stopping])
            val_losses = h.history['val_loss']
            n_best = val_losses.index(min(val_losses)) + 1
            logging.info('best epoch={}'.format(n_best))
        else:
            clf.fit(X[i_trn], Y[i_trn],
                    validation_data=(X[i_val], Y[i_val]),
                    nb_epoch=n_best,
                    batch_size=batch)

        P_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{} Log Loss: {:.6f}'.format(i, log_loss(Y[i_val], P_val[i_val])))

        if not retrain:
            P_tst += clf.predict_proba(X_tst)[:, 1] / n_fold

    logging.info('CV Log Loss: {:.6f}'.format(log_loss(y, P_val)))
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = get_model(nb_classes, dims, hiddens, neurons, dropout)
        clf.fit(X, Y, nb_epoch=n_best, batch_size=batch)
        P_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_imp_file, n_est=100, n_leaf=200,
                  lrate=.1, n_min=8, subcol=.3, subrow=.8, subrow_freq=100, n_stop=100):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename=f'{model_name}.log')

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    with open(feature_map_file) as f:
        feature_name = [x.strip() for x in f.readlines()]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    params = {
        'random_state': SEED,
        'num_classes': N_CLASS,
        'n_jobs': -1,
        'objective': 'multiclass',
        'learning_rate': lrate,
        'num_leaves': n_leaf,
        'feature_fraction': subcol,
        'bagging_fraction': subrow,
        'bagging_freq': subrow_freq,
        'min_child_samples': n_min
    }

    p = np.zeros((X.shape[0], N_CLASS))
    p_tst = np.zeros((X_tst.shape[0], N_CLASS))
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info(f'Training model #{i}')
        trn_lgb = lgb.Dataset(X[i_trn], label=y[i_trn])
        val_lgb = lgb.Dataset(X[i_val], label=y[i_val])

        logging.info('Training with early stopping')
        clf = lgb.train(params, trn_lgb, n_est, val_lgb,
                        early_stopping_rounds=n_stop, verbose_eval=100)

        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p[i_val, :] = clf.predict(X[i_val])
        p_tst += clf.predict(X_tst) / N_FOLD
        logging.info(f'CV #{i}: {accuracy_score(y[i_val], np.argmax(p[i_val], axis=1)) * 100:.4f}%')

    imp = pd.DataFrame({'feature': feature_name,
                        'importance': clf.feature_importance(importance_type='gain',
                                                             iteration=n_best)})
    imp = imp.sort_values('importance').set_index('feature')
    imp.to_csv(feature_imp_file)

    logging.info(f'CV: {accuracy_score(y, np.argmax(p, axis=1)) * 100:.4f}%')
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cid_train_file, cid_test_file):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='xg_cid_{}.log'.format(feature_name))

    logging.info('Loading course IDs for training and test data')
    cid_trn = np.loadtxt(cid_train_file, dtype=int)
    cid_tst = np.loadtxt(cid_test_file, dtype=int)

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, y_tst = load_data(test_file)

    cv = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=2015)

    p = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        X_trn = X[i_trn]
        y_trn = y[i_trn]
        X_val = X[i_val]
        y_val = y[i_val]
        cid_valtrn = cid_trn[i_trn]
        cid_valtst = cid_trn[i_val]

        p_trn = np.zeros_like(y_trn)
        p_val = np.zeros_like(y_val)
        for j in range(39):
            idx_trn = np.where(cid_valtrn == j)[0]
            idx_val = np.where(cid_valtst == j)[0]

            clf = xgb.XGBClassifier(max_depth=PARAM[j][2],
                                    learning_rate=PARAM[j][1],
                                    n_estimators=PARAM[j][0],
                                    colsample_bytree=1,
                                    subsample=.4,
                                    nthread=6)
            clf.fit(X_trn[idx_trn], y_trn[idx_trn])
            p_trn[idx_trn] = clf.predict_proba(X_trn[idx_trn])[:, 1]
            p_val[idx_val] = clf.predict_proba(X_val[idx_val])[:, 1]
            logging.info('CID #{}: {:.4f}, {:.4f}'.format(
                j, AUC(y_trn[idx_trn], p_trn[idx_trn]), AUC(y_val[idx_val], p_val[idx_val])))

        logging.info('AUC TRN = {:.6f}'.format(AUC(y_trn, p_trn)))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y_val, p_val)))
        p[i_val] = p_val

    logging.info('AUC = {:.6f}'.format(AUC(y, p)))

    logging.info('Saving CV predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    p_tst = np.zeros_like(y_tst)
    n_tst = len(p_tst)
    for j in range(39):
        idx_trn = np.where(cid_trn == j)[0]
        idx_tst = np.where(cid_tst == j)[0]
        logging.info('CID #{}: {:.2f}%'.format(j, len(idx_tst) / n_tst * 100))
        # NOTE: the original reused the classifier left over from the CV loop here;
        # build a fresh per-course model with the same PARAM settings instead.
        clf = xgb.XGBClassifier(max_depth=PARAM[j][2],
                                learning_rate=PARAM[j][1],
                                n_estimators=PARAM[j][0],
                                colsample_bytree=1,
                                subsample=.4,
                                nthread=6)
        clf.fit(X[idx_trn], y[idx_trn])
        p_tst[idx_tst] = clf.predict_proba(X_tst[idx_tst])[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def train_predict_lr_forward(train_file, test_file, predict_valid_file,
                             predict_test_file, C, n_fold=5):
    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'lr_forward_{}'.format(C)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info("Loading training and test data...")
    X_trn, y_trn = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    logging.info('Normalizing data')
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_tst = scaler.transform(X_tst)

    cv = StratifiedKFold(y_trn, n_folds=n_fold, shuffle=True, random_state=2015)

    selected_features = []
    features_to_test = [x for x in range(X_trn.shape[1]) if x not in selected_features]

    auc_cv_old = .5
    is_improving = True
    while is_improving:
        auc_cvs = []
        for feature in features_to_test:
            logging.info('{}'.format(selected_features + [feature]))
            X = X_trn[:, selected_features + [feature]]

            p_val = np.zeros_like(y_trn)
            for i, (i_trn, i_val) in enumerate(cv, start=1):
                clf = LR(C=C, class_weight='auto', random_state=2014)
                clf.fit(X[i_trn], y_trn[i_trn])
                p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

            auc_cv = AUC(y_trn, p_val)
            logging.info('AUC CV: {:.6f}'.format(auc_cv))
            auc_cvs.append(auc_cv)

        auc_cv_new = max(auc_cvs)
        if auc_cv_new > auc_cv_old:
            auc_cv_old = auc_cv_new
            feature = features_to_test.pop(auc_cvs.index(auc_cv_new))
            selected_features.append(feature)
            logging.info('selected features: {}'.format(selected_features))
        else:
            is_improving = False
            logging.info('final selected features: {}'.format(selected_features))

    logging.info('saving selected features as a file')
    with open('{}_selected.txt'.format(model_name), 'w') as f:
        f.write('{}\n'.format(selected_features))

    X = X_trn[:, selected_features]
    logging.debug('feature matrix: {}x{}'.format(X.shape[0], X.shape[1]))

    p_val = np.zeros_like(y_trn)
    for i, (i_trn, i_val) in enumerate(cv, start=1):
        logging.info('Training CV #{}'.format(i))
        clf = LR(C=C, class_weight='auto', random_state=2015)
        clf.fit(X[i_trn], y_trn[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    auc_cv = AUC(y_trn, p_val)
    logging.info('AUC CV: {:.6f}'.format(auc_cv))

    logging.info("Writing test predictions to file")
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Retraining with 100% data...')
    clf.fit(X, y_trn)
    p_tst = clf.predict_proba(X_tst[:, selected_features])[:, 1]
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, n_leaf=200, lrate=.1, n_min=8, subcol=.3, subrow=.8,
                  subrow_freq=100, n_stop=100, retrain=True):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    y = np.log(y + offset)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(len(y), n_folds=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}'.format(i))
        watchlist = [(X[i_val], y[i_val])]

        logging.info('Training with early stopping')
        clf = lgb.LGBMRegressor(n_estimators=n_est, num_leaves=n_leaf, learning_rate=lrate,
                                min_child_samples=n_min, subsample=subrow,
                                subsample_freq=subrow_freq, colsample_bytree=subcol,
                                objective=fairobj, nthread=1, seed=SEED)
        clf = clf.fit(X[i_trn], y[i_trn], eval_set=watchlist, eval_metric=eval_mae,
                      early_stopping_rounds=n_stop, verbose=10)

        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.4f}'.format(i, MAE(np.exp(y[i_val]), np.exp(p_val[i_val]))))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.4f}'.format(MAE(np.exp(y), np.exp(p_val))))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, np.exp(p_val) - offset, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        n_best = sum(n_bests) // N_FOLD
        clf = lgb.LGBMRegressor(n_estimators=n_best, num_leaves=n_leaf, learning_rate=lrate,
                                min_child_samples=n_min, subsample=subrow,
                                subsample_freq=subrow_freq, colsample_bytree=subcol,
                                objective=fairobj, nthread=1, seed=SEED)
        clf = clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, np.exp(p_tst) - offset, fmt='%.6f', delimiter=',')
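# -----------------------------------------------------------------------------
# fairobj and eval_mae above are custom LightGBM callables defined elsewhere in
# the project. The sketch below only illustrates the sklearn-API conventions
# they would follow (gradient/hessian of a Fair loss and a metric reported on
# the original scale); the constant c and the exact formulas are assumptions,
# not the original code.
# -----------------------------------------------------------------------------
def fairobj(y_true, y_pred, c=2.0):
    # Fair loss: gradient and hessian of c^2 * (|r|/c - log(1 + |r|/c)), with r = pred - true
    r = y_pred - y_true
    grad = c * r / (np.abs(r) + c)
    hess = c ** 2 / (np.abs(r) + c) ** 2
    return grad, hess


def eval_mae(y_true, y_pred):
    # LightGBM sklearn-API metric convention: (name, value, is_higher_better)
    return 'mae', np.mean(np.abs(np.exp(y_pred) - np.exp(y_true))), False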
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cv_id_file, n_est=100, n_leaf=200, lrate=.1, n_min=8, subcol=.3,
                  subrow=.8, subrow_freq=100, n_stop=100, retrain=True):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    n_fold = 5
    for i in range(1, n_fold + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]
        logging.info('Training model #{}'.format(i))
        logging.debug('train: {}'.format(X[i_trn].shape))
        logging.debug('valid: {}'.format(X[i_val].shape))
        watchlist = [(X[i_val], y[i_val])]

        logging.info('Training with early stopping')
        clf = lgb.LGBMRegressor(n_estimators=n_est, num_leaves=n_leaf, learning_rate=lrate,
                                min_child_samples=n_min, subsample=subrow,
                                subsample_freq=subrow_freq, colsample_bytree=subcol,
                                nthread=20, seed=SEED)
        clf = clf.fit(X[i_trn], y[i_trn], eval_set=watchlist, eval_metric="l2",
                      early_stopping_rounds=n_stop, verbose=10)

        n_best = clf.best_iteration if clf.best_iteration > 0 else n_est
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p_val[i_val] = clf.predict(X[i_val])
        if not retrain:
            p_tst += clf.predict(X_tst) / n_fold

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        n_best = sum(n_bests) // n_fold
        clf = lgb.LGBMRegressor(n_estimators=n_best, num_leaves=n_leaf, learning_rate=lrate,
                                min_child_samples=n_min, subsample=subrow,
                                subsample_freq=subrow_freq, colsample_bytree=subcol,
                                nthread=20, seed=SEED)
        clf = clf.fit(X, y, verbose=True)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_feature_file, test_feature_file, predict_valid_file,
                  predict_test_file, C=1.0, class_weight='balanced', max_iter=1000,
                  solver='lbfgs', retrain=True):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_feature_file)
    X_tst, _ = load_data(test_feature_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED).split(X, y)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}'.format(i))
        logging.info('Training Logistic Regression')
        clf = LogisticRegression(C=C, class_weight=class_weight, max_iter=max_iter,
                                 solver=solver, random_state=SEED)
        clf = clf.fit(X[i_trn], y[i_trn])

        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict_proba(X_tst)[:, 1] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = LogisticRegression(C=C, class_weight=class_weight, max_iter=max_iter,
                                 solver=solver, random_state=SEED)
        clf = clf.fit(X, y)
        p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, feature_map_file, predict_valid_file,
                  predict_test_file, feature_importance_file, n_est=100, depth=4,
                  lrate=.1, l2_leaf_reg=1):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info(('n_est={}, depth={}, lrate={}, '
                  'l2_leaf_reg={}').format(n_est, depth, lrate, l2_leaf_reg))
    logging.info('{}'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    if sparse.issparse(X):
        X = X.todense()
        X_tst = X_tst.todense()

    features = pd.read_csv(feature_map_file, sep='\t', header=None,
                           names=['idx', 'name', 'type'])
    cat_cols = features.idx[features.type != 'q'].tolist()

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])

    feature_name, feature_ext = os.path.splitext(train_file)
    feature_name = os.path.splitext(feature_name)[0]
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        cv_train_file = '{}.trn{}{}'.format(feature_name, i, feature_ext)
        cv_test_file = '{}.tst{}{}'.format(feature_name, i, feature_ext)
        if os.path.exists(cv_train_file):
            is_cv_feature = True
            X_cv, _ = load_data(cv_train_file)
            X_tst_cv, _ = load_data(cv_test_file)
            X_trn = np.hstack((X[i_trn], X_cv[i_trn]))
            X_val = np.hstack((X[i_val], X_cv[i_val]))
            X_tst_ = np.hstack((X_tst, X_tst_cv))
        else:
            is_cv_feature = False
            X_trn = X[i_trn]
            X_val = X[i_val]
            X_tst_ = X_tst

        if i == 1:
            logging.info('Training with early stopping')
            clf = cbt.CatBoostRegressor(learning_rate=lrate, depth=depth,
                                        l2_leaf_reg=l2_leaf_reg, iterations=n_est,
                                        loss_function='RMSE', random_seed=SEED,
                                        thread_count=N_JOB)
            if len(cat_cols) > 0:
                clf = clf.fit(X_trn, y[i_trn], eval_set=(X_val, y[i_val]),
                              use_best_model=True, cat_features=cat_cols)
            else:
                clf = clf.fit(X_trn, y[i_trn], eval_set=(X_val, y[i_val]),
                              use_best_model=True)

            n_best = clf.tree_count_
            logging.info('best iteration={}'.format(n_best))

            df = pd.read_csv(feature_map_file, sep='\t', names=['id', 'name', 'type'])
            df['gain'] = clf.feature_importances_
            df.loc[:, 'gain'] = df.gain / df.gain.sum()
            df.sort_values('gain', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(feature_importance_file))
        else:
            clf = cbt.CatBoostRegressor(learning_rate=lrate, depth=depth,
                                        l2_leaf_reg=l2_leaf_reg, iterations=n_best,
                                        loss_function='RMSE', random_seed=SEED,
                                        thread_count=N_JOB)
            if len(cat_cols) > 0:
                clf = clf.fit(X_trn, y[i_trn], eval_set=(X_val, y[i_val]),
                              use_best_model=False, cat_features=cat_cols)
            else:
                clf = clf.fit(X_trn, y[i_trn], eval_set=(X_val, y[i_val]),
                              use_best_model=False)

        p_val[i_val] = clf.predict(X_val)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        p_tst += clf.predict(X_tst_) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  feature_map, retrain=True):
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X_trn, y_trn = load_data(train_file)
    X_tst, _ = load_data(test_file)

    feature_map = pd.read_table(feature_map, index_col=0, header=None,
                                names=['feature_names', 'feature_type'])
    features = feature_map['feature_names'].values

    train_df = pd.DataFrame(data=X_trn.toarray(), columns=feature_map['feature_names'])
    test_df = pd.DataFrame(data=X_tst.toarray(), columns=feature_map['feature_names'])
    train_test = train_df.append(test_df)
    test_data = [test_df.loc[:, features].values[:, k]
                 for k in range(test_df.loc[:, features].values.shape[1])]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=50, shuffle=True, random_state=SEED)

    vld_preds = np.zeros_like(y_trn)
    tst_preds = np.zeros((X_tst.shape[0],))
    for cv_idx, (i_trn, i_vld) in enumerate(cv.split(X_trn, y_trn), 1):
        X_trn_cv = train_df.iloc[i_trn, :].reset_index(drop=True)
        X_vld_cv = train_df.iloc[i_vld, :].reset_index(drop=True)
        y_trn_cv = y_trn[i_trn]
        y_vld_cv = y_trn[i_vld]

        logging.info('Training model #{}'.format(cv_idx))
        clf = create_keras_embedding_model(train_test, features)
        clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])

        X_trn_cv = [X_trn_cv.loc[:, features].values[:, k]
                    for k in range(X_trn_cv.loc[:, features].values.shape[1])]
        X_vld_cv = [X_vld_cv.loc[:, features].values[:, k]
                    for k in range(X_vld_cv.loc[:, features].values.shape[1])]

        es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                     verbose=1, mode='max', baseline=None,
                                     restore_best_weights=True)
        rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=3,
                                          min_lr=1e-6, mode='max', verbose=1)
        clf.fit(X_trn_cv, utils.to_categorical(y_trn_cv),
                validation_data=(X_vld_cv, utils.to_categorical(y_vld_cv)),
                verbose=0, batch_size=1024, callbacks=[es, rlr], epochs=50)

        vld_preds[i_vld] = clf.predict(X_vld_cv)[:, 1]
        # NOTE: score the fold with the sklearn AUC helper, as in the overall CV score below.
        logging.info('CV #{}: {:.4f}'.format(cv_idx, AUC(y_trn[i_vld], vld_preds[i_vld])))

        if not retrain:
            # NOTE: the original referenced an undefined `model` and `N_FOLDS`;
            # use the current fold's model and the fold count of `cv`.
            tst_preds += (clf.predict(test_data)[:, 1] / cv.get_n_splits()).ravel()

    logging.info('Saving validation predictions...')
    logging.info('CV: {:.4f}'.format(AUC(y_trn, vld_preds)))
    np.savetxt(predict_valid_file, vld_preds, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = create_keras_embedding_model(train_test, features)
        clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])

        X_trn_all = [train_df.loc[:, features].values[:, k]
                     for k in range(train_df.loc[:, features].values.shape[1])]
        clf.fit(X_trn_all, utils.to_categorical(y_trn),
                validation_data=(X_trn_all, utils.to_categorical(y_trn)),
                verbose=0, batch_size=1024, callbacks=[es, rlr], epochs=50)
        tst_preds = (clf.predict(test_data)[:, 1]).ravel()

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, tst_preds, fmt='%.6f', delimiter=',')