def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est,
                  depth,
                  retrain=True):

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])

    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))

        clf = ExtraTreesRegressor(n_estimators=n_est,
                                  max_depth=depth,
                                  random_state=SEED)
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = ExtraTreesRegressor(n_estimators=n_est,
                                  max_depth=depth,
                                  random_state=SEED)
        clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
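Every train_predict variant on this page references the same module-level helpers and constants (load_data, kappa, N_FOLD, N_JOB, SEED) without defining them. A minimal sketch of plausible definitions, assuming svmlight-format feature files and a quadratic weighted kappa metric; the names match the snippets, but the bodies and values are assumptions:

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import cohen_kappa_score

N_FOLD = 5    # number of CV folds; value is an assumption
N_JOB = -1    # use all cores; value is an assumption
SEED = 42     # value is an assumption


def load_data(path):
    # Assumption: features are stored in svmlight/libsvm format.
    X, y = load_svmlight_file(path)
    return X, y


def kappa(y_true, y_pred):
    # Assumption: quadratic weighted kappa, with continuous predictions
    # rounded and clipped to the label range before scoring.
    y_round = np.clip(np.round(y_pred), y_true.min(), y_true.max())
    return cohen_kappa_score(y_true, y_round, weights='quadratic')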
    def fit(self, X, y):
        # Method of an ensemble-weight optimizer (see the reconstruction
        # below): search for blending weights that maximize kappa between
        # y and the weighted sum of base-model predictions in X.
        X = np.asarray(X)
        res = minimize(lambda x: -kappa(y, X.dot(x)),
                       x0=self.random_state.rand(X.shape[1]),
                       method=self.algo,
                       tol=self.tol)
        self.coef_ = res.x

        return self
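The fit method above arrives detached from its class; judging by the attributes it touches (algo, tol, random_state), it belongs to a small ensemble-weight optimizer built on scipy.optimize.minimize. A hedged reconstruction of the enclosing class, where the class name and defaults are assumptions:

import numpy as np
from numpy.random import RandomState
from scipy.optimize import minimize


class LinearBlender:  # hypothetical name
    def __init__(self, algo='Nelder-Mead', tol=1e-6, random_state=None):
        self.algo = algo
        self.tol = tol
        self.random_state = RandomState(random_state)

    # fit() as above: finds coef_ maximizing kappa(y, X.dot(coef_))

    def predict(self, X):
        # Blend base-model predictions with the learned weights.
        return np.asarray(X).dot(self.coef_)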
Example #3
def train_predict(train_file, test_file, feature_map_file, predict_valid_file, predict_test_file,
                  feature_importance_file, retrain=True):

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])

    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))

        clf = LinearRegression()
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if i == 1:
            # Save the linear model's (signed) coefficients as a rough
            # measure of feature importance.
            df = pd.read_csv(feature_map_file, sep='\t', names=['id', 'name', 'type'])
            df['coef'] = clf.coef_
            df.sort_values('coef', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(feature_importance_file))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = LinearRegression()
        clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
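The saved validation and test prediction files are the raw material for stacking: out-of-fold predictions from several base models become the columns of a second-level feature matrix. A sketch with hypothetical file names:

import numpy as np

base_preds = ['et.val.txt', 'lr.val.txt', 'xgb.val.txt']  # hypothetical
X_stack = np.column_stack([np.loadtxt(f) for f in base_preds])
# X_stack has one column per base model; fit a blender such as the
# LinearBlender sketched above, or any second-level model, on (X_stack, y).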
Example #4
def train_predict(train_file,
                  test_file,
                  feature_map_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_importance_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  l2_leaf_reg=1):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info(('n_est={}, depth={}, lrate={}, '
                  'l2_leaf_reg={}').format(n_est, depth, lrate, l2_leaf_reg))

    logging.info('{}'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    if sparse.issparse(X):
        # Densify: this variant feeds the matrices (with cat_features)
        # straight to CatBoost.
        X = X.todense()
        X_tst = X_tst.todense()

    features = pd.read_csv(feature_map_file,
                           sep='\t',
                           header=None,
                           names=['idx', 'name', 'type'])
    cat_cols = features.idx[features.type != 'q'].tolist()

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    feature_name, feature_ext = os.path.splitext(train_file)
    feature_name = os.path.splitext(feature_name)[0]

    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        cv_train_file = '{}.trn{}{}'.format(feature_name, i, feature_ext)
        cv_test_file = '{}.tst{}{}'.format(feature_name, i, feature_ext)

        if os.path.exists(cv_train_file):
            is_cv_feature = True
            X_cv, _ = load_data(cv_train_file)
            X_tst_cv, _ = load_data(cv_test_file)

            X_trn = np.hstack((X[i_trn], X_cv[i_trn]))
            X_val = np.hstack((X[i_val], X_cv[i_val]))
            X_tst_ = np.hstack((X_tst, X_tst_cv))
        else:
            is_cv_feature = False
            X_trn = X[i_trn]
            X_val = X[i_val]
            X_tst_ = X_tst

        if i == 1:
            logging.info('Training with early stopping')
            clf = cbt.CatBoostRegressor(learning_rate=lrate,
                                        depth=depth,
                                        l2_leaf_reg=l2_leaf_reg,
                                        iterations=n_est,
                                        loss_function='RMSE',
                                        random_seed=SEED,
                                        thread_count=N_JOB)

            if len(cat_cols) > 0:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=True,
                              cat_features=cat_cols)
            else:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=True)

            n_best = clf.tree_count_
            logging.info('best iteration={}'.format(n_best))

            df = pd.read_csv(feature_map_file,
                             sep='\t',
                             names=['id', 'name', 'type'])
            df['gain'] = clf.feature_importances_
            df.loc[:, 'gain'] = df.gain / df.gain.sum()
            df.sort_values('gain', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(
                feature_importance_file))
        else:
            clf = cbt.CatBoostRegressor(learning_rate=lrate,
                                        depth=depth,
                                        l2_leaf_reg=l2_leaf_reg,
                                        iterations=n_best,
                                        loss_function='RMSE',
                                        random_seed=SEED,
                                        thread_count=N_JOB)

            if len(cat_cols) > 0:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=False,
                              cat_features=cat_cols)
            else:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=False)

        p_val[i_val] = clf.predict(X_val)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        p_tst += clf.predict(X_tst_) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
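The cv_train_file/cv_test_file lookup above implements a simple convention for fold-specific features (for example, out-of-fold target encodings): given a base feature file, per-fold companions are derived by inserting .trn<i>/.tst<i> before the final extension. A quick illustration with a hypothetical file name:

import os

train_file = 'esb1.trn.sps'  # hypothetical
feature_name, feature_ext = os.path.splitext(train_file)  # 'esb1.trn', '.sps'
feature_name = os.path.splitext(feature_name)[0]          # 'esb1'
print('{}.trn{}{}'.format(feature_name, 1, feature_ext))  # esb1.trn1.sps
print('{}.tst{}{}'.format(feature_name, 1, feature_ext))  # esb1.tst1.sps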
Example #5
def train_predict(train_file,
                  test_file,
                  model_file,
                  predict_valid_file,
                  predict_test_file,
                  nn='nn2',
                  n_est=100,
                  lrate=.001,
                  n_stop=100,
                  batch_size=1024):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info('{}'.format(model_name))
    logging.info(('{}, n_est={}, lrate={}, n_stop={}, batch_size={}').format(
        nn, n_est, lrate, n_stop, batch_size))

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    if sparse.issparse(X):
        # Keras layers expect dense arrays.
        X = X.todense()
        X_tst = X_tst.todense()

    logging.debug('Training ({}), and test ({}) data loaded'.format(
        X.shape, X_tst.shape))

    n_bests = []
    p = np.zeros_like(y, dtype=float)
    p_tst = np.zeros((X_tst.shape[0], ))
    input_dim = X.shape[1]
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = get_model(nn, input_dim, None, lrate)
        if i == 1:
            clf.summary(print_fn=logging.info)
            es = EarlyStopping(monitor='val_loss', patience=n_stop)
            mcp = ModelCheckpoint(model_file,
                                  monitor='val_loss',
                                  save_best_only=True,
                                  save_weights_only=False)
            h = clf.fit_generator(
                generator(X[i_trn], y[i_trn], batch_size),
                steps_per_epoch=int(np.ceil(len(i_trn) / batch_size)),
                epochs=n_est,
                validation_data=generator(X[i_val], y[i_val], batch_size),
                validation_steps=int(np.ceil(len(i_val) / batch_size)),
                callbacks=[es, mcp])

            val_losses = h.history['val_loss']
            n_best = val_losses.index(min(val_losses)) + 1
            clf.load_weights(model_file)
            logging.info('best epoch={}'.format(n_best))
        else:
            clf.fit_generator(
                generator(X[i_trn], y[i_trn], batch_size),
                steps_per_epoch=int(np.ceil(len(i_trn) / batch_size)),
                epochs=n_best,
                validation_data=generator(X[i_val], y[i_val], batch_size),
                validation_steps=int(np.ceil(len(i_val) / batch_size)))

        p[i_val] = clf.predict(X[i_val]).flatten()
        logging.info('CV {} kappa={:.6f}, best iteration={}'.format(
            i, kappa(y[i_val], p[i_val]), n_best))
        p_tst += clf.predict(X_tst).flatten() / N_FOLD

    logging.info('CV kappa: {:.6f}'.format(kappa(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
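This variant depends on two undefined helpers: generator, an infinite mini-batch generator (fit_generator expects it to cycle forever), and get_model, a factory returning a compiled Keras model for the given architecture name. Minimal sketches under those assumptions; the MLP layout in get_model is illustrative, not the original architecture:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam


def generator(X, y, batch_size):
    # Yield shuffled mini-batches indefinitely, as fit_generator requires.
    n = X.shape[0]
    while True:
        idx = np.random.permutation(n)
        for start in range(0, n, batch_size):
            batch = idx[start:start + batch_size]
            yield X[batch], y[batch]


def get_model(nn, input_dim, n_class, lrate):
    # Assumed factory: a small MLP regressor. The real architectures keyed
    # by `nn` (e.g. 'nn2') are not shown in these snippets.
    model = Sequential([
        Dense(512, activation='relu', input_dim=input_dim),
        Dropout(.5),
        Dense(256, activation='relu'),
        Dense(1)
    ])
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=lrate))
    return model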
Example #6
def train_predict(train_file,
                  test_file,
                  feature_map_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_importance_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  subcol=.5,
                  subrow=.5,
                  sublev=1,
                  weight=1,
                  n_stop=100,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info(
        ('n_est={}, depth={}, lrate={}, '
         'subcol={}, subrow={}, sublev={}, '
         'weight={}, n_stop={}').format(n_est, depth, lrate, subcol, subrow,
                                        sublev, weight, n_stop))

    logging.info('{}'.format(model_name))
    # set xgb parameters
    params = {
        'objective': "reg:linear",
        'max_depth': depth,
        'eta': lrate,
        'subsample': subrow,
        'colsample_bytree': subcol,
        'colsample_bylevel': sublev,
        'min_child_weight': weight,
        'eval_metric': 'rmse',
        'silent': 1,
        'nthread': N_JOB,
        'seed': SEED
    }

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    xgtst = xgb.DMatrix(X_tst)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        xgtrn = xgb.DMatrix(X[i_trn], label=y[i_trn])
        xgval = xgb.DMatrix(X[i_val], label=y[i_val])

        logging.info('Training model #{}'.format(i))
        watchlist = [(xgtrn, 'train'), (xgval, 'val')]

        if i == 1:
            logging.info('Training with early stopping')
            clf = xgb.train(params,
                            xgtrn,
                            n_est,
                            watchlist,
                            early_stopping_rounds=n_stop)
            n_best = clf.best_iteration
            logging.info('best iteration={}'.format(n_best))

            importance = clf.get_fscore(feature_map_file)
            df = pd.DataFrame.from_dict(importance, 'index')
            df.index.name = 'name'
            df.columns = ['fscore']
            df.loc[:, 'fscore'] = df.fscore / df.fscore.sum()
            df.sort_values('fscore', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=True)
            logging.info('feature importance is saved in {}'.format(
                feature_importance_file))
        else:
            clf = xgb.train(params, xgtrn, n_best, watchlist)

        p_val[i_val] = clf.predict(xgval, ntree_limit=n_best)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict(xgtst, ntree_limit=n_best) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        xgtrn = xgb.DMatrix(X, label=y)
        watchlist = [(xgtrn, 'train')]
        clf = xgb.train(params, xgtrn, n_best, watchlist)
        p_tst = clf.predict(xgtst, ntree_limit=n_best)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
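Both the get_fscore call above and the cat_cols lookup in the CatBoost variant read the same feature map file, which follows XGBoost's fmap format: one tab-separated row per feature with index, name, and type ('q' for quantitative, 'i' for indicator, 'int' for integer). A hypothetical excerpt:

0	age	q
1	gender	i
2	num_rooms	int

Any column typed other than 'q' is treated as categorical by the CatBoost variant.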
Example #7
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  n_leaf=200,
                  lrate=.1,
                  n_min=8,
                  subcol=.3,
                  subrow=.8,
                  subrow_freq=100,
                  n_stop=100,
                  retrain=True,
                  log_file=None):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    if log_file is None:
        log_file = '{}.log'.format(model_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=log_file,
                        datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('{}'.format(model_name))
    logging.info(('n_est={}, n_leaf={}, lrate={}, '
                  'n_min={}, subcol={}, subrow={}, '
                  'subrow_freq={}, n_stop={}').format(n_est, n_leaf, lrate,
                                                      n_min, subcol, subrow,
                                                      subrow_freq, n_stop))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        watchlist = [(X[i_val], y[i_val])]

        if i == 1:
            logging.info('Training with early stopping')
            clf = lgb.LGBMRegressor(n_estimators=n_est,
                                    num_leaves=n_leaf,
                                    learning_rate=lrate,
                                    min_child_samples=n_min,
                                    subsample=subrow,
                                    subsample_freq=subrow_freq,
                                    colsample_bytree=subcol,
                                    objective='regression',
                                    n_jobs=N_JOB,
                                    random_state=SEED)
            clf = clf.fit(X[i_trn],
                          y[i_trn],
                          eval_set=watchlist,
                          eval_metric='rmse',
                          early_stopping_rounds=n_stop,
                          verbose=10)
            n_best = clf.best_iteration_
            logging.info('best iteration={}'.format(n_best))
        else:
            clf = lgb.LGBMRegressor(n_estimators=n_best,
                                    num_leaves=n_leaf,
                                    learning_rate=lrate,
                                    min_child_samples=n_min,
                                    subsample=subrow,
                                    subsample_freq=subrow_freq,
                                    colsample_bytree=subcol,
                                    objective='regression',
                                    n_jobs=N_JOB,
                                    random_state=SEED)
            clf = clf.fit(X[i_trn],
                          y[i_trn],
                          eval_set=watchlist,
                          eval_metric='rmse',
                          verbose=10)

        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = lgb.LGBMRegressor(n_estimators=n_best,
                                num_leaves=n_leaf,
                                learning_rate=lrate,
                                min_child_samples=n_min,
                                subsample=subrow,
                                subsample_freq=subrow_freq,
                                colsample_bytree=subcol,
                                objective='regression',
                                n_jobs=N_JOB,
                                random_state=SEED)

        clf = clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
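A compatibility note: the early_stopping_rounds and verbose keyword arguments used in the fit calls above were removed from LGBMRegressor.fit in LightGBM 4.x. On recent versions the same behavior is expressed with callbacks; a sketch of the equivalent call, reusing the names from the snippet above:

import lightgbm as lgb

clf = clf.fit(X[i_trn],
              y[i_trn],
              eval_set=watchlist,
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(n_stop),   # stop after n_stop stale rounds
                         lgb.log_evaluation(10)])      # log metrics every 10 rounds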
Example #8
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  n_leaf=200,
                  lrate=.1,
                  n_min=8,
                  subcol=.3,
                  subrow=.8,
                  subrow_freq=100,
                  n_stop=100,
                  log_file=None):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    if log_file is None:
        log_file = '{}.log'.format(model_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=log_file,
                        datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('{}'.format(model_name))
    logging.info(('n_est={}, n_leaf={}, lrate={}, '
                  'n_min={}, subcol={}, subrow={}, '
                  'subrow_freq={}, n_stop={}').format(n_est, n_leaf, lrate,
                                                      n_min, subcol, subrow,
                                                      subrow_freq, n_stop))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    # Heuristic: treat columns whose first value is integral as categorical.
    cat_cols = [i for i in range(X.shape[1]) if int(X[0, i]) == X[0, i]]

    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 21,
        'num_leaves': n_leaf,
        'learning_rate': lrate,
        'feature_fraction': subcol,
        'bagging_fraction': subrow,
        'bagging_freq': subrow_freq,
        'min_data_in_leaf': n_min,
        'metric_freq': 10,
        'is_training_metric': True,
        'verbose': 0,
        'num_threads': N_JOB
    }

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    P_tst = np.zeros((X_tst.shape[0], 21))
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        lgb_trn = lgb.Dataset(X[i_trn], y[i_trn])
        lgb_val = lgb.Dataset(X[i_val], y[i_val])

        if i == 1:
            logging.info('Training with early stopping')
            clf = lgb.train(params,
                            lgb_trn,
                            num_boost_round=n_est,
                            early_stopping_rounds=n_stop,
                            valid_sets=lgb_val,
                            categorical_feature=cat_cols)

            n_best = clf.best_iteration
            logging.info('best iteration={}'.format(n_best))
        else:
            clf = lgb.train(params,
                            lgb_trn,
                            num_boost_round=n_best,
                            valid_sets=lgb_val,
                            categorical_feature=cat_cols)

        p_val[i_val] = np.argmax(clf.predict(X[i_val]), axis=1)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        P_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file,
               np.argmax(P_tst, axis=1),
               fmt='%.6f',
               delimiter=',')
Example #9
def train_predict(train_file,
                  test_file,
                  feature_map_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_importance_file,
                  n_est=100,
                  n_leaf=200,
                  lrate=.1,
                  n_min=8,
                  subcol=.3,
                  subrow=.8,
                  subrow_freq=100,
                  n_stop=100,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info('{}'.format(model_name))
    logging.info(('n_est={}, n_leaf={}, lrate={}, '
                  'n_min={}, subcol={}, subrow={}, '
                  'subrow_freq={}, n_stop={}').format(n_est, n_leaf, lrate,
                                                      n_min, subcol, subrow,
                                                      subrow_freq, n_stop))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': n_leaf,
        'learning_rate': lrate,
        'feature_fraction': subcol,
        'bagging_fraction': subrow,
        'bagging_freq': subrow_freq,
        'min_data_in_leaf': n_min,
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'data_random_seed': SEED,
        'metric': 'rmse',
        'verbose': 0,
        'num_threads': N_JOB
    }

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    feature_name, feature_ext = os.path.splitext(train_file)
    feature_name = os.path.splitext(feature_name)[0]

    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))
        cv_train_file = '{}.trn{}{}'.format(feature_name, i, feature_ext)
        cv_test_file = '{}.tst{}{}'.format(feature_name, i, feature_ext)

        if os.path.exists(cv_train_file):
            is_cv_feature = True
            X_cv, _ = load_data(cv_train_file)
            X_tst_cv, _ = load_data(cv_test_file)

            lgb_trn = lgb.Dataset(np.hstack((X[i_trn], X_cv[i_trn])), y[i_trn])
            lgb_val = lgb.Dataset(np.hstack((X[i_val], X_cv[i_val])), y[i_val])
        else:
            is_cv_feature = False
            lgb_trn = lgb.Dataset(X[i_trn], y[i_trn])
            lgb_val = lgb.Dataset(X[i_val], y[i_val])

        if i == 1:
            logging.info('Training with early stopping')
            clf = lgb.train(params,
                            lgb_trn,
                            num_boost_round=n_est,
                            early_stopping_rounds=n_stop,
                            valid_sets=lgb_val,
                            verbose_eval=100)

            n_best = clf.best_iteration
            logging.info('best iteration={}'.format(n_best))

            df = pd.read_csv(feature_map_file,
                             sep='\t',
                             names=['id', 'name', 'type'])
            df['gain'] = clf.feature_importance(importance_type='gain',
                                                iteration=n_best)
            df.loc[:, 'gain'] = df.gain / df.gain.sum()
            df.sort_values('gain', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(
                feature_importance_file))
        else:
            clf = lgb.train(params,
                            lgb_trn,
                            num_boost_round=n_best,
                            valid_sets=lgb_val,
                            verbose_eval=100)

        if is_cv_feature:
            p_val[i_val] = clf.predict(np.hstack((X[i_val], X_cv[i_val])))
        else:
            p_val[i_val] = clf.predict(X[i_val])

        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if not retrain:
            if is_cv_feature:
                p_tst += clf.predict(np.hstack((X_tst, X_tst_cv))) / N_FOLD
            else:
                p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        lgb_trn = lgb.Dataset(X, y)
        clf = lgb.train(params,
                        lgb_trn,
                        num_boost_round=n_best,
                        verbose_eval=100)

        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
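In the pipelines these snippets come from, each train_predict is typically wrapped as a command-line script so a Makefile can drive the whole ensemble. A minimal sketch of such a wrapper for the first variant; the flag names are assumptions:

import argparse
import logging

if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', required=True)
    parser.add_argument('--test-file', required=True)
    parser.add_argument('--predict-valid-file', required=True)
    parser.add_argument('--predict-test-file', required=True)
    parser.add_argument('--n-est', type=int, default=100)
    parser.add_argument('--depth', type=int, default=4)
    args = parser.parse_args()

    train_predict(train_file=args.train_file,
                  test_file=args.test_file,
                  predict_valid_file=args.predict_valid_file,
                  predict_test_file=args.predict_test_file,
                  n_est=args.n_est,
                  depth=args.depth)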