Example #1
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_stop=100,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    model = AutoLGB(objective='binary', metric='auc', n_random_col=0)
    model.tune(pd.DataFrame(X), pd.Series(y))
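    # reuse the tuned hyperparameters and best boosting-round count in every CV fold below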

    params = model.params
    n_est = model.n_best

    logging.info(f'params: {params}')
    logging.info(f'n_best: {n_est}')

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        trn_lgb = lgb.Dataset(X[i_trn], label=y[i_trn])
        val_lgb = lgb.Dataset(X[i_val], label=y[i_val])

        logging.info('Training with early stopping')
        clf = lgb.train(params,
                        trn_lgb,
                        n_est,
                        val_lgb,
                        early_stopping_rounds=n_stop,
                        verbose_eval=100)
        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #2
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(n_est,
                                                          depth,
                                                          feature_name))


    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)
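    # NOTE: pre-0.18 scikit-learn API; newer versions use StratifiedKFold(n_splits=...).split(X, y)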

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #3
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  C, n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG, filename='lr_{}_{}.log'.format(
                                                        C, feature_name
                                                       ))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = SVC(C=C, class_weight='auto', random_state=2015, probability=True)
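    # despite the 'lr_' log name this trains an SVM; probability=True enables
    # predict_proba via Platt scaling at the cost of an extra internal CV fit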

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training CV #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.4f}'.format(AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.4f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #4
def train_predict(train_file,
                  test_file,
                  feature_map_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_imp_file,
                  n_est=100,
                  subrow=.5,
                  n_min=1):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=f'{model_name}.log')

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    with open(feature_map_file) as f:
        feature_name = [x.strip() for x in f.readlines()]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros((X.shape[0], N_CLASS))
    p_tst = np.zeros((X_tst.shape[0], N_CLASS))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info(f'Training model #{i}')
        clf = RandomForestClassifier(n_estimators=n_est,
                                     min_samples_leaf=n_min,
                                     max_features='sqrt',
                                     max_samples=subrow,
                                     random_state=SEED,
                                     n_jobs=-1)
        clf.fit(X[i_trn], y[i_trn])
        p[i_val, :] = clf.predict_proba(X[i_val])
        p_tst += clf.predict_proba(X_tst) / N_FOLD
        logging.info(
            f'CV #{i}: {accuracy_score(y[i_val], np.argmax(p[i_val], axis=1)) * 100:.4f}%'
        )

    imp = pd.DataFrame({
        'feature': feature_name,
        'importance': clf.feature_importances_
    })
    imp = imp.sort_values('importance').set_index('feature')
    imp.to_csv(feature_imp_file)

    logging.info(f'CV: {accuracy_score(y, np.argmax(p, axis=1)) * 100:.4f}%')
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #5
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, encoder_name, encoder_dim, lrate,
                     dropout, model_file, feature_map_file, n_est, n_stop,
                     batch_size):
    logging.info('loading base feature files')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    n_trn = X.shape[0]

    logging.info('combining training and test features')
    X = sparse.vstack((X, X_tst))
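    # the autoencoder is unsupervised, so train and test rows can be stacked and encoded together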

    autoencoder, encoder = get_model(model_name=encoder_name,
                                     input_dim=X.shape[1],
                                     encoder_dim=encoder_dim,
                                     learning_rate=lrate,
                                     dropout=dropout)
    logging.info('training an autoencoder')
    autoencoder.summary(print_fn=logging.info)

    i_trn, i_val = train_test_split(np.arange(X.shape[0]),
                                    test_size=.2,
                                    random_state=SEED,
                                    shuffle=True)

    es = EarlyStopping(monitor='val_loss', patience=n_stop)
    mcp = ModelCheckpoint(model_file,
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False)
    h = autoencoder.fit_generator(
        generator(X[i_trn], X[i_trn], batch_size),
        steps_per_epoch=int(np.ceil(len(i_trn) / batch_size)),
        epochs=n_est,
        validation_data=generator(X[i_val], X[i_val], batch_size),
        validation_steps=int(np.ceil(len(i_val) / batch_size)),
        callbacks=[es, mcp])

    val_losses = h.history['val_loss']
    n_best = val_losses.index(min(val_losses)) + 1
    autoencoder.load_weights(model_file)
    logging.info('best epoch={}'.format(n_best))

    with open(feature_map_file, 'w') as f:
        for i in range(encoder_dim):
            f.write('{}\t{}\tq\n'.format(i, i))

    logging.info('saving features')
    P = encoder.predict_generator(generator(X[:n_trn], None, batch_size),
                                  steps=int(np.ceil(n_trn / batch_size)))
    save_data(sparse.csr_matrix(P), y, train_feature_file)

    P = encoder.predict_generator(generator(X[n_trn:], None, batch_size),
                                  steps=int(
                                      np.ceil(
                                          (X.shape[0] - n_trn) / batch_size)))
    save_data(sparse.csr_matrix(P), None, test_feature_file)
Example #6
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, lrate=.1, l1=.0, l2=.0, n_fold=5):

    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-4]
    algo_name = 'xgl_{}_{}_{}_{}'.format(n_est, lrate, l1, l2)
    model_name = '{}_{}'.format(algo_name, feature_name)
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    param = {'eta': lrate,
             'objective': 'binary:logistic',
             'colsample_bytree': .7,
             'subsample': .5,
             'eval_metric': 'auc',
             'seed': 2015,
             'booster': 'gblinear',
             'alpha': l1,
             'lambda': l2}
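    # booster='gblinear' fits a regularized linear model; alpha/lambda are its L1/L2 penalties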

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        dtrain = xgb.DMatrix(X[i_trn], label=y[i_trn])
        dvalid = xgb.DMatrix(X[i_val], label=y[i_val])
        watchlist = [(dvalid, 'eval'), (dtrain, 'train')]

        clf = xgb.train(param, dtrain, n_est, watchlist)

        p_val[i_val] = clf.predict(dvalid)
        logging.info('AUC TRN = {:.6f}'.format(AUC(y[i_trn], clf.predict(dtrain))))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    dtrain = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(X_tst)
    watchlist = [(dtrain, 'train')]

    clf = xgb.train(param, dtrain, n_est, watchlist)
    p_tst = clf.predict(dtest)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #7
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, batch_size=1024, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0],))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
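        # Keras 1.x generator API (nb_epoch/samples_per_epoch); Keras 2 renamed
        # these to epochs/steps_per_epoch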
        clf.fit_generator(generator=batch_generator(X[i_trn],
                                                    y[i_trn],
                                                    batch_size,
                                                    True),
                          nb_epoch=n_est,
                          samples_per_epoch=X[i_trn].shape[0],
                          verbose=0)

        p[i_val] = clf.predict_generator(generator=batch_generatorp(X[i_val], batch_size, False),
                                         val_samples=X[i_val].shape[0])[:, 0]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                           val_samples=X_tst.shape[0])[:, 0] / N_FOLD

    logging.info('Saving validation predictions...')
    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X, y, batch_size, True),
                          nb_epoch=n_est,
                          samples_per_epoch=X.shape[0])
        p_tst = clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                      val_samples=X_tst.shape[0])[:, 0]

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #8
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  n_fold=5,
                  n_bag=50,
                  subrow=.5,
                  subcol=.8):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='xg_bag{}_{}_{}_{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, lrate, subrow, subcol,
                            feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(max_depth=depth,
                           learning_rate=lrate,
                           n_estimators=n_est,
                           colsample_bytree=.8,
                           subsample=.5,
                           nthread=4)

    clf = BG(xg, n_estimators=n_bag, max_samples=subrow, max_features=subcol)
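    # bag n_bag XGBoost models, each fit on a subrow fraction of rows and subcol of columns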
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn],
                clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #9
def train_predict(train_file, test_file, predict_valid_file,
                  predict_test_file, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        glm = linear_model.LogisticRegression(solver='lbfgs',
                                              max_iter=2020,
                                              fit_intercept=True,
                                              penalty='none',
                                              verbose=0)
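        # penalty='none' fits an unpenalized model; scikit-learn >= 1.2 spells this penalty=None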
        glm.fit(X[i_trn], y[i_trn])
        p[i_val] = glm.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += glm.predict_proba(X_tst)[:,1] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        
        glm = linear_model.LogisticRegression(random_state=1,
                                              solver='lbfgs',
                                              max_iter=2020,
                                              fit_intercept=True,
                                              penalty='none',
                                              verbose=0)
        glm = glm.fit(X, y)
        p_tst = glm.predict_proba(X_tst)[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #10
def calculate_distance(vec_files):
    print(vec_files)
    q1_vec, _ = load_data(vec_files[0])
    q2_vec, _ = load_data(vec_files[1])

    distances = []

    # ['euclidean', 'cosine', 'l2', 'l1', 'cityblock', 'manhattan']
    for d in sklearn.metrics.pairwise.PAIRED_DISTANCES.keys():
        distances.append(
            sklearn.metrics.pairwise.paired_distances(q1_vec, q2_vec,
                                                      metric=d))

    return np.transpose(np.vstack(distances))
Example #11
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  cv_id_file,
                  n_est,
                  depth,
                  n_fold=5,
                  retrain=False):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(
                            n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    clf = RF(n_estimators=n_est, max_depth=depth, n_jobs=20, random_state=2016)

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)
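    # cv_id holds a fold label (1..n_fold) for each training row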

    logging.info('Cross validation...')
    P_val = np.zeros(X.shape[0])
    P_tst = np.zeros(X_tst.shape[0])
    for i in range(1, n_fold + 1):
        logging.info("cv %d" % i)
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]
        logging.debug('train: {}'.format(X[i_trn].shape))
        logging.debug('valid: {}'.format(X[i_val].shape))
        logging.debug(len(set(y[i_trn])))

        clf.fit(X[i_trn], y[i_trn])
        P_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        if not retrain:
            P_tst += clf.predict_proba(X_tst)[:, 1] / n_fold

    if retrain:
        logging.info('Retraining with 100% data...')
        clf.fit(X, y)
        P_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
Example #12
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est,
                  depth,
                  n_fold=5,
                  n_bag=50):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='et_bag{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    et = ET(n_estimators=n_est,
            max_depth=depth,
            random_state=2015,
            class_weight='auto',
            bootstrap=True)

    clf = BG(et, n_estimators=n_bag, max_samples=.8, max_features=.9)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn],
                clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #13
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_iter=100, dim=4, lrate=.1, n_fold=5):

    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'libfm_{}_{}_{}'.format(n_iter, dim, lrate)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG, filename='{}.log'.format(model_name))

    logging.info('Loading training data')
    X, y = load_data(train_file)
    n_tst = sum(1 for line in open(test_file))

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        valid_train_file = os.path.join(dir_feature, '{}.trn{}.sps'.format(feature_name, i))
        valid_test_file = os.path.join(dir_feature, '{}.val{}.sps'.format(feature_name, i))
        valid_predict_file = os.path.join(dir_val, '{}.val{}.yht'.format(model_name, i))

        # if there is no CV training or validation file, then generate them
        # first.
        if (not os.path.isfile(valid_train_file) or not os.path.isfile(valid_test_file)):
            dump_svmlight_file(X[i_trn], y[i_trn], valid_train_file,
                               zero_based=False)
            dump_svmlight_file(X[i_val], y[i_val], valid_test_file,
                               zero_based=False)

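        # shell out to the libFM binary; -dim '1,1,k' = global bias, one-way terms, and
        # k pairwise factors (note that lrate is passed as -init_stdev here)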
        subprocess.call(["libFM",
                         "-task", "c",
                         '-dim', '1,1,{}'.format(dim),
                         '-init_stdev', str(lrate),
                         '-iter', str(n_iter),
                         '-train', valid_train_file,
                         '-test', valid_test_file,
                         '-out', valid_predict_file])

        p_val[i_val] = np.loadtxt(valid_predict_file)
        os.remove(valid_predict_file)

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    subprocess.call(["libFM",
                     "-task", "c",
                     '-dim', '1,1,{}'.format(dim),
                     '-init_stdev', str(lrate),
                     '-iter', str(n_iter),
                     '-train', train_file,
                     '-test', test_file,
                     '-out', predict_test_file])
Example #14
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est,
                  depth,
                  retrain=True):

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
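    # KFold ignores labels, so cv.split(y) below yields the same index pairs as cv.split(X)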

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])

    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))

        clf = ExtraTreesRegressor(n_estimators=n_est,
                                  max_depth=depth,
                                  random_state=SEED)
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = ExtraTreesRegressor(n_estimators=n_est,
                                  max_depth=depth,
                                  random_state=SEED)
        clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #15
def train_predict(train_file, test_file, feature_map_file, predict_valid_file, predict_test_file,
                  feature_importance_file, retrain=True):

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])

    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        logging.info('Training model #{}'.format(i))

        clf = LinearRegression()
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

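        # on the first fold only, save the linear coefficients as a rough feature-importance table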
        if i == 1:
            df = pd.read_csv(feature_map_file, sep='\t', names=['id', 'name', 'type'])
            df['coef'] = clf.coef_
            df.sort_values('coef', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(feature_importance_file))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = LinearRegression()
        clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #16
def merge_sub_features(train_file, test_file, train_sub_features,
                       test_sub_features, train_feature_file,
                       test_feature_file, lowest):

    trn_subfeat = []
    tst_subfeat = []

    for f_trn, f_tst in zip([x for x in train_sub_features.split(' ') if x],
                            [x for x in test_sub_features.split(' ') if x]):
        logging.info('Reading trn {0} tst {1}'.format(f_trn, f_tst))

        X_sub_trn, _ = load_data(f_trn)
        X_sub_tst, _ = load_data(f_tst)

        if not ssp.issparse(X_sub_trn):
            X_sub_trn = ssp.csr_matrix(X_sub_trn)
            X_sub_tst = ssp.csr_matrix(X_sub_tst)

        trn_subfeat.append(X_sub_trn)
        tst_subfeat.append(X_sub_tst)

        logging.info('Size trn {0} tst {1}'.format(X_sub_trn.shape,
                                                   X_sub_tst.shape))

    df_train = pd.read_csv(train_file)
    y_train = df_train[TARGET].values

    logging.info('Merge sub features')
    X_trn = ssp.hstack(trn_subfeat).tocsr()
    X_tst = ssp.hstack(tst_subfeat).tocsr()
    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    drop = feature_selection.DropInactive(lowest)
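    # DropInactive is a project-local selector; judging by its use here, it drops
    # columns whose activity falls below the `lowest` threshold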

    drop.fit(X_trn)
    X_trn = drop.transform(X_trn)
    X_tst = drop.transform(X_tst)

    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    logging.info('saving features')
    save_data(X_trn, y_train, train_feature_file)
    save_data(X_tst, None, test_feature_file)
Example #17
def train_predict(train_file, test_file, predict_valid_file, predict_test_file):
    feature_name = os.path.basename(train_file)[:-4]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='avg_{}.log'.format(feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

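    # simple averaging ensemble: each input column is one base model's prediction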
    P_val = X.mean(axis=1)
    P_tst = X_tst.mean(axis=1)

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
Example #18
def train_predict(train_file, test_file, predict_test_file,
                  cid_train_file, cid_test_file, n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='xg_cid_grid_{}.log'.format(feature_name))

    logging.info('Loading course IDs for training and test data')
    cid_trn = np.loadtxt(cid_train_file, dtype=int)
    cid_tst = np.loadtxt(cid_test_file, dtype=int)
    
    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, y_tst = load_data(test_file)

    xg = xgb.XGBClassifier(subsample=0.4, colsample_bytree=.4, nthread=6)
    param = {'learning_rate': [0.005, .01, .02], 'max_depth': [4, 5, 6],
             'n_estimators': [200, 400, 800, 1000]}

    p_tst = np.zeros_like(y_tst)
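    # fit one grid-searched model per course ID (0..38)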
    for j in range(39):
        idx_trn = np.where(cid_trn == j)[0]
        idx_tst = np.where(cid_tst == j)[0]

        cv = StratifiedKFold(y[idx_trn], n_folds=n_fold, shuffle=True,
                             random_state=2015)
        clf = GridSearchCV(xg, param, scoring='roc_auc', verbose=1, cv=cv)
        clf.fit(X[idx_trn], y[idx_trn])

        logging.info('CID #{}: {:.4f} {}'.format(j, clf.best_score_,
                                                 clf.best_params_))

        logging.info('Retraining with 100% data...')
        clf.best_estimator_.fit(X[idx_trn], y[idx_trn])
        p_tst[idx_tst] = clf.best_estimator_.predict_proba(X_tst[idx_tst])[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #19
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='xg_grid_{}.log'.format(feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(subsample=0.5, colsample_bytree=0.8, nthread=4)
    param = {
        'learning_rate': [.005, .01, .02],
        'max_depth': [4, 5, 6],
        'n_estimators': [100, 200, 400]
    }
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)
    clf = GridSearchCV(xg, param, scoring='roc_auc', verbose=1, cv=cv)

    logging.info('Cross validation for grid search...')
    clf.fit(X, y)
    p = clf.predict_proba(X)[:, 1]
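    # NOTE: these are in-sample predictions from the refit best model, not out-of-fold estimates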

    logging.info('best model = {}'.format(clf.best_estimator_))
    logging.info('best score = {:.6f}'.format(clf.best_score_))

    logging.info('Retraining with 100% data...')
    clf.best_estimator_.fit(X, y)
    p_tst = clf.best_estimator_.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #20
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cv_id_file,
                  n_est, depth, n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='et_{}_{}_{}.log'.format(n_est,
                                                          depth,
                                                          feature_name))


    logging.info('Loading training and test data...')
    X, y = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    clf = ET(n_estimators=n_est, max_depth=depth, random_state=SEED)

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    logging.info('Cross validation...')
    P_val = np.zeros((X.shape[0], N_CLASS))
    for i in range(1, n_fold + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]

        clf.fit(X[i_trn], y[i_trn])
        P_val[i_val] = clf.predict_proba(X[i_val])

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    P_tst = clf.predict_proba(X_tst)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
Example #21
def train_predict(train_file,
                  test_file,
                  feature_map_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_imp_file,
                  num_threads,
                  metric,
                  learning_rate,
                  boosting,
                  objective,
                  verbosity,
                  num_boost_round,
                  early_stopping_rounds,
                  verbose_eval,
                  device_type=None,
                  gpu_use_dp=None):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=f'{model_name}.log')

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    with open(feature_map_file) as f:
        feature_name = [x.strip() for x in f.readlines()]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

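    # fall back to CPU (single precision) when no device options are given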
    if device_type is None or gpu_use_dp is None:
        device_type = 'cpu'
        gpu_use_dp = False

    lgbm_params = {
        'num_threads': num_threads,
        'metric': metric,
        'learning_rate': learning_rate,
        'boosting': boosting,
        'objective': objective,
        'num_class': N_CLASS,
        'random_state': SEED,
        'device_type': device_type,
        'gpu_use_dp': gpu_use_dp,
        'verbosity': verbosity,
    }

    oof_pred = np.zeros((X.shape[0], N_CLASS))
    test_pred = np.zeros((X_tst.shape[0], N_CLASS))
    n_bests = []

    for fold, (trn_idx, val_idx) in enumerate(cv.split(X, y), 1):
        logging.info(f'Training model #{fold}')

        X_trn, X_val = X[trn_idx], X[val_idx]
        y_trn, y_val = y[trn_idx], y[val_idx]

        dtrn = lgbm.Dataset(X_trn, label=y_trn)
        dval = lgbm.Dataset(X_val, label=y_val)

        logging.info('Training with early stopping')
        lgbm_clf = lgbm.train(params=lgbm_params,
                              train_set=dtrn,
                              num_boost_round=num_boost_round,
                              valid_sets=[dtrn, dval],
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose_eval)

        n_best = lgbm_clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        test_pred += lgbm_clf.predict(X_tst) / N_FOLD
        oof_pred[val_idx] += lgbm_clf.predict(X_val)
        logging.info(
            f'CV #{fold}: {accuracy_score(y_val, np.argmax(oof_pred[val_idx], axis=1)) * 100:.4f}%'
        )

    imp = pd.DataFrame({
        'feature': feature_name,
        'importance': lgbm_clf.feature_importance(importance_type='gain',
                                                  iteration=n_best)
    })
    imp = imp.sort_values('importance').set_index('feature')
    imp.to_csv(feature_imp_file)

    logging.info(
        f'CV: {accuracy_score(y, np.argmax(oof_pred, axis=1)) * 100:.4f}%')
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, oof_pred, fmt='%.18f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, test_pred, fmt='%.18f', delimiter=',')
Example #22
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  cv_id_file,
                  n_est=100,
                  hiddens=2,
                  neurons=512,
                  dropout=0.5,
                  batch=16,
                  n_stop=2,
                  retrain=True,
                  n_fold=5):

    feature_name = os.path.basename(train_file).split('.')[0]
    model_name = 'keras_{}_{}_{}_{}_{}_{}_{}'.format(n_est, hiddens, neurons,
                                                     dropout, batch, n_stop,
                                                     feature_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file, dense=True)
    Y = np_utils.to_categorical(y)
    X_tst, _ = load_data(test_file, dense=True)

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_tst = scaler.transform(X_tst)

    nb_classes = Y.shape[1]
    dims = X.shape[1]
    logging.info('{} classes, {} dims'.format(nb_classes, dims))

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    P_val = np.zeros((Y.shape[0], ))
    P_tst = np.zeros((X_tst.shape[0], ))
    for i in range(1, n_fold + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]
        logging.info('Training model #{}'.format(i))
        clf = get_model(nb_classes, dims, hiddens, neurons, dropout)
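        # fold 1 finds the best epoch count via early stopping; later folds reuse it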
        if i == 1:
            early_stopping = EarlyStopping(monitor='val_loss', patience=n_stop)
            h = clf.fit(X[i_trn],
                        Y[i_trn],
                        validation_data=(X[i_val], Y[i_val]),
                        nb_epoch=n_est,
                        batch_size=batch,
                        callbacks=[early_stopping])

            val_losses = h.history['val_loss']
            n_best = val_losses.index(min(val_losses)) + 1
            logging.info('best epoch={}'.format(n_best))
        else:
            clf.fit(X[i_trn],
                    Y[i_trn],
                    validation_data=(X[i_val], Y[i_val]),
                    nb_epoch=n_best,
                    batch_size=batch)

        P_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{} Log Loss: {:.6f}'.format(
            i, log_loss(Y[i_val], P_val[i_val])))

        if not retrain:
            P_tst += clf.predict_proba(X_tst)[:, 1] / n_fold

    logging.info('CV Log Loss: {:.6f}'.format(log_loss(y, P_val)))
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = get_model(nb_classes, dims, hiddens, neurons, dropout)
        clf.fit(X, Y, nb_epoch=n_best, batch_size=batch)
        P_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
Example #23
def train_predict(train_file,
                  test_file,
                  feature_map_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_imp_file,
                  n_est=100,
                  n_leaf=200,
                  lrate=.1,
                  n_min=8,
                  subcol=.3,
                  subrow=.8,
                  subrow_freq=100,
                  n_stop=100):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=f'{model_name}.log')

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    with open(feature_map_file) as f:
        feature_name = [x.strip() for x in f.readlines()]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    params = {
        'random_state': SEED,
        'num_classes': N_CLASS,
        'n_jobs': -1,
        'objective': 'multiclass',
        'learning_rate': lrate,
        'num_leaves': n_leaf,
        'feature_fraction': subcol,
        'bagging_fraction': subrow,
        'bagging_freq': subrow_freq,
        'min_child_samples': n_min
    }

    p = np.zeros((X.shape[0], N_CLASS))
    p_tst = np.zeros((X_tst.shape[0], N_CLASS))
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info(f'Training model #{i}')
        trn_lgb = lgb.Dataset(X[i_trn], label=y[i_trn])
        val_lgb = lgb.Dataset(X[i_val], label=y[i_val])

        logging.info('Training with early stopping')
        clf = lgb.train(params,
                        trn_lgb,
                        n_est,
                        val_lgb,
                        early_stopping_rounds=n_stop,
                        verbose_eval=100)
        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p[i_val, :] = clf.predict(X[i_val])
        p_tst += clf.predict(X_tst) / N_FOLD
        logging.info(
            f'CV #{i}: {accuracy_score(y[i_val], np.argmax(p[i_val], axis=1)) * 100:.4f}%'
        )

    imp = pd.DataFrame({
        'feature': feature_name,
        'importance': clf.feature_importance(importance_type='gain',
                                             iteration=n_best)
    })
    imp = imp.sort_values('importance').set_index('feature')
    imp.to_csv(feature_imp_file)

    logging.info(f'CV: {accuracy_score(y, np.argmax(p, axis=1)) * 100:.4f}%')
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #24
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cid_train_file, cid_test_file):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='xg_cid_{}.log'.format(feature_name))

    logging.info('Loading course IDs for training and test data')
    cid_trn = np.loadtxt(cid_train_file, dtype=int)
    cid_tst = np.loadtxt(cid_test_file, dtype=int)
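    # PARAM appears to be a module-level lookup of per-course
    # (n_estimators, learning_rate, max_depth) tuples, judging by its use below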

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, y_tst = load_data(test_file)

    cv = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=2015)

    p = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        X_trn = X[i_trn]
        y_trn = y[i_trn]
        X_val = X[i_val]
        y_val = y[i_val]
        cid_valtrn = cid_trn[i_trn]
        cid_valtst = cid_trn[i_val]

        p_trn = np.zeros_like(y_trn)
        p_val = np.zeros_like(y_val)
        for j in range(39):
            idx_trn = np.where(cid_valtrn == j)[0]
            idx_val = np.where(cid_valtst == j)[0]

            clf = xgb.XGBClassifier(max_depth=PARAM[j][2],
                                    learning_rate=PARAM[j][1],
                                    n_estimators=PARAM[j][0],
                                    colsample_bytree=1,
                                    subsample=.4,
                                    nthread=6)

            clf.fit(X_trn[idx_trn], y_trn[idx_trn])
            p_trn[idx_trn] = clf.predict_proba(X_trn[idx_trn])[:, 1]
            p_val[idx_val] = clf.predict_proba(X_val[idx_val])[:, 1]
            logging.info('CID #{}: {:.4f}, {:.4f}'.format(
                j, AUC(y_trn[idx_trn], p_trn[idx_trn]),
                AUC(y_val[idx_val], p_val[idx_val])))

        logging.info('AUC TRN = {:.6f}'.format(AUC(y_trn, p_trn)))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y_val, p_val)))
        p[i_val] = p_val

    logging.info('AUC = {:.6f}'.format(AUC(y, p)))
    logging.info('Saving CV predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    p_tst = np.zeros_like(y_tst)
    n_tst = len(p_tst)
    for j in range(39):
        idx_trn = np.where(cid_trn == j)[0]
        idx_tst = np.where(cid_tst == j)[0]
        logging.info('CID #{}: {:.2f}%'.format(j, len(idx_tst) / n_tst * 100))
        clf = xgb.XGBClassifier(max_depth=PARAM[j][2],
                                learning_rate=PARAM[j][1],
                                n_estimators=PARAM[j][0],
                                colsample_bytree=1,
                                subsample=.4,
                                nthread=6)
        clf.fit(X[idx_trn], y[idx_trn])
        p_tst[idx_tst] = clf.predict_proba(X_tst[idx_tst])[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #25
def train_predict_lr_forward(train_file,
                             test_file,
                             predict_valid_file,
                             predict_test_file,
                             C,
                             n_fold=5):

    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'lr_forward_{}'.format(C)
    model_name = '{}_{}'.format(algo_name, feature_name)
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info("Loading training and test data...")
    X_trn, y_trn = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    logging.info('Normalizing data')
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_tst = scaler.transform(X_tst)

    cv = StratifiedKFold(y_trn,
                         n_folds=n_fold,
                         shuffle=True,
                         random_state=2015)

    selected_features = []
    features_to_test = [
        x for x in range(X_trn.shape[1]) if x not in selected_features
    ]

    auc_cv_old = .5
    is_improving = True
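    # greedy forward selection: keep adding whichever feature most improves CV AUC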
    while is_improving:
        auc_cvs = []
        for feature in features_to_test:
            logging.info('{}'.format(selected_features + [feature]))
            X = X_trn[:, selected_features + [feature]]

            p_val = np.zeros_like(y_trn)
            for i, (i_trn, i_val) in enumerate(cv, start=1):
                clf = LR(C=C, class_weight='auto', random_state=2014)
                clf.fit(X[i_trn], y_trn[i_trn])
                p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

            auc_cv = AUC(y_trn, p_val)
            logging.info('AUC CV: {:.6f}'.format(auc_cv))
            auc_cvs.append(auc_cv)

        auc_cv_new = max(auc_cvs)
        if auc_cv_new > auc_cv_old:
            auc_cv_old = auc_cv_new
            feature = features_to_test.pop(auc_cvs.index(auc_cv_new))
            selected_features.append(feature)
            logging.info('selected features: {}'.format(selected_features))
        else:
            is_improving = False
            logging.info(
                'final selected features: {}'.format(selected_features))

    logging.info('saving selected features as a file')
    with open('{}_selected.txt'.format(model_name), 'w') as f:
        f.write('{}\n'.format(selected_features))

    X = X_trn[:, selected_features]
    logging.debug('feature matrix: {}x{}'.format(X.shape[0], X.shape[1]))

    p_val = np.zeros_like(y_trn)
    for i, (i_trn, i_val) in enumerate(cv, start=1):
        logging.info('Training CV #{}'.format(i))
        clf = LR(C=C, class_weight='auto', random_state=2015)
        clf.fit(X[i_trn], y_trn[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    auc_cv = AUC(y_trn, p_val)
    logging.info('AUC CV: {:.6f}'.format(auc_cv))
    logging.info("Writing test predictions to file")
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Retraining with 100% data...')
    clf.fit(X, y_trn)
    p_tst = clf.predict_proba(X_tst[:, selected_features])[:, 1]
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #26
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  n_leaf=200,
                  lrate=.1,
                  n_min=8,
                  subcol=.3,
                  subrow=.8,
                  subrow_freq=100,
                  n_stop=100,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    y = np.log(y + offset)
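    # offset, fairobj, and eval_mae are assumed module-level helpers for the
    # log-transformed target and fair-loss objective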

    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = KFold(len(y), n_folds=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}'.format(i))
        watchlist = [(X[i_val], y[i_val])]

        logging.info('Training with early stopping')
        clf = lgb.LGBMRegressor(n_estimators=n_est,
                                num_leaves=n_leaf,
                                learning_rate=lrate,
                                min_child_samples=n_min,
                                subsample=subrow,
                                subsample_freq=subrow_freq,
                                colsample_bytree=subcol,
                                objective=fairobj,
                                nthread=1,
                                seed=SEED)
        clf = clf.fit(X[i_trn],
                      y[i_trn],
                      eval_set=watchlist,
                      eval_metric=eval_mae,
                      early_stopping_rounds=n_stop,
                      verbose=10)
        n_best = clf.best_iteration_
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p_val[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.4f}'.format(
            i, MAE(np.exp(y[i_val]), np.exp(p_val[i_val]))))

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.4f}'.format(MAE(np.exp(y), np.exp(p_val))))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file,
               np.exp(p_val) - offset,
               fmt='%.6f',
               delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        n_best = sum(n_bests) // N_FOLD
        clf = lgb.LGBMRegressor(n_estimators=n_best,
                                num_leaves=n_leaf,
                                learning_rate=lrate,
                                min_child_samples=n_min,
                                subsample=subrow,
                                subsample_freq=subrow_freq,
                                colsample_bytree=subcol,
                                objective=fairobj,
                                nthread=1,
                                seed=SEED)

        clf = clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file,
               np.exp(p_tst) - offset,
               fmt='%.6f',
               delimiter=',')
Example #27
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  cv_id_file,
                  n_est=100,
                  n_leaf=200,
                  lrate=.1,
                  n_min=8,
                  subcol=.3,
                  subrow=.8,
                  subrow_freq=100,
                  n_stop=100,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    n_fold = 5
    for i in range(1, n_fold + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]
        logging.info('Training model #{}'.format(i))
        logging.debug('train: {}'.format(X[i_trn].shape))
        logging.debug('valid: {}'.format(X[i_val].shape))

        watchlist = [(X[i_val], y[i_val])]

        logging.info('Training with early stopping')
        clf = lgb.LGBMRegressor(n_estimators=n_est,
                                num_leaves=n_leaf,
                                learning_rate=lrate,
                                min_child_samples=n_min,
                                subsample=subrow,
                                subsample_freq=subrow_freq,
                                colsample_bytree=subcol,
                                nthread=20,
                                seed=SEED)
        clf = clf.fit(X[i_trn],
                      y[i_trn],
                      eval_set=watchlist,
                      eval_metric="l2",
                      early_stopping_rounds=n_stop,
                      verbose=10)
        n_best = clf.best_iteration_ if clf.best_iteration_ else n_est
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p_val[i_val] = clf.predict(X[i_val])

        if not retrain:
            p_tst += clf.predict(X_tst) / n_fold

    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        n_best = sum(n_bests) // n_fold
        clf = lgb.LGBMRegressor(n_estimators=n_best,
                                num_leaves=n_leaf,
                                learning_rate=lrate,
                                min_child_samples=n_min,
                                subsample=subrow,
                                subsample_freq=subrow_freq,
                                colsample_bytree=subcol,
                                n_jobs=20,
                                random_state=SEED)

        clf = clf.fit(X, y, verbose=True)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
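A plausible way to drive Example #27 from the command line; the flag names simply mirror the function's keyword arguments and are purely illustrative:

import argparse

if __name__ == '__main__':
    # Hypothetical driver; argument names follow the train_predict signature.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', required=True)
    parser.add_argument('--test-file', required=True)
    parser.add_argument('--predict-valid-file', required=True)
    parser.add_argument('--predict-test-file', required=True)
    parser.add_argument('--cv-id-file', required=True)
    parser.add_argument('--n-est', type=int, default=100)
    parser.add_argument('--n-stop', type=int, default=100)
    args = parser.parse_args()

    train_predict(args.train_file, args.test_file,
                  args.predict_valid_file, args.predict_test_file,
                  args.cv_id_file, n_est=args.n_est, n_stop=args.n_stop)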
Example #28
def train_predict(train_feature_file,
                  test_feature_file,
                  predict_valid_file,
                  predict_test_file,
                  C=1.0,
                  class_weight='balanced',
                  max_iter=1000,
                  solver='lbfgs',
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_feature_file)
    X_tst, _ = load_data(test_feature_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True,
                         random_state=SEED).split(X, y)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}'.format(i))

        logging.info('Training Logistic Regression')
        clf = LogisticRegression(C=C,
                                 class_weight=class_weight,
                                 max_iter=max_iter,
                                 solver=solver,
                                 random_state=SEED)

        clf = clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict_proba(X_tst)[:, 1] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = LogisticRegression(C=C,
                                 class_weight=class_weight,
                                 max_iter=max_iter,
                                 solver=solver,
                                 random_state=SEED)

        clf = clf.fit(X, y)
        p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
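Examples #27 and #28 (like the earlier ones) lean on module-level imports and constants defined elsewhere in the repository. A plausible header, where the exact values are assumptions:

import logging
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as AUC
from sklearn.model_selection import StratifiedKFold

# Assumed module-level constants; the real values live elsewhere in the repo.
N_FOLD = 5
N_JOB = 20
SEED = 2015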
Example #29
def train_predict(train_file,
                  test_file,
                  feature_map_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_importance_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  l2_leaf_reg=1):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.info(('n_est={}, depth={}, lrate={}, '
                  'l2_leaf_reg={}').format(n_est, depth, lrate, l2_leaf_reg))

    logging.info('{}'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    if sparse.issparse(X):
        X = X.toarray()
        X_tst = X_tst.toarray()

    features = pd.read_csv(feature_map_file,
                           sep='\t',
                           header=None,
                           names=['idx', 'name', 'type'])
    cat_cols = features.idx[features.type != 'q'].tolist()

    logging.info('Loading CV Ids')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    feature_name, feature_ext = os.path.splitext(train_file)
    feature_name = os.path.splitext(feature_name)[0]

    for i, (i_trn, i_val) in enumerate(cv.split(X), 1):
        logging.info('Training model #{}'.format(i))
        cv_train_file = '{}.trn{}{}'.format(feature_name, i, feature_ext)
        cv_test_file = '{}.tst{}{}'.format(feature_name, i, feature_ext)

        if os.path.exists(cv_train_file):
            is_cv_feature = True
            X_cv, _ = load_data(cv_train_file)
            X_tst_cv, _ = load_data(cv_test_file)

            X_trn = np.hstack((X[i_trn], X_cv[i_trn]))
            X_val = np.hstack((X[i_val], X_cv[i_val]))
            X_tst_ = np.hstack((X_tst, X_tst_cv))
        else:
            is_cv_feature = False
            X_trn = X[i_trn]
            X_val = X[i_val]
            X_tst_ = X_tst

        if i == 1:
            logging.info('Training with early stopping')
            clf = cbt.CatBoostRegressor(learning_rate=lrate,
                                        depth=depth,
                                        l2_leaf_reg=l2_leaf_reg,
                                        iterations=n_est,
                                        loss_function='RMSE',
                                        random_seed=SEED,
                                        thread_count=N_JOB)

            if len(cat_cols) > 0:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=True,
                              cat_features=cat_cols)
            else:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=True)

            n_best = clf.tree_count_
            logging.info('best iteration={}'.format(n_best))

            df = pd.read_csv(feature_map_file,
                             sep='\t',
                             names=['id', 'name', 'type'])
            df['gain'] = clf.feature_importances_
            df.loc[:, 'gain'] = df.gain / df.gain.sum()
            df.sort_values('gain', ascending=False, inplace=True)
            df.to_csv(feature_importance_file, index=False)
            logging.info('feature importance is saved in {}'.format(
                feature_importance_file))
        else:
            clf = cbt.CatBoostRegressor(learning_rate=lrate,
                                        depth=depth,
                                        l2_leaf_reg=l2_leaf_reg,
                                        iterations=n_best,
                                        loss_function='RMSE',
                                        random_seed=SEED,
                                        thread_count=N_JOB)

            if len(cat_cols) > 0:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=False,
                              cat_features=cat_cols)
            else:
                clf = clf.fit(X_trn,
                              y[i_trn],
                              eval_set=(X_val, y[i_val]),
                              use_best_model=False)

        p_val[i_val] = clf.predict(X_val)
        logging.info('CV #{}: {:.6f}'.format(i, kappa(y[i_val], p_val[i_val])))

        p_tst += clf.predict(X_tst_) / N_FOLD

    logging.info('CV: {:.6f}'.format(kappa(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
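Example #29 scores each fold with a kappa function that is not shown above. A minimal stand-in, assuming the metric is quadratic-weighted Cohen's kappa on integer labels (the rounding step is an assumption for regression output):

import numpy as np
from sklearn.metrics import cohen_kappa_score

def kappa(y_true, y_pred):
    # Quadratic-weighted kappa; regression predictions are rounded to the
    # nearest integer label before scoring.
    return cohen_kappa_score(np.asarray(y_true).astype(int),
                             np.rint(y_pred).astype(int),
                             weights='quadratic')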
Example #30
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_map,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X_trn, y_trn = load_data(train_file)
    X_tst, _ = load_data(test_file)

    feature_map = pd.read_csv(feature_map,
                              sep='\t',
                              index_col=0,
                              header=None,
                              names=['feature_names', 'feature_type'])
    features = feature_map['feature_names'].values
    train_df = pd.DataFrame(data=X_trn.toarray(),
                            columns=feature_map['feature_names'])
    test_df = pd.DataFrame(data=X_tst.toarray(),
                           columns=feature_map['feature_names'])
    train_test = pd.concat([train_df, test_df])

    test_data = [
        test_df.loc[:, features].values[:, k]
        for k in range(test_df.loc[:, features].values.shape[1])
    ]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=50, shuffle=True, random_state=SEED)

    vld_preds = np.zeros_like(y_trn, dtype=np.float64)
    tst_preds = np.zeros((X_tst.shape[0], ))
    for cv_idx, (i_trn, i_vld) in enumerate(cv.split(X_trn, y_trn), 1):

        X_trn_cv = train_df.iloc[i_trn, :].reset_index(drop=True)
        X_vld_cv = train_df.iloc[i_vld, :].reset_index(drop=True)
        y_trn_cv = y_trn[i_trn]
        y_vld_cv = y_trn[i_vld]

        logging.info('Training model #{}'.format(cv_idx))

        clf = create_keras_embedding_model(train_test, features)
        clf.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=[auc])

        X_trn_cv = [
            X_trn_cv.loc[:, features].values[:, k]
            for k in range(X_trn_cv.loc[:, features].values.shape[1])
        ]
        X_vld_cv = [
            X_vld_cv.loc[:, features].values[:, k]
            for k in range(X_vld_cv.loc[:, features].values.shape[1])
        ]

        es = callbacks.EarlyStopping(monitor='val_auc',
                                     min_delta=0.001,
                                     patience=5,
                                     verbose=1,
                                     mode='max',
                                     baseline=None,
                                     restore_best_weights=True)

        rlr = callbacks.ReduceLROnPlateau(monitor='val_auc',
                                          factor=0.5,
                                          patience=3,
                                          min_lr=1e-6,
                                          mode='max',
                                          verbose=1)

        clf.fit(X_trn_cv,
                utils.to_categorical(y_trn_cv),
                validation_data=(X_vld_cv, utils.to_categorical(y_vld_cv)),
                verbose=0,
                batch_size=1024,
                callbacks=[es, rlr],
                epochs=50)

        vld_preds[i_vld] = clf.predict(X_vld_cv)[:, 1]

        logging.info('CV #{}: {:.4f}'.format(
            cv_idx, AUC(y_trn[i_vld], vld_preds[i_vld])))

        if not retrain:
            tst_preds += (clf.predict(test_data)[:, 1] /
                          cv.get_n_splits()).ravel()

    logging.info('Saving validation predictions...')
    logging.info('CV: {:.4f}'.format(AUC(y_trn, vld_preds)))
    np.savetxt(predict_valid_file, vld_preds, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')

        clf = create_keras_embedding_model(train_test, features)
        clf.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=[auc])

        X_trn_all = [
            train_df.loc[:, features].values[:, k]
            for k in range(train_df.loc[:, features].values.shape[1])
        ]

        clf.fit(X_trn_all,
                utils.to_categorical(y_trn),
                validation_data=(X_trn_all, utils.to_categorical(y_trn)),
                verbose=0,
                batch_size=1024,
                callbacks=[es, rlr],
                epochs=50)

        tst_preds = (clf.predict(test_data)[:, 1]).ravel()

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, tst_preds, fmt='%.6f', delimiter=',')
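Example #30 builds its network with create_keras_embedding_model and an auc metric, neither of which appears above. A sketch of what such a model could look like, assuming every feature is a label-encoded categorical column fed as its own scalar input (matching the list-of-columns format the example constructs); the layer sizes, embedding dimension, and auc alias are all assumptions:

from tensorflow.keras import layers, metrics, models

# Assumed alias for the metric passed to compile(); its name makes the
# callbacks' monitor='val_auc' resolve correctly.
auc = metrics.AUC(name='auc')

def create_keras_embedding_model(train_test, features, emb_dim=8):
    # One scalar input per feature; each gets its own embedding, sized from
    # the column's cardinality in the combined train+test frame (which is
    # why the example passes train_test rather than train_df alone).
    inputs, encoded = [], []
    for i, col in enumerate(features):
        n_levels = int(train_test[col].nunique()) + 1
        inp = layers.Input(shape=(1,), name='in_{}'.format(i))
        emb = layers.Embedding(input_dim=n_levels, output_dim=emb_dim)(inp)
        inputs.append(inp)
        encoded.append(layers.Flatten()(emb))
    x = layers.Concatenate()(encoded)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    # Two softmax units to match the to_categorical targets and the
    # `predict(...)[:, 1]` indexing in the example.
    out = layers.Dense(2, activation='softmax')(x)
    return models.Model(inputs=inputs, outputs=out)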