Code example #1
def lgb_gs(set_params, dtrn_X, trn_y, dval_X, val_X, val_y):
    # Grid search over LightGBM parameter combinations, keeping the set
    # with the lowest validation RMSE.
    min_score = float('inf')
    min_params = None
    for params in tqdm(list(ParameterGrid(set_params))):
        logger.debug('params:\n {}'.format(params))
        model = lgb.train(params, dtrn_X,
                          num_boost_round=1000,
                          valid_sets=[dval_X],
                          early_stopping_rounds=100,
                          verbose_eval=50)

        pred = model.predict(val_X,
                             num_iteration=model.best_iteration)
        sc_rmse = rmse(val_y, pred)

        if min_score > sc_rmse:
            min_score = sc_rmse
            min_params = params

        logger.debug('rmse: {}'.format(sc_rmse))
        logger.info('current min rmse: {}'.format(min_score))

    logger.info('')
    logger.info('Top min params:\n {}'.format(min_params))
    logger.info('Top min rmse: {}'.format(min_score))

    return min_params
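
Every example on this page calls an rmse() helper that the excerpts themselves do not define. A minimal sketch consistent with its usage is shown below; the NumPy/scikit-learn implementation is an assumption, not the project's actual code.

import numpy as np
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    # Root mean squared error -- the "rmse" metric that the LightGBM and
    # XGBoost configurations on this page optimize.
    return np.sqrt(mean_squared_error(y_true, y_pred))
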
Code example #2
File: train_06.py  Project: yuyuyu0706/kaggle1
def run_lgb(train_X, train_y):
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        "num_leaves": 128,  # [32, 48, 64, 128]
        "learning_rate": 0.07,  # [0.05, 0.07, 0.1, 0.2]
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_fraction": 0.7,
        "bagging_seed": 2018,
        "verbosity": -1
    }

    trn_X, val_X, trn_y, val_y = train_test_split(train_X,
                                                  train_y,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=0)
    lg_trn = lgb.Dataset(trn_X, label=trn_y)
    lg_val = lgb.Dataset(val_X, label=val_y)

    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)

    model = lgb.train(lg_params,
                      lg_trn,
                      num_boost_round=5000,
                      valid_sets=[lg_val],
                      early_stopping_rounds=100,
                      verbose_eval=50)

    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance
    logger.debug('Feature Importances')
    feat_n = model.feature_name()
    feat_i = list(model.feature_importance())

    df_tmp1 = pd.DataFrame(feat_n, columns=['feat_n'])
    df_tmp2 = pd.DataFrame(feat_i, columns=['feat_i'])
    df_tmp = df_tmp1.join(df_tmp2, how='inner')
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model
Code example #3
File: train_21.py  Project: yuyuyu0706/kaggle1
def run_lgb(trn_X, trn_y, val_X, val_y, tfvocab, cat_vars):
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        # "max_depth": 15,         # [15]
        "num_leaves": 128,      # [256]
        "learning_rate": 0.07,   # [0.018]
        "feature_fraction": 0.7, # [0.5]
        "bagging_freq": 5,
        "bagging_fraction": 0.7, # [0.75]
        "bagging_seed": 2018,
        "verbosity": -1,
        # "verbose": 0
    }
    
    lg_trn = lgb.Dataset(trn_X, label=trn_y, feature_name=tfvocab, categorical_feature=cat_vars)
    lg_val = lgb.Dataset(val_X, label=val_y, feature_name=tfvocab, categorical_feature=cat_vars)
    
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)
   
    # Train Start
    model = lgb.train(lg_params, lg_trn,
                      num_boost_round=16000,
                      valid_sets=[lg_val],
                      early_stopping_rounds=200,
                      verbose_eval=100)
    
    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance
    logger.debug('Feature Importances')
    feat_n = model.feature_name()
    feat_i = list(model.feature_importance())

    df_tmp1 = pd.DataFrame(feat_n, columns=['feat_n'])
    df_tmp2 = pd.DataFrame(feat_i, columns=['feat_i'])
    df_tmp = df_tmp1.join(df_tmp2, how='inner')
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    # Log only the top-50 features (the commented-out variant logged all of
    # them); guard against having fewer than 50 features.
    for i in range(min(50, len(df_tmp.index))):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
                         df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model
Code example #4
File: train_06.py  Project: yuyuyu0706/kaggle1
def run_xgb(train_X, train_y):
    xg_params = {
        "max_depth": 8,  # [4, 6, 8]
        "min_child_weight": 6,  # [4, 6, 8]
        "learning_rate": 0.1,  # [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": 0.8,
        "colsample_bylevel": 0.8,
        "reg_alpha": 0,
    }

    trn_X, val_X, trn_y, val_y = train_test_split(train_X,
                                                  train_y,
                                                  test_size=0.20,
                                                  random_state=0)
    xg_trn = xgb.DMatrix(trn_X, label=trn_y)
    xg_val = xgb.DMatrix(val_X, label=val_y)
    watchlist = [(xg_trn, 'train'), (xg_val, 'eval')]

    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist)

    model = xgb.train(xg_params,
                      xg_trn,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100,
                      verbose_eval=50)

    pred_trn = model.predict(xg_trn, ntree_limit=model.best_ntree_limit)
    pred_val = model.predict(xg_val, ntree_limit=model.best_ntree_limit)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance
    create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)

    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model
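
run_xgb() above depends on a create_feats_map() helper and an XGBFMAP constant that the excerpt does not show. A plausible sketch follows; the file name is hypothetical, but the format matches what XGBoost's get_fscore(fmap=...) parses: one index<TAB>name<TAB>type line per feature, with type 'q' for a quantitative feature.

XGBFMAP = 'xgb.fmap'  # hypothetical path; the project's actual constant is not shown


def create_feats_map(features):
    # Write the feature-map file read by model.get_fscore(fmap=XGBFMAP):
    # one "index<TAB>name<TAB>q" line per feature.
    with open(XGBFMAP, 'w') as f:
        for i, name in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, name))
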
Code example #5
File: train_14.py  Project: yuyuyu0706/kaggle1
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)

    model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)

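    # NOTE: the model is built once above and reused across every fold, so
    # its weights carry over from fold to fold; rebuilding it inside this
    # loop would give truly independent out-of-fold predictions.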
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]

        for j in range(ep):
            model.fit(x=x_trn, y=y_trn, batch_size=bz, epochs=1, verbose=0)
            pred_trn = model.predict(x_trn, batch_size=bz)[:, 0]
            rmse_trn = rmse(y_trn, pred_trn)
            logger.debug('epochs {0}: rmse - Train:{1:.6f}'.format(
                j + 1, rmse_trn))

        pred_trn_X[test_i] = model.predict(x_test, batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(test_X, batch_size=bz)[:, 0]

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - NN Train: {}'.format(rmse_trn))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
Code example #6
File: train_18.py  Project: yuyuyu0706/kaggle1
    # ElasticNet Feature Processing
    '''en_train, en_test = ens_en(ready_df[:train_row],
                               train_y, train_row,
                               ready_df[train_row:], test_row)
    en_preds = np.concatenate([en_train, en_test])
    df['en_preds'] = en_preds
    del en_preds, en_train, en_test
    gc.collect()'''

    # NN Feature Processing - TFIDF
    # nn_train, nn_test = ens_nn(ready_df[:train_row],
    #                           train_y, train_row,
    #                           ready_df[train_row:], test_row)
    # NN Feature Processing - Features
    logger.debug(df.isnull().sum())
    nn_train, nn_test = ens_nn(df[:train_row], train_y, train_row,
                               df[train_row:], test_row)
    nn_preds = np.concatenate([nn_train, nn_test])
    df['nn_preds'] = nn_preds
    del nn_preds, nn_train, nn_test
    gc.collect()

    # XGB Feature Processing
    '''xgb_train, xgb_test = ens_xgb(ready_df[:train_row],
                                  train_y, train_row,
                                  ready_df[train_row:], test_row)
    xgb_preds = np.concatenate([xgb_train, xgb_test])
    df['xgb_preds'] = xgb_preds
    del xgb_preds, xgb_train, xgb_test
    gc.collect()'''
Code example #7
File: train_19.py  Project: yuyuyu0706/kaggle1
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):

    # trn_X['param_feat'], test_X['param_feat'], tknzr_pf = tknzr_fit('param_feat', trn_X, test_X)
    # trn_X['description'], test_X['description'], tknzr_pf = tknzr_desc_fit('description', trn_X, test_X)

    trn_X = trn_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]
    test_X = test_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]

    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    lr = 0.005
    bz = 100000
    ep = 500
    op = Adam(lr=lr)
    # early = EarlyStopping(monitor='val_loss', patience=500, mode='min')

    logger.info('NN Train Shape: {}'.format(trn_X.shape))
    logger.info('NN Test Shape : {}'.format(test_X.shape))
    '''# model_in = ks.Input(shape=(trn_X.shape[1],), dtype='float32', sparse=True)
    # model_in = Input(shape=(trn_X.shape[1],), dtype='float32', sparse=False)
    # out = ks.layers.Dense(192, activation='relu')(model_in)
    # out = ks.layers.Dense(64, activation='relu')(out)
    # out = ks.layers.Dense(64, activation='relu')(out)
    out = Dense(16, activation='relu')(model_in)
    out = Dense(8, activation='relu')(out)
    out = Dense(8, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)
    model = Model(model_in, out)'''

    model = make_model_nn(trn_X, test_X)
    model.compile(loss='mean_squared_error', optimizer=op)
    '''tr_reg, ts_reg = ((trn_X['region']), (test_X['region']))
    tr_city, ts_city = (np.array(trn_X['city']), np.array(test_X['city']))
    tr_pcn, ts_pcn = (np.array(trn_X['parent_category_name']), np.array(test_X['parent_category_name']))
    tr_cn, ts_cn = (np.array(trn_X['category_name']), np.array(test_X['category_name']))
    tr_ut, ts_ut = (np.array(trn_X['user_type']), np.array(test_X['user_type']))
    tr_pf, ts_pf, tknzr_pf = tknzr_fit('param_feat', trn_X, test_X)'''
    trn_X['param_feat'], test_X['param_feat'], tknzr_pf = tknzr_fit(
        'param_feat', trn_X, test_X)

    # trn_X = np.array([tr_reg, tr_city, tr_pcn, tr_cn, tr_ut, tr_pf[:,0]])
    # test_X = np.array([ts_reg, ts_city, ts_pcn, tr_cn, tr_ut, tr_pf[:,0]])

    trn_X = np.array(trn_X)
    test_X = np.array(test_X)

    logger.debug(model.summary())

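    # NOTE: as in train_14.py above, the model is built once and reused, so
    # its weights carry over between folds.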
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        # for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X.index)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        # x_trn = trn_X.iloc[trn_i]
        # y_trn = trn_y.iloc[trn_i]
        x_trn, x_val, y_trn, y_val = train_test_split(x_trn,
                                                      y_trn,
                                                      test_size=0.10,
                                                      shuffle=False,
                                                      random_state=23)
        x_test = trn_X[test_i]
        # x_test = trn_X.iloc[test_i]
        '''x_trn = np.array(x_trn)
        y_trn = np.array(y_trn)
        x_val = np.array(x_val)
        y_val = np.array(y_val)
        x_test = np.array(x_test)
        test_X = np.array(test_X)'''

        # Disabled alternative: model.fit(x=[x_trn[:, [0, 1, 2, 3, 4, 5]]], y=y_trn, ...)
        model.fit(x=[x_trn[:, 0], x_trn[:, 1], x_trn[:, 2],
                     x_trn[:, 3], x_trn[:, 4], x_trn[:, 5]],
                  y=y_trn,
                  validation_data=([x_val[:, 0], x_val[:, 1], x_val[:, 2],
                                    x_val[:, 3], x_val[:, 4], x_val[:, 5]],
                                   y_val),
                  batch_size=bz,
                  epochs=ep,
                  verbose=1)
        # batch_size=bz, epochs=ep, callbacks=[early], verbose=1)
        pred_trn_X[test_i] = model.predict(
            [x_test[:, 0], x_test[:, 1], x_test[:, 2],
             x_test[:, 3], x_test[:, 4], x_test[:, 5]],
            batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(
            [test_X[:, 0], test_X[:, 1], test_X[:, 2],
             test_X[:, 3], test_X[:, 4], test_X[:, 5]],
            batch_size=bz)[:, 0]

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('Rmse - NN Train: {}, lr: {}, bz: {}, ep: {}'.format(
        rmse_trn, lr, bz, ep))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
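
make_model_nn() is defined elsewhere in train_19.py. Judging from the six-input model.fit() call and the disabled Dense(16)/Dense(8) block above, a plausible sketch is one Embedding per integer-coded categorical column. The layer sizes, the embedding dimension, and the assumption that the first six columns are already integer-coded are all guesses, not the project's actual code.

from keras.layers import Concatenate, Dense, Embedding, Flatten, Input
from keras.models import Model


def make_model_nn(trn_X, test_X, n_inputs=6, emb_dim=8):
    # One small Embedding per integer-coded categorical column, concatenated
    # into a dense head; exposes six inputs to match the model.fit() call.
    inputs, embedded = [], []
    for col in range(n_inputs):
        # Vocabulary size per column, covering both train and test codes.
        vocab = int(max(trn_X.iloc[:, col].max(),
                        test_X.iloc[:, col].max())) + 1
        inp = Input(shape=(1,), dtype='int32')
        emb = Flatten()(Embedding(vocab, emb_dim)(inp))
        inputs.append(inp)
        embedded.append(emb)
    out = Concatenate()(embedded)
    out = Dense(16, activation='relu')(out)
    out = Dense(8, activation='relu')(out)
    out = Dense(1)(out)
    return Model(inputs, out)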