Example 1
    # --- Bagged XGBoost cross-validation fragment ---
    # NOTE(review): the enclosing `def` is outside this view; `cv_flag`,
    # `n_bags`, `cv_split_ratio`, and `NULL_VALUE` are presumably defined
    # by the enclosing scope or module — confirm against the full file.
    # Load cached train/test frames and merge them into one working frame.
    df_train, df_test = data.load_data(cache=True)
    df = data.create_fulldf(df_train, df_test)

    # Preprocessing: fill missing values with the project sentinel, then
    # clean and label-encode via project helpers (defined elsewhere).
    df = df.fillna(NULL_VALUE)
    df = data.clean_data(df)
    df = data.encode_labels(df)
    #df = features.add_features(df)

    # Target is the raw 'logerror' column; feature selection trims columns.
    logerror = df['logerror'].values
    targets = logerror
    df = data.select_features(df)

    print df.columns  # NOTE: Python 2 print statement
    if cv_flag:
        # Split out the labelled rows, then make a train/validation split.
        df_full_train, targets, df_test = data.split_data(df, logerror)
        df_train, df_test, train_targets, test_targets = data.split_cv(
            df_full_train, targets, cv_split_ratio)

        # Wrap both splits as DMatrix so xgb.train can evaluate on them.
        dtest = xgb.DMatrix(df_test.values, test_targets)
        dtrain = xgb.DMatrix(df_train.values, train_targets)

        params = model_params.get_xtune11k()
        # NOTE(review): np.repeat(0, ...) makes an *int* array; it only works
        # because the accumulation below reassigns (predict + cv_preds),
        # which upcasts to float — `cv_preds += ...` would raise instead.
        cv_preds = np.repeat(0, len(df_test))
        num_boost_rounds = 110
        # Bagging: train n_bags models and sum their validation predictions.
        # (The averaging step presumably follows below this visible fragment.)
        for i in range(n_bags):
            watchlist = [(dtrain, 'train'), (dtest, 'eval')]
            model = xgb.train(params,
                              dtrain,
                              num_boost_round=num_boost_rounds,
                              evals=watchlist,
                              early_stopping_rounds=50)
            cv_preds = model.predict(dtest) + cv_preds
Example 2
    # --- Bagged Keras cross-validation fragment ---
    # NOTE(review): the enclosing `def` is outside this view; `df_train`,
    # `df_test`, `cv_flag`, `n_bags`, `cv_split_ratio`, `epochs`,
    # `batch_size`, and `NULL_VALUE` come from the enclosing scope — confirm.
    df = data.create_fulldf(df_train, df_test)

    # Preprocessing: fill missing values with the project sentinel, then
    # clean and label-encode via project helpers (defined elsewhere).
    df = df.fillna(NULL_VALUE)
    df = data.clean_data(df)
    df = data.encode_labels(df)
    #df = features.add_features(df)

    # Target is the raw 'logerror' column; feature selection trims columns,
    # and 'assessmentyear' is additionally dropped for this model.
    logerror = df['logerror'].values
    targets = logerror
    df = data.select_features(df)
    df = df.drop(['assessmentyear'], axis=1)

    print df.columns  # NOTE: Python 2 print statement
    if cv_flag:
        # Split out the labelled rows, then make a train/validation split.
        df_full_train, targets, df_test = data.split_data(df, logerror)
        df_train, df_val, train_targets, val_targets = data.split_cv(df_full_train, targets, cv_split_ratio)

        # Float accumulator (0.) so += with float predictions is valid.
        cv_preds = np.repeat(0., len(df_val))
        # Bagging: train n_bags networks and average validation predictions.
        for i in range(n_bags):
            # Normalisation is recomputed each bag; presumably deterministic
            # given fixed inputs — model variation comes from random init.
            x_train, x_val = tools.normalise_data(df_train.values, df_val.values)
            model = model_params.get_keras(x_train.shape[1])
            # nb_epoch is the old Keras 1.x spelling of `epochs`.
            history = model.fit(
                    x_train, train_targets,
                    nb_epoch=epochs, batch_size=batch_size,
                            validation_data=(x_val, val_targets), verbose=2)
            model.history = history
            # squeeze() drops the (n, 1) prediction column to shape (n,).
            cv_preds += model.predict(x_val).squeeze()
        cv_preds /= float(n_bags)

        # Report both MAE (project helper) and MSE on the validation split.
        mae = tools.get_mae_loss(val_targets, cv_preds)
        mse = mean_squared_error(val_targets, cv_preds)
Example 3
# --- Script entry point: endless XGBoost hyper-parameter search ---
# NOTE(review): this block is truncated at the bottom of the visible
# source — the `while True:` body presumably logs/records each model's
# score after training; confirm against the full file.
if __name__ == '__main__':

    # Load cached train/test frames and merge them into one working frame.
    df_train, df_test = data.load_data(cache=True)
    df = data.create_fulldf(df_train, df_test)

    # Preprocessing: fill missing values with the project sentinel, then
    # clean and label-encode via project helpers (defined elsewhere).
    df = df.fillna(NULL_VALUE)
    df = data.clean_data(df)
    df = data.encode_labels(df)
    #df = features.add_features(df)

    # Target is the raw 'logerror' column; feature selection trims columns.
    targets = df['logerror'].values
    df = data.select_features(df)

    print df.columns  # NOTE: Python 2 print statement
    # Split out the labelled rows, then an 80/20 train/eval split
    # (0.8 hard-coded here, unlike the cv_split_ratio used elsewhere).
    df_full_train, targets, df_test = data.split_data(df, targets)
    df_train, df_test, train_targets, test_targets = data.split_cv(
        df_full_train, targets, 0.8)

    dtrain = xgb.DMatrix(df_train.values, train_targets)
    dtest = xgb.DMatrix(df_test.values, test_targets)

    # y_mean is computed but not used in the visible lines — presumably
    # consumed below (e.g. as base_score); verify in the full file.
    y_mean = np.mean(train_targets)
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]

    # Very large round cap; early stopping (15 rounds) bounds real work.
    num_boost_rounds = 80000
    # Endless search loop: draw a parameter set and train/evaluate it.
    while True:
        params = sample_params(random=False)
        model = xgb.train(params,
                          dtrain,
                          num_boost_round=num_boost_rounds,
                          evals=watchlist,
                          early_stopping_rounds=15)