# Load both frames and combine them so that null filling, cleaning and label
# encoding are applied to the train and test rows in a single pass.
df_train, df_test = data.load_data(cache=True)
df = data.create_fulldf(df_train, df_test).fillna(NULL_VALUE)
df = data.encode_labels(data.clean_data(df))
# df = features.add_features(df)

# Capture the target column before feature selection is applied to the frame.
targets = logerror = df['logerror'].values
df = data.select_features(df)
print(df.columns)

if cv_flag:
    # NOTE: df_test is rebound twice below; after split_cv it refers to the
    # held-out split that is used for evaluation, not the frame loaded above.
    df_full_train, targets, df_test = data.split_data(df, logerror)
    df_train, df_test, train_targets, test_targets = data.split_cv(
        df_full_train, targets, cv_split_ratio)

    dtest = xgb.DMatrix(df_test.values, test_targets)
    dtrain = xgb.DMatrix(df_train.values, train_targets)

    params = model_params.get_xtune11k()
    num_boost_rounds = 110

    # Running sum of held-out predictions over all bags. It starts as an
    # integer zero array and is rebound (not updated in place) on each pass.
    cv_preds = np.repeat(0, len(df_test))
    for i in range(n_bags):
        watchlist = [(dtrain, 'train'), (dtest, 'eval')]
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_rounds,
            evals=watchlist,
            early_stopping_rounds=50)
        cv_preds = model.predict(dtest) + cv_preds
# Combine the train and test frames (df_train / df_test must already be bound
# when this runs) and apply null filling, cleaning and label encoding.
df = data.create_fulldf(df_train, df_test).fillna(NULL_VALUE)
df = data.encode_labels(data.clean_data(df))
# df = features.add_features(df)

# Capture the target column before the feature columns are narrowed down.
targets = logerror = df['logerror'].values
df = data.select_features(df)
df = df.drop(['assessmentyear'], axis=1)
print(df.columns)

if cv_flag:
    df_full_train, targets, df_test = data.split_data(df, logerror)
    df_train, df_val, train_targets, val_targets = data.split_cv(
        df_full_train, targets, cv_split_ratio)

    # Float accumulator: it is updated in place with += and /= below.
    cv_preds = np.repeat(0., len(df_val))
    for i in range(n_bags):
        x_train, x_val = tools.normalise_data(df_train.values, df_val.values)
        # A new network is requested for every bag, sized by the number of
        # input columns.
        model = model_params.get_keras(x_train.shape[1])
        history = model.fit(
            x_train,
            train_targets,
            nb_epoch=epochs,
            batch_size=batch_size,
            validation_data=(x_val, val_targets),
            verbose=2)
        model.history = history
        cv_preds += model.predict(x_val).squeeze()

    # Turn the summed predictions into the per-bag average, then score it.
    cv_preds /= float(n_bags)
    mae = tools.get_mae_loss(val_targets, cv_preds)
    mse = mean_squared_error(val_targets, cv_preds)
if __name__ == '__main__':
    # Load both frames and push them through null filling, cleaning and label
    # encoding as one combined frame.
    df_train, df_test = data.load_data(cache=True)
    df = data.create_fulldf(df_train, df_test).fillna(NULL_VALUE)
    df = data.encode_labels(data.clean_data(df))
    # df = features.add_features(df)

    # Capture the target column before feature selection.
    targets = df['logerror'].values
    df = data.select_features(df)
    print(df.columns)

    # NOTE: df_test is rebound by split_cv to the held-out split that is used
    # as the evaluation set below.
    df_full_train, targets, df_test = data.split_data(df, targets)
    df_train, df_test, train_targets, test_targets = data.split_cv(
        df_full_train, targets, 0.8)

    dtrain = xgb.DMatrix(df_train.values, train_targets)
    dtest = xgb.DMatrix(df_test.values, test_targets)
    y_mean = np.mean(train_targets)

    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    num_boost_rounds = 80000

    # Repeatedly request a parameter set and train with early stopping on the
    # 'eval' set. NOTE(review): no exit condition is visible in this excerpt.
    while True:
        params = sample_params(random=False)
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_rounds,
            evals=watchlist,
            early_stopping_rounds=15)