df_train, df_test = data.load_data(cache=True)
df = data.create_fulldf(df_train, df_test)
df = df.fillna(NULL_VALUE)
df = data.clean_data(df)
df = data.encode_labels(df)
#df = features.add_features(df)

logerror = df['logerror'].values
targets = logerror
df = data.select_features(df)
df = df.drop(['assessmentyear'], axis=1)
print df.columns

if cv_flag:
    # Hold out a validation split for cross-validation.
    df_full_train, targets, df_test = data.split_data(df, logerror)
    df_train, df_val, train_targets, val_targets = data.split_cv(
        df_full_train, targets, cv_split_ratio)

    # Bag n_bags Keras models (each run differs only by random weight
    # initialisation) and average their validation predictions.
    cv_preds = np.repeat(0., len(df_val))
    for i in range(n_bags):
        x_train, x_val = tools.normalise_data(df_train.values, df_val.values)
        model = model_params.get_keras(x_train.shape[1])
        history = model.fit(
            x_train, train_targets,
            nb_epoch=epochs,
            batch_size=batch_size,
            validation_data=(x_val, val_targets),
            verbose=2)
        model.history = history
        cv_preds += model.predict(x_val).squeeze()
    cv_preds /= float(n_bags)
    mae = tools.get_mae_loss(val_targets, cv_preds)
    return predictions


def predict_multiple_months(df_test, predict_func):
    sub = pd.read_csv('../input/sample_submission.csv')
    df = df_test.copy()
    # Submission columns (other than ParcelId) are named 'YYYYMM', so slice
    # out the year and month and predict once per month.
    for c in sub.columns[sub.columns != 'ParcelId']:
        df['transaction_month'] = np.repeat(c[4:6], len(df))
        df['transaction_year'] = np.repeat(c[0:4], len(df))
        # `model` is the globally trained model from __main__.
        predictions = predict_func(model, df.values)
        print 'predicting for ' + c + ' ' + str(predictions.sum())
        sub[c] = predictions
    return sub


if __name__ == '__main__':
    df_train, df_test = data.load_data()
    df = data.create_fulldf(df_train, df_test)
    df = df.fillna(NULL_VALUE)
    df = data.clean_data(df)
    df = data.encode_labels(df)
    df = features.add_features(df)
    #df = data.add_month_and_year(df)

    targets = df['logerror'].values
    df = data.select_features(df)
    df_train, targets, df_test = data.split_data(df, targets)

    model = train_xgb_cv(df_train, targets)
    sub = predict_multiple_months(df_test, predict_xgb)
    data.generate_kaggle_file(sub, 'sub/xgb_try_exper_quasi.csv')