def predict_mlp(df, save_name):
    """Score `df` with the Keras model stored at `save_name`.

    The model is re-loaded from disk on every call. Only the first element
    returned by `get_inputs(df)` is handed to the network, and inference runs
    in batches of 1024 rows.

    Args:
        df: frame forwarded unchanged to `get_inputs`.
        save_name: path understood by `keras.models.load_model`.

    Returns:
        Whatever `model.predict` produces for those inputs.
    """
    network = keras.models.load_model(save_name)
    features = get_inputs(df)[0]
    return network.predict(features, batch_size=1024)


if __name__ == "__main__":
    """
    python scripts/04_predict_mlp_meter.py --normalize_target
    python scripts/04_predict_mlp_meter.py
    """

    args = parser.parse_args()

    with timer("Loading data"):
        # The flag selects which preprocessed test set is scored.
        dataset = ("test_nn_target_normalized_meter"
                   if args.normalize_target else "test_nn_meter")
        test = load_data(dataset)
        if args.normalize_target:
            # Raw square footage is kept alongside the normalized features.
            test_square_feet = load_data("test_clean")["square_feet"].values
        # Constant placeholder column; presumably so get_inputs finds a
        # "target" column on the test frame -- TODO confirm.
        test["target"] = -1

    with timer("Predicting"):
        # One prediction slot per test row, filled in by the loop below.
        test_preds = np.zeros(len(test))
        for m in range(4):
            print(m)

            # get base file name
            model_name = f"mlp-split_meter"
            make_dir(f"{MODEL_PATH}/{model_name}")
# month columns "month_x", "month_y", "building_month", #"month", "gte_meter_building_id_month" ] if __name__ == "__main__": """ python scripts/04_predict_cb_site.py --normalize_target python scripts/04_predict_cb_site.py """ args = parser.parse_args() with timer("Loading data"): test = load_data("test_clean") test.drop(DROP_COLS, axis=1, inplace=True) with timer("Preprocesing"): for x in CAT_COLS: test[x] = test[x].astype("category") if args.normalize_target: target_encode_cols = [x for x in test.columns if "gte" in x] test[target_encode_cols] = test[target_encode_cols] / np.log1p( test[["square_feet"]].values) # get base file name test_preds = np.zeros(len(test)) for s in range(16):
# test_indices = test.site_id == s # for col in CAT_COLS: # x = np.concatenate([train.loc[train_indices, col], test.loc[test_indices, col]]) # encoder = FastLabelEncoder() # encoder.fit(x) # train.loc[train_indices, col] = encoder.transform(train.loc[train_indices, col]) # test.loc[test_indices, col] = encoder.transform(test.loc[test_indices, col]) # with timer("Save Data"): # train.to_pickle(f"{DATA_PATH}/preprocessed/train_nn_target_normalized_site.pkl") # test.to_pickle(f"{DATA_PATH}/preprocessed/test_nn_target_normalized_site.pkl") # meter site no normalization with timer("Loading data"): train, test = load_data("clean") with timer("Standardize Numeric Features"): for s in range(16): train_indices = train.site_id == s test_indices = test.site_id == s X = np.concatenate([ train.loc[train_indices, NUM_COLS].values, test.loc[test_indices, NUM_COLS].values ]) mu = X.mean(0) sig = X.std(0) train.loc[train_indices,
import os
import glob
import numpy as np
import pandas as pd
from functools import partial
from sklearn.metrics import mean_squared_error

from ashrae.blenders import load_preds, GeneralizedMeanBlender
from ashrae.utils import OUTPUT_PATH, load_data, rmsle, timer


if __name__ == "__main__":
    """
    python scripts/05_blend_predictions.py
    """

    # Test frame plus the leaked meter readings that serve as `target`.
    with timer("load test and leak"):
        test = load_data("test_clean")
        leak = load_data("is_leak")
        target = leak["meter_reading"].values

    # Gather every saved prediction found in OUTPUT_PATH: .npy arrays first,
    # then the meter_reading column of each .csv file.
    # NOTE(review): glob returns paths in arbitrary order, so the column
    # order of preds_matrix follows whatever order the filesystem reports.
    with timer("load predictions"):
        preds_matrix = [
            np.load(path) for path in glob.glob(f"{OUTPUT_PATH}/*.npy")
        ]
        csv_paths = glob.glob(f"{OUTPUT_PATH}/*.csv")
        if csv_paths:
            preds_matrix.extend(
                pd.read_csv(path).meter_reading.values for path in csv_paths)

        # Each loaded array is stacked as a row, then transposed.
        preds_matrix = np.vstack(preds_matrix).T
        # Negative predictions are floored at zero.
        preds_matrix[preds_matrix < 0] = 0
def train_mlp(train,
              valid,
              cat_counts,
              save_name,
              lr=1e-3,
              lr_decay=1e-4,
              batch_size=512,
              epochs=25,
              emb_l2_reg=1e-3,
              emb_dim=1,
              n_dense_max=256,
              n_dense_min=32,
              n_layers=3,
              dropout=0.5):
    """Build, compile and fit an embedding + dense Keras network.

    The network has one numeric input of width ``len(NUM_COLS)`` and one
    scalar input per entry of ``CAT_COLS``. Each categorical input goes
    through its own L2-regularised embedding, the flattened embeddings are
    concatenated with the numeric input, and the result feeds a stack of
    Dense -> Dropout -> BatchNormalization blocks ending in a single linear
    output. The best checkpoint is written to ``save_name``.

    Args:
        train: data passed to ``get_inputs``; the result is unpacked as the
            positional arguments of ``model.fit``.
        valid: data passed to ``get_inputs``; the result is used as
            ``validation_data``.
        cat_counts: mapping whose values are used as embedding ``input_dim``.
            It is iterated in its own order and zipped with the ``CAT_COLS``
            inputs, so that order has to line up with ``CAT_COLS``.
        save_name: checkpoint path given to ``ModelCheckpoint``.
        lr: learning rate for Adam.
        lr_decay: ``decay`` argument for Adam.
        batch_size: mini-batch size used by ``fit``.
        epochs: maximum number of epochs.
        emb_l2_reg: L2 factor applied to every embedding matrix.
        emb_dim: output dimension of every embedding.
        n_dense_max: width of the first hidden layer.
        n_dense_min: lower bound on hidden layer width.
        n_layers: number of hidden blocks.
        dropout: dropout rate used in every hidden block.

    Returns:
        None. The trained model is only available through the checkpoint.
    """
    #-------------------------
    with timer("Create model"):
        # inputs
        numeric_in = keras.layers.Input(shape=(len(NUM_COLS), ),
                                        name="numerical_inputs")
        categorical_ins = [
            keras.layers.Input(shape=(1, ), name=col) for col in CAT_COLS
        ]

        # embedding: one table per categorical input, flattened to a vector
        flat_embeddings = []
        for key, cat_in in zip(cat_counts, categorical_ins):
            table = keras.layers.Embedding(
                input_dim=cat_counts[key],
                output_dim=emb_dim,
                embeddings_regularizer=keras.regularizers.l2(emb_l2_reg))
            flat_embeddings.append(keras.layers.Flatten()(table(cat_in)))

        # mlp: width halves with each block, never dropping below n_dense_min
        hidden = keras.layers.Concatenate(name="general_features")(
            [numeric_in, *flat_embeddings])
        for depth in range(n_layers):
            width = int(max((0.5**depth) * n_dense_max, n_dense_min))
            hidden = keras.layers.Dense(width, activation="relu")(hidden)
            hidden = keras.layers.Dropout(dropout)(hidden)
            hidden = keras.layers.BatchNormalization()(hidden)

        # output: single unit, no activation
        prediction = keras.layers.Dense(1, activation=None,
                                        name="outputs")(hidden)
        model = keras.models.Model(inputs=[numeric_in, *categorical_ins],
                                   outputs=prediction)

        # compile
        optimizer = keras.optimizers.Adam(lr=lr, decay=lr_decay)
        model.compile(loss=keras.losses.mean_squared_error,
                      optimizer=optimizer)
        model.summary()

    #-------------------------
    with timer("Training"):
        fit_args = get_inputs(train)
        validation = get_inputs(valid)

        # Stop after 2 epochs without improvement of Keras' default
        # monitored quantity.
        stop_early = keras.callbacks.EarlyStopping(patience=2, verbose=1)
        # Keep only the checkpoint with the lowest validation loss.
        keep_best = keras.callbacks.ModelCheckpoint(
            save_name,  # f"{MODEL_PATH}/model_oof.hdf5"
            save_best_only=True,
            verbose=1,
            monitor='val_loss',
            mode='min')

        model.fit(*fit_args,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=validation,
                  callbacks=[stop_early, keep_best])

    return
# month columns "month_x", "month_y", "building_month", #"month", "gte_meter_building_id_month" ] if __name__ == "__main__": """ python scripts/03_train_cb_site.py --normalize_target python scripts/03_train_cb_site.py """ args = parser.parse_args() with timer("Loading data"): train = load_data("train_clean") train.drop(DROP_COLS, axis=1, inplace=True) train = train.loc[train.is_bad_meter_reading == 0].reset_index( drop=True) with timer("Preprocesing"): for x in CAT_COLS: train[x] = train[x].astype("category") if args.normalize_target: target_encode_cols = [x for x in train.columns if "gte" in x] train[target_encode_cols] = train[target_encode_cols] / np.log1p( train[["square_feet"]].values) train["target"] = np.log1p(train["meter_reading"]) / np.log1p( train["square_feet"])
df["building_weekday"] = bm_ + df.weekday.astype(str) df["building_month"] = bm_ + df.month.astype(str) df["building_hour"] = bm_ + df.hour.astype(str) df["building_meter"] = bm_ # get holidays dates_range = pd.date_range(start="2015-12-31", end="2019-01-01") us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max()) df["is_holiday"] = ( df.ts.dt.date.astype("datetime64").isin(us_holidays)).astype(np.int8) if __name__ == "__main__": with timer("Loading data"): train, test = load_data("input") building_meta = load_data("meta") train_weather, test_weather = load_data("weather") with timer("Process timestamp"): train["ts"] = pd.to_datetime(train.timestamp) test["ts"] = pd.to_datetime(test.timestamp) process_timestamp(train) process_timestamp(test) process_timestamp(train_weather) process_timestamp(test_weather) with timer("Process weather"): process_weather(train_weather, "train") process_weather(test_weather, "test")
# month columns "month_x", "month_y", "building_month", #"month", "gte_meter_building_id_month" ] if __name__ == "__main__": """ python scripts/04_predict_lgb_meter.py --normalize_target python scripts/04_predict_lgb_meter.py """ args = parser.parse_args() with timer("Loading data"): test = load_data("test_clean") test.drop(DROP_COLS, axis=1, inplace=True) with timer("Preprocesing"): for x in CAT_COLS: test[x] = test[x].astype("category") if args.normalize_target: target_encode_cols = [x for x in test.columns if "gte" in x] test[target_encode_cols] = test[target_encode_cols]/np.log1p(test[["square_feet"]].values) with timer("Predicting"): # get base file name test_preds = np.zeros(len(test)) for m in range(4):
f"{OUTPUT_PATH}/cb-split_primary_use-target_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-no_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-target_normalization.npy",
    f"{OUTPUT_PATH}/mlp-split_meter-no_normalization.npy",
    f"{OUTPUT_PATH}/submission_cleanup.csv",
    f"{OUTPUT_PATH}/submission_kfold.csv",
    f"{OUTPUT_PATH}/submission_meter.csv",
]

# Script entry point: load the test frame and leaked readings, then collect
# the predictions named in MODEL_LIST.
if __name__ == "__main__":
    """
    python scripts/05_blend_predictions.py
    """

    # load test data
    with timer("load test data"):
        test = load_data("test_clean")
        leak = load_data("is_leak")
        # Leaked meter readings serve as the target array.
        target = leak["meter_reading"].values

    # load predictions
    with timer("load predictions"):
        # .npy entries of MODEL_LIST are loaded first, in list order.
        preds_matrix = [np.load(x) for x in MODEL_LIST if ".npy" in x]
        # Boolean mask of test rows with site_id 0 and meter 0.
        replace_inds = (test.site_id == 0) & (test.meter == 0)

        # .csv entries contribute their meter_reading column, appended after
        # the .npy predictions.
        if len([x for x in MODEL_LIST if ".csv" in x]) > 0:
            preds_matrix += [
                pd.read_csv(x).meter_reading.values for x in MODEL_LIST
                if ".csv" in x
            ]
def predict_mlp(df, save_name):
    """Load the Keras model saved at `save_name` and predict on `df`.

    Only the first element of `get_inputs(df)` is fed to the model, and
    prediction uses a batch size of 1024. The model is loaded from disk on
    every call.
    """
    model = keras.models.load_model(save_name)
    return model.predict(get_inputs(df)[0], batch_size=1024)


if __name__ == "__main__":
    """
    python scripts/04_predict_mlp_meter.py --normalize_target
    python scripts/04_predict_mlp_meter.py
    """

    args = parser.parse_args()

    with timer("Loading data"):
        if args.normalize_target:
            # NOTE(review): "tareget" looks like a misspelling of "target".
            # Confirm this matches the name the preprocessing step saved
            # before relying on the --normalize_target path.
            test = load_data("test_nn_tareget_normalized_meter")
        else:
            test = load_data("test_nn_meter")

    # One prediction slot per test row.
    test_preds = np.zeros(len(test))

    # presumably one pass per meter type (0..3) -- confirm against the loop
    # body
    for m in range(4):
        print(m)

        # get base file name
        model_name = f"mlp-split_meter"
        make_dir(f"{MODEL_PATH}/{model_name}")

        # create sub model path
        if args.normalize_target:
f"{OUTPUT_PATH}/cb-split_primary_use-target_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-no_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-target_normalization.npy",
    f"{OUTPUT_PATH}/mlp-split_meter-no_normalization.npy",
    f"{OUTPUT_PATH}/submission_cleanup.csv",
    f"{OUTPUT_PATH}/submission_kfold.csv",
    f"{OUTPUT_PATH}/submission_meter.csv",
]

# Script entry point: load the test frame and leaked readings, then assemble
# the predictions named in MODEL_LIST into a single matrix.
if __name__ == "__main__":
    """
    python scripts/05_optimize_blend_predictions.py
    """

    # load test data
    with timer("load test data"):
        test = load_data("test_clean")
        leak = load_data("is_leak")
        # Leaked meter readings serve as the target array.
        target = leak["meter_reading"].values

    # load predictions
    with timer("load predictions"):
        # .npy entries of MODEL_LIST are loaded first, in list order.
        preds_matrix = [np.load(x) for x in MODEL_LIST if ".npy" in x]
        # Boolean mask of test rows with site_id 0 and meter 0.
        replace_inds = (test.site_id == 0) & (test.meter == 0)

        # .csv entries contribute their meter_reading column, appended after
        # the .npy predictions.
        if len([x for x in MODEL_LIST if ".csv" in x]) > 0:
            preds_matrix += [pd.read_csv(x).meter_reading.values for x in MODEL_LIST if ".csv" in x]

        # Each loaded array is stacked as a row, then transposed.
        preds_matrix = np.vstack(preds_matrix).T
        # Negative predictions are floored at zero.
        preds_matrix[preds_matrix < 0] = 0