Exemple #1
0

def predict_mlp(df, save_name):
    """Load a saved Keras model and return its predictions for ``df``.

    Args:
        df: Data handed to ``get_inputs``; only element ``[0]`` of what
            ``get_inputs`` returns is fed to the model.
        save_name: Path passed to ``keras.models.load_model``.

    Returns:
        Whatever ``model.predict`` returns, computed with a batch size of 1024.
    """
    network = keras.models.load_model(save_name)
    features = get_inputs(df)[0]
    return network.predict(features, batch_size=1024)


if __name__ == "__main__":
    """
    python scripts/04_predict_mlp_meter.py --normalize_target
    python scripts/04_predict_mlp_meter.py
    """
    # NOTE: the string above is a usage note only; it is a no-op expression.

    args = parser.parse_args()

    # Pick the preprocessed test set that matches the --normalize_target flag.
    with timer("Loading data"):
        if args.normalize_target:
            test = load_data("test_nn_target_normalized_meter")
            # Only defined on this branch; presumably used further down to
            # undo the square-feet normalization -- confirm against the
            # rest of the script.
            test_square_feet = load_data("test_clean")["square_feet"].values
        else:
            test = load_data("test_nn_meter")
        # Placeholder target column (constant -1) added to the test frame.
        test["target"] = -1

    with timer("Predicting"):
        # One prediction slot per test row, initialised to zero.
        test_preds = np.zeros(len(test))

        # Iterates over m = 0..3 (the script name suggests one pass per
        # meter type -- TODO confirm).
        for m in range(4):
            print(m)
            # get base file name
            model_name = f"mlp-split_meter"
            # make_dir is called on every iteration with the same path.
            make_dir(f"{MODEL_PATH}/{model_name}")
    # month columns
    "month_x",
    "month_y",
    "building_month",  #"month", 
    "gte_meter_building_id_month"
]

if __name__ == "__main__":
    """
    python scripts/04_predict_cb_site.py --normalize_target
    python scripts/04_predict_cb_site.py
    """
    # NOTE: the string above is a usage note only; it is a no-op expression.

    args = parser.parse_args()

    # Load the cleaned test set and drop the columns listed in DROP_COLS.
    with timer("Loading data"):
        test = load_data("test_clean")
        test.drop(DROP_COLS, axis=1, inplace=True)

    with timer("Preprocesing"):
        # Cast every column named in CAT_COLS to the pandas "category" dtype.
        for x in CAT_COLS:
            test[x] = test[x].astype("category")

        if args.normalize_target:
            # Every column whose name contains "gte" is divided by
            # log1p(square_feet). The double-bracket selection keeps the
            # divisor 2-D so it broadcasts across all selected columns.
            target_encode_cols = [x for x in test.columns if "gte" in x]
            test[target_encode_cols] = test[target_encode_cols] / np.log1p(
                test[["square_feet"]].values)

    # get base file name
    # One prediction slot per test row, initialised to zero.
    test_preds = np.zeros(len(test))
    for s in range(16):
    #             test_indices = test.site_id == s
    #             for col in CAT_COLS:

    #                 x = np.concatenate([train.loc[train_indices, col], test.loc[test_indices, col]])
    #                 encoder = FastLabelEncoder()
    #                 encoder.fit(x)

    #                 train.loc[train_indices, col] = encoder.transform(train.loc[train_indices, col])
    #                 test.loc[test_indices, col] = encoder.transform(test.loc[test_indices, col])

    #     with timer("Save Data"):
    #         train.to_pickle(f"{DATA_PATH}/preprocessed/train_nn_target_normalized_site.pkl")
    #         test.to_pickle(f"{DATA_PATH}/preprocessed/test_nn_target_normalized_site.pkl")

    # meter site no normalization
    with timer("Loading data"):
        train, test = load_data("clean")

    with timer("Standardize Numeric Features"):
        for s in range(16):
            train_indices = train.site_id == s
            test_indices = test.site_id == s

            X = np.concatenate([
                train.loc[train_indices, NUM_COLS].values,
                test.loc[test_indices, NUM_COLS].values
            ])
            mu = X.mean(0)
            sig = X.std(0)

            train.loc[train_indices,
Exemple #4
0
import os
import glob
import numpy as np
import pandas as pd
from functools import partial
from sklearn.metrics import mean_squared_error
from ashrae.blenders import load_preds, GeneralizedMeanBlender
from ashrae.utils import OUTPUT_PATH, load_data, rmsle, timer

if __name__ == "__main__":
    """
    python scripts/05_blend_predictions.py
    """

    # load test and leak
    with timer("load test and leak"):
        test = load_data("test_clean")
        leak = load_data("is_leak")
        target = leak["meter_reading"].values

    # load predictions
    with timer("load predictions"):
        # glob returns paths in arbitrary, filesystem-dependent order; sort
        # them so the column order of the prediction matrix is reproducible
        # between runs and machines.
        npy_paths = sorted(glob.glob(f"{OUTPUT_PATH}/*.npy"))
        csv_paths = sorted(glob.glob(f"{OUTPUT_PATH}/*.csv"))

        # .npy predictions first, then the "meter_reading" column of each
        # .csv file. Extending with an empty list is a no-op, so no length
        # guard is needed around the csv part.
        preds_matrix = [np.load(path) for path in npy_paths]
        preds_matrix += [
            pd.read_csv(path).meter_reading.values for path in csv_paths
        ]

        # Stack to (n_rows, n_models) and clip negative predictions to zero.
        preds_matrix = np.vstack(preds_matrix).T
        preds_matrix[preds_matrix < 0] = 0
def train_mlp(train,
              valid,
              cat_counts,
              save_name,
              lr=1e-3,
              lr_decay=1e-4,
              batch_size=512,
              epochs=25,
              emb_l2_reg=1e-3,
              emb_dim=1,
              n_dense_max=256,
              n_dense_min=32,
              n_layers=3,
              dropout=0.5):
    """Build, compile and fit an embedding + dense Keras network.

    The network has one numerical input of width ``len(NUM_COLS)`` and one
    single-value input per entry of ``CAT_COLS``. Each categorical input is
    embedded and flattened, everything is concatenated, and the result goes
    through ``n_layers`` blocks of Dense(relu) -> Dropout -> BatchNormalization
    followed by a single linear output unit. The loss is mean squared error.

    Args:
        train: Training data, passed to ``get_inputs`` and unpacked
            positionally into ``model.fit``.
        valid: Validation data, passed to ``get_inputs`` and used as
            ``validation_data``.
        cat_counts: Mapping iterated in its own key order; each value is used
            as ``input_dim`` of one embedding.
        save_name: Path the best checkpoint is written to.
        lr: Learning rate given to Adam.
        lr_decay: Decay given to Adam.
        batch_size: Batch size for fitting.
        epochs: Maximum number of epochs.
        emb_l2_reg: L2 regularisation factor on embedding weights.
        emb_dim: Output dimension of every embedding.
        n_dense_max: Width of the first dense layer.
        n_dense_min: Lower bound on dense layer width.
        n_layers: Number of dense blocks.
        dropout: Dropout rate after each dense layer.

    Returns:
        None. The trained model is only persisted via the checkpoint callback.
    """
    with timer("Create  model"):
        # Input tensors: one numeric block plus one scalar per categorical.
        numeric_in = keras.layers.Input(shape=(len(NUM_COLS), ),
                                        name="numerical_inputs")
        categorical_in = [
            keras.layers.Input(shape=(1, ), name=col) for col in CAT_COLS
        ]

        # NOTE(review): embeddings are paired with inputs by zipping the keys
        # of ``cat_counts`` with the CAT_COLS-ordered inputs, so this assumes
        # both share the same order -- confirm at the call site.
        embedded = []
        for key, cat_in in zip(cat_counts, categorical_in):
            embedding_layer = keras.layers.Embedding(
                input_dim=cat_counts[key],
                output_dim=emb_dim,
                embeddings_regularizer=keras.regularizers.l2(emb_l2_reg))
            embedded.append(keras.layers.Flatten()(embedding_layer(cat_in)))

        # Dense stack: the width halves per layer, floored at n_dense_min.
        hidden = keras.layers.Concatenate(name="general_features")(
            [numeric_in, *embedded])
        for depth in range(n_layers):
            width = int(max((0.5**depth) * n_dense_max, n_dense_min))
            hidden = keras.layers.Dense(width, activation="relu")(hidden)
            hidden = keras.layers.Dropout(dropout)(hidden)
            hidden = keras.layers.BatchNormalization()(hidden)

        # Single linear output unit.
        prediction = keras.layers.Dense(1, activation=None,
                                        name="outputs")(hidden)
        model = keras.models.Model(inputs=[numeric_in, *categorical_in],
                                   outputs=prediction)

        optimizer = keras.optimizers.Adam(lr=lr, decay=lr_decay)
        model.compile(loss=keras.losses.mean_squared_error,
                      optimizer=optimizer)

        model.summary()

    with timer("Training"):
        train_inputs = get_inputs(train)
        valid_inputs = get_inputs(valid)

        # Stop after 2 epochs without improvement; keep only the checkpoint
        # with the lowest validation loss.
        early_stopping = keras.callbacks.EarlyStopping(patience=2, verbose=1)
        checkpoint = keras.callbacks.ModelCheckpoint(
            save_name,
            save_best_only=True,
            verbose=1,
            monitor='val_loss',
            mode='min')

        model.fit(*train_inputs,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=valid_inputs,
                  callbacks=[early_stopping, checkpoint])
    return None
Exemple #6
0
    # month columns
    "month_x",
    "month_y",
    "building_month",  #"month", 
    "gte_meter_building_id_month"
]

if __name__ == "__main__":
    """
    python scripts/03_train_cb_site.py --normalize_target
    python scripts/03_train_cb_site.py
    """

    args = parser.parse_args()

    with timer("Loading data"):
        train = load_data("train_clean")
        train.drop(DROP_COLS, axis=1, inplace=True)
        # Keep only the rows whose is_bad_meter_reading flag is 0.
        good_rows = train.is_bad_meter_reading == 0
        train = train.loc[good_rows].reset_index(drop=True)

    with timer("Preprocesing"):
        # Cast every column named in CAT_COLS to the "category" dtype.
        for col in CAT_COLS:
            train[col] = train[col].astype("category")

        if args.normalize_target:
            # Columns whose name contains "gte", and the target itself, are
            # scaled by log1p(square_feet).
            log_square_feet = np.log1p(train["square_feet"])
            target_encode_cols = [c for c in train.columns if "gte" in c]
            train[target_encode_cols] = (
                train[target_encode_cols] / log_square_feet.values[:, None])
            train["target"] = (
                np.log1p(train["meter_reading"]) / log_square_feet)
Exemple #7
0
    df["building_weekday"] = bm_ + df.weekday.astype(str)
    df["building_month"] = bm_ + df.month.astype(str)
    df["building_hour"] = bm_ + df.hour.astype(str)
    df["building_meter"] = bm_

    # get holidays
    dates_range = pd.date_range(start="2015-12-31", end="2019-01-01")
    us_holidays = calendar().holidays(start=dates_range.min(),
                                      end=dates_range.max())
    df["is_holiday"] = (
        df.ts.dt.date.astype("datetime64").isin(us_holidays)).astype(np.int8)


if __name__ == "__main__":

    with timer("Loading data"):
        train, test = load_data("input")
        building_meta = load_data("meta")
        train_weather, test_weather = load_data("weather")

    with timer("Process timestamp"):
        # Parse the raw timestamp column of the meter frames first ...
        for frame in (train, test):
            frame["ts"] = pd.to_datetime(frame.timestamp)
        # ... then run process_timestamp on meter and weather frames,
        # in the same order as before.
        for frame in (train, test, train_weather, test_weather):
            process_timestamp(frame)

    with timer("Process weather"):
        weather_splits = ((train_weather, "train"), (test_weather, "test"))
        for weather_frame, split_name in weather_splits:
            process_weather(weather_frame, split_name)
Exemple #8
0
    
    # month columns
    "month_x", "month_y", "building_month", #"month", 
    "gte_meter_building_id_month"
]    


if __name__ == "__main__":
    """
    python scripts/04_predict_lgb_meter.py --normalize_target
    python scripts/04_predict_lgb_meter.py    
    """
    # NOTE: the string above is a usage note only; it is a no-op expression.

    args = parser.parse_args()

    # Load the cleaned test set and drop the columns listed in DROP_COLS.
    with timer("Loading data"):
        test = load_data("test_clean")
        test.drop(DROP_COLS, axis=1, inplace=True)

    with timer("Preprocesing"):
        # Cast every column named in CAT_COLS to the pandas "category" dtype.
        for x in CAT_COLS:
            test[x] = test[x].astype("category")

        if args.normalize_target:
            # Every column whose name contains "gte" is divided by
            # log1p(square_feet). The double-bracket selection keeps the
            # divisor 2-D so it broadcasts across all selected columns.
            target_encode_cols = [x for x in test.columns if "gte" in x]
            test[target_encode_cols] = test[target_encode_cols]/np.log1p(test[["square_feet"]].values)

    with timer("Predicting"):
        # get base file name
        # One prediction slot per test row, initialised to zero.
        test_preds = np.zeros(len(test))
        for m in range(4):    
    f"{OUTPUT_PATH}/cb-split_primary_use-target_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-no_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-target_normalization.npy",
    f"{OUTPUT_PATH}/mlp-split_meter-no_normalization.npy",
    f"{OUTPUT_PATH}/submission_cleanup.csv",
    f"{OUTPUT_PATH}/submission_kfold.csv",
    f"{OUTPUT_PATH}/submission_meter.csv",
]

if __name__ == "__main__":
    """
    python scripts/05_blend_predictions.py
    """

    # load test data
    with timer("load test data"):
        test = load_data("test_clean")
        leak = load_data("is_leak")
        target = leak["meter_reading"].values

    # load predictions
    with timer("load predictions"):
        # Entries of MODEL_LIST are routed by the extension they contain.
        npy_files = [path for path in MODEL_LIST if ".npy" in path]
        preds_matrix = [np.load(path) for path in npy_files]

        # Boolean mask of the rows with site_id == 0 and meter == 0.
        replace_inds = (test.site_id == 0) & (test.meter == 0)

        csv_files = [path for path in MODEL_LIST if ".csv" in path]
        if csv_files:
            # For csv submissions only the "meter_reading" column is used.
            preds_matrix += [
                pd.read_csv(path).meter_reading.values for path in csv_files
            ]

def predict_mlp(df, save_name):
    """Return predictions for ``df`` from the Keras model saved at ``save_name``.

    Args:
        df: Data handed to ``get_inputs``; the first element of its return
            value is what the model receives.
        save_name: Path passed to ``keras.models.load_model``.

    Returns:
        The output of ``model.predict`` run with ``batch_size=1024``.
    """
    loaded_model = keras.models.load_model(save_name)
    model_inputs = get_inputs(df)[0]
    return loaded_model.predict(model_inputs, batch_size=1024)


if __name__ == "__main__":
    """
    python scripts/04_predict_mlp_meter.py --normalize_target
    python scripts/04_predict_mlp_meter.py
    """

    args = parser.parse_args()

    # Pick the preprocessed test set that matches the --normalize_target flag.
    with timer("Loading data"):
        if args.normalize_target:
            # The dataset key is spelled "target"; a misspelled key cannot
            # match the "*_target_normalized_*" naming used for the
            # preprocessed data.
            test = load_data("test_nn_target_normalized_meter")
        else:
            test = load_data("test_nn_meter")

    # One prediction slot per test row, initialised to zero.
    test_preds = np.zeros(len(test))

    for m in range(4):
        print(m)
        # get base file name
        model_name = "mlp-split_meter"
        make_dir(f"{MODEL_PATH}/{model_name}")

        # create sub model path
        if args.normalize_target:
Exemple #11
0
    f"{OUTPUT_PATH}/cb-split_primary_use-target_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-no_normalization.npy",
    f"{OUTPUT_PATH}/cb-split_site-target_normalization.npy",
    f"{OUTPUT_PATH}/mlp-split_meter-no_normalization.npy",
    f"{OUTPUT_PATH}/submission_cleanup.csv",
    f"{OUTPUT_PATH}/submission_kfold.csv",
    f"{OUTPUT_PATH}/submission_meter.csv",
]

if __name__ == "__main__":
    """
    python scripts/05_optimize_blend_predictions.py
    """

    # load test data
    with timer("load test data"):
        test = load_data("test_clean")
        leak = load_data("is_leak")
        target = leak["meter_reading"].values

    # load predictions
    with timer("load predictions"):
        # .npy entries of MODEL_LIST are loaded directly as arrays.
        preds_matrix = [
            np.load(model_file) for model_file in MODEL_LIST
            if ".npy" in model_file
        ]

        # Boolean mask of the rows with site_id == 0 and meter == 0.
        replace_inds = (test.site_id == 0) & (test.meter == 0)

        # .csv entries contribute their "meter_reading" column.
        csv_models = [
            model_file for model_file in MODEL_LIST if ".csv" in model_file
        ]
        if csv_models:
            preds_matrix += [
                pd.read_csv(model_file).meter_reading.values
                for model_file in csv_models
            ]

        # Stack to one column per model, then clip negatives to zero.
        preds_matrix = np.vstack(preds_matrix).T
        negative_mask = preds_matrix < 0
        preds_matrix[negative_mask] = 0