Esempio n. 1
0
def train(dataset, labels):
    """Fit one XGBoost regressor per pollutant and return them in a dict.

    The dataset is split once into its NO2 / PM10 / PM25 parts. For each
    part the feature matrix is built with ``make_features`` (keyword
    arguments taken from ``features_config[pollutant]``) and the targets are
    obtained with ``get_Y``. The training-set MSE of each model is printed.

    Args:
        dataset: Full training set, in whatever form
            ``split_pollutant_dataset`` accepts.
        labels: Targets for ``dataset``; passed unchanged to ``get_Y``.

    Returns:
        dict mapping "NO2", "PM10" and "PM25" to the fitted
        ``xgb.XGBRegressor``.
    """
    pollutants = ["NO2", "PM10", "PM25"]
    # Split only once. The strict three-way unpacking is kept on purpose so a
    # split that does not yield exactly three parts still fails loudly
    # instead of being silently truncated by zip().
    no2_df, pm10_df, pm25_df = split_pollutant_dataset(dataset)
    ds = dict(zip(pollutants, (no2_df, pm10_df, pm25_df)))
    # Build features and targets for every pollutant before any training.
    f = {}
    for poll in pollutants:
        f[poll] = {
            "X": make_features(ds[poll], **features_config[poll]),
            "Y": get_Y(labels, ds[poll]),
        }
    # Train one model per pollutant.
    model_dict = {}
    for poll in pollutants:
        X, Y = f[poll]["X"], f[poll]["Y"]
        xgb_model = xgb.XGBRegressor(max_depth=6,
                                     n_estimators=200,
                                     reg_lambda=1)
        xgb_model.fit(X, Y)
        # Report the error on the data the model was just fitted on.
        mse = mean_squared_error(Y, xgb_model.predict(X))
        print("%s: MSE on training set: %.3f" % (poll, mse))
        model_dict[poll] = xgb_model
    return model_dict
Esempio n. 2
0
def predict(model_dict, dataset):
    """Predict the target for every pollutant part of ``dataset``.

    Args:
        model_dict: Mapping with the keys "NO2", "PM10" and "PM25", each
            holding an object exposing ``predict``.
        dataset: Data to predict on, in whatever form
            ``split_pollutant_dataset`` accepts.

    Returns:
        A DataFrame with a single "TARGET" column: the NO2, PM10 and PM25
        predictions stacked in that order, each keeping the index of its
        feature matrix.
    """
    # One part per pollutant, in the fixed order NO2, PM10, PM25.
    no2_part, pm10_part, pm25_part = split_pollutant_dataset(dataset)
    parts = (("NO2", no2_part), ("PM10", pm10_part), ("PM25", pm25_part))
    # Build every feature matrix first, then run the models.
    features = [(name, make_features(part, **features_config[name]))
                for name, part in parts]
    frames = [
        pd.DataFrame(model_dict[name].predict(feats),
                     columns=["TARGET"],
                     index=feats.index)
        for name, feats in features
    ]
    # Stack the per-pollutant predictions row-wise.
    return pd.concat(frames, axis=0)
Esempio n. 3
0
    b = pd.DataFrame(0, index=dev.index, columns=["fold"])
    test_fold = pd.concat([a, b], axis=0)
    return test_fold


# NOTE(review): absolute, machine-specific locations of the input CSV files.
X_train_path = "/Users/thomasopsomer/data/plume-data/X_train.csv"
X_test_path = "/Users/thomasopsomer/data/plume-data/X_test.csv"
Y_train_path = "/Users/thomasopsomer/data/plume-data/Y_train.csv"

# Load the training features (indexed by the "ID" column), pass them through
# preprocess_dataset, then load the labels with the same "ID" index.
df = pd.read_csv(X_train_path, index_col="ID")
df = preprocess_dataset(df)
Y = pd.read_csv(Y_train_path, index_col="ID")

# Split the preprocessed data into one part per pollutant.
NO2_df, PM10_df, PM25_df = split_pollutant_dataset(df)

# Split each pollutant part into a train and a dev set, driven by
# zone_station_train / zone_station_dev (defined outside this excerpt).
NO2_train, NO2_dev = split_train_dev(NO2_df, zone_station_train,
                                     zone_station_dev)
PM10_train, PM10_dev = split_train_dev(PM10_df, zone_station_train,
                                       zone_station_dev)
PM25_train, PM25_dev = split_train_dev(PM25_df, zone_station_train,
                                       zone_station_dev)

# make features and get labels

# NO2
NO2_train_f, NO2_dev_f = make_features(NO2_train,
                                       NO2_dev,
                                       normalize=False,
Esempio n. 4
0
def train_predict(train,
                  test,
                  Y_train,
                  model_dict=None,
                  output_path=None,
                  pm=False,
                  model="rf"):
    """Train (or reuse) one regressor per pollutant and predict on ``test``.

    Args:
        train: Training set, in whatever form ``split_pollutant_dataset``
            accepts.
        test: Test set, split and featurized the same way as ``train``.
        Y_train: Training labels. May be None only when ``model_dict`` is
            given; training-set evaluation is then skipped.
        model_dict: Optional mapping pollutant -> already fitted regressor.
            When None, new models are trained.
        output_path: If not None and new models were trained, the resulting
            dict of models is pickled to this path.
        pm: When true the pollutants are "NO2" and "PM"; otherwise "NO2",
            "PM10" and "PM25". Also forwarded to ``split_pollutant_dataset``.
        model: "rf" trains a ``RandomForestRegressor(**rf_config)``; any
            other value trains ``xgb.XGBRegressor(max_depth=6,
            **xgb_config[pollutant])``.

    Returns:
        DataFrame with a single "TARGET" column holding the test predictions
        of all pollutants, sorted by index.

    Raises:
        ValueError: If both ``model_dict`` and ``Y_train`` are None, since
            training is impossible without labels.
    """
    # Fail fast: previously this surfaced as a KeyError on "Y" only after all
    # features had been built.
    if model_dict is None and Y_train is None:
        raise ValueError(
            "Y_train is required when model_dict is not provided")
    pollutants = ["NO2", "PM"] if pm else ["NO2", "PM10", "PM25"]
    print("%i regressor will be trained for each pollutant of %s" %
          (len(pollutants), pollutants))
    # Split both datasets and key the parts by pollutant name.
    train_ds = dict(zip(pollutants, split_pollutant_dataset(train, pm)))
    test_ds = dict(zip(pollutants, split_pollutant_dataset(test, pm)))
    # Build train/test features (and targets when labels are available).
    f = {}
    for poll in pollutants:
        f[poll] = {}
        f[poll]["X_train"], f[poll]["X_test"] = make_features(
            train_ds[poll], dev=test_ds[poll], **features_config[poll])
        if Y_train is not None:
            f[poll]["Y"] = get_Y(Y_train, train_ds[poll])
    # Train a model for each pollutant unless fitted models were supplied.
    if model_dict is None:
        model_dict = {}
        for poll in pollutants:
            # Fit on shuffled rows; evaluation below uses the unshuffled data.
            X, Y = shuffle_XY(f[poll]["X_train"], f[poll]["Y"])
            if model == "rf":
                reg = RandomForestRegressor(**rf_config)
            else:
                reg = xgb.XGBRegressor(max_depth=6, **xgb_config[poll])
            print("Training a %s model on pollutant %s ..." % (model, poll))
            reg.fit(X, Y)
            print("Training done on %s" % poll)
            model_dict[poll] = reg
        if output_path is not None:
            print("Saving the dictionnary of models in %s" % output_path)
            with open(output_path, "wb") as fout:
                pickle.dump(model_dict, fout)
    # Training-set evaluation needs labels; skip it in prediction-only mode.
    if Y_train is not None:
        preds = []
        for poll in pollutants:
            Y_pred_poll = _predict_as_frame(model_dict[poll],
                                            f[poll]["X_train"])
            preds.append(Y_pred_poll)
            mse = mean_squared_error(f[poll]["Y"], Y_pred_poll)
            print("%s: MSE on training set: %.3f" % (poll, mse))
        # NOTE(review): this compares row by row, so it presumably relies on
        # Y_train having the same rows in index-sorted order - confirm.
        Y_pred = pd.concat(preds, axis=0).sort_index()
        mse = mean_squared_error(Y_train, Y_pred)
        print("GLOBAL MSE on training set: %.3f" % mse)
    # Predict on the test set.
    print("Computing prediction on test data...")
    preds = [
        _predict_as_frame(model_dict[poll], f[poll]["X_test"])
        for poll in pollutants
    ]
    # Stack the per-pollutant predictions and restore index order.
    Y_pred = pd.concat(preds, axis=0).sort_index()
    print("Prediction done.")
    return Y_pred


def _predict_as_frame(reg, X):
    """Return ``reg.predict(X)`` as a one-column "TARGET" frame on X's index."""
    return pd.DataFrame(reg.predict(X), columns=["TARGET"], index=X.index)