Example #1
def train_models(**load_parameters):
    # Load the training data and materialize it in memory via .compute().
    rtd_ray = RtdRay()
    train = rtd_ray.load_for_ml_model(**load_parameters).compute()
    # Pickled encoders that map the raw arrival/departure status codes
    # ("ar_cs" / "dp_cs") to their encoded values.
    status_encoder = {}
    status_encoder["ar"] = pickle.load(
        open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
    status_encoder["dp"] = pickle.load(
        open(ENCODER_PATH.format(encoder="dp_cs"), "rb"))

    # Keep only rows that either have an observed delay or carry the
    # encoded status "c"; everything else is filtered out.
    ar_train = train.loc[~train["ar_delay"].isna() |
                         (train["ar_cs"] == status_encoder["ar"]["c"])]
    dp_train = train.loc[~train["dp_delay"].isna() |
                         (train["dp_cs"] == status_encoder["dp"]["c"])]
    del train

    # Binary targets per class: an arrival model predicts "delay <= label"
    # minutes, a departure model predicts "delay >= label + 1" minutes;
    # rows with status "c" are always negative.
    ar_labels = {}
    dp_labels = {}
    for label in CLASSES_TO_COMPUTE:
        ar_labels[label] = (ar_train["ar_delay"] <= label) & (
            ar_train["ar_cs"] != status_encoder["ar"]["c"])
        dp_labels[label + 1] = (dp_train["dp_delay"] >=
                                (label + 1)) & (dp_train["dp_cs"] !=
                                                status_encoder["dp"]["c"])

    # Drop the label columns so they are not used as features.
    for col in ("ar_delay", "dp_delay", "ar_cs", "dp_cs"):
        del ar_train[col]
        del dp_train[col]

    # Make sure the model cache directory exists.
    os.makedirs("cache/models", exist_ok=True)

    # Hyperparameters for train_model, stored per class.
    parameters = pickle.load(open(CACHE_PATH + "/hyperparameters.pkl", "rb"))

    # Train and pickle one binary model per class, for arrivals and departures.
    for label in CLASSES_TO_COMPUTE:
        model_name = f"ar_{label}"
        print("training", model_name)
        pickle.dump(
            train_model(ar_train, ar_labels[label], **parameters[label]),
            open(MODEL_PATH.format(model_name), "wb"),
        )

        dp_label = label + 1
        model_name = f"dp_{dp_label}"
        print("training", model_name)
        pickle.dump(
            train_model(dp_train, dp_labels[dp_label], **parameters[label]),
            open(MODEL_PATH.format(model_name), "wb"),
        )
Example #2
 def __init__(self):
     # Pickled encoders for the categorical feature columns.
     self.cat_encoders = {}
     for cat in ["o", "c", "n", "station", "pp"]:
         self.cat_encoders[cat] = pickle.load(
             open(ENCODER_PATH.format(encoder=cat), "rb")
         )
     # One pickled binary classifier per delay class, for arrival ("ar")
     # and departure ("dp") predictions.
     self.ar_models = []
     self.dp_models = []
     for model in range(40):
         self.ar_models.append(
             pickle.load(open(MODEL_PATH.format("ar_" + str(model)), "rb")))
         self.dp_models.append(
             pickle.load(open(MODEL_PATH.format("dp_" + str(model)), "rb")))
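The constructor only loads the artifacts. A minimal sketch of how the per-class models could then be queried, assuming each pickled model exposes a scikit-learn-style predict_proba() and that the features are already encoded with the same cat_encoders:

import numpy as np

def predict_class_probabilities(models, features):
    # Stack the positive-class probability of every per-class binary model
    # into one (n_samples, n_classes) array.
    return np.stack(
        [model.predict_proba(features)[:, 1] for model in models], axis=1
    )

# e.g. predict_class_probabilities(predictor.ar_models, encoded_features),
# where predictor is an instance of the class above (hypothetical usage).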
Example #3
def train_and_predict(gas_station_id=DEFAULT_GAS_STATION_ID, start_time=None, end_time=None,
                      up_to_days=DEFAULT_UP_TO_DAYS, plot=False, use_cached=False, cache=False):
    """
    Train the model for gas_station_id and return the prediction for the next up_to_days days
    :param gas_station_id: Internal identifier of the gas station
    :param up_to_days: Amount of days that should be predicted, ignored if start_time and end_time are not None
    :param start_time: Timestamp of the beginning of the forecast
    :param end_time: Timestamp of the end of the forecast
    :param plot: Whether to plot the forecast
    :param use_cached: Whether to load the serialized model if it exists
    :param cache: Whether to persist the model that was fitted on the possibly partial dataset
    :return: Fitted Model,
             DataFrame containing the true future prices
             DataFrame containing the predicted prices
    """
    model_loaded = False
    if use_cached:
        model_path = MODEL_PATH.format(gas_station_id)
        try:
            if not os.path.isfile(model_path):
                raise ValueError("No model was found at {}".format(model_path))

            model = pickle.load(open(model_path, "rb"))
            df_future = None
            model_loaded = True
        except Exception as e:
            # Fall back to training below if the cached model cannot be loaded.
            print(e)

    if not model_loaded:
        model, df_future = train(gas_station_id=gas_station_id, up_to_days=up_to_days, cache=cache)
    df_forecast = predict(model, start_time=start_time, end_time=end_time, up_to_days=up_to_days, plot=plot)
    return model, df_future, df_forecast
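A hedged usage sketch; the module name, station id and horizons are placeholders, not the project's actual layout. It assumes predict() returns a Prophet-style forecast frame with 'ds'/'yhat' columns ('yhat' is confirmed by the test in Example #4):

# Hypothetical usage; module name and argument values are placeholders.
from gas_price_prediction import train_and_predict

model, df_future, df_forecast = train_and_predict(
    gas_station_id=3,     # placeholder id
    up_to_days=14,        # hold out and forecast the last 14 days
    use_cached=True,      # reuse a pickled model if one exists
    cache=True,           # persist a freshly fitted model otherwise
)
print(df_forecast[['ds', 'yhat']].tail())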
Example #4
 def test_model_caching(self):
     model_path = MODEL_PATH.format(GAS_STATION_ID)
     if os.path.isfile(model_path):
         os.remove(model_path)
     # The first call trains from scratch and persists the model; the second
     # call should load the pickled model and reproduce its predictions.
     model_new, df_future_new, df_forecast_new = train_and_predict(
         gas_station_id=GAS_STATION_ID, use_cached=True, cache=True, up_to_days=31)
     model_cached, df_future_cached, df_forecast_cached = train_and_predict(
         gas_station_id=GAS_STATION_ID, use_cached=True, up_to_days=31)
     assert (df_forecast_new['yhat'] - df_forecast_cached['yhat']).abs().sum() == 0, \
         "Predictions of freshly trained and serialized model are not equal"
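For a clearer failure message, the same check could be expressed with pandas' testing helper; this is a sketch, not how the project itself asserts it:

import pandas.testing as pdt

# Element-wise, exact comparison with a readable diff on failure.
pdt.assert_series_equal(df_forecast_new['yhat'], df_forecast_cached['yhat'],
                        check_exact=True)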
Example #5
def train(gas_station_id=DEFAULT_GAS_STATION_ID,
          up_to_days=None,
          up_to_timestamp=None,
          cache=True):
    """
    Train Prophet on the prices of the given gas station up to a specified amount of days
    :param gas_station_id: Internal identifier of the gas station
    :param up_to_days: Last days that should be excluded from training
    :param up_to_timestamp: Data will be excluded that is older than this timestamp
    :param cache: Whether to persist the model
    :return: fitted model, DataFrame the model was not fitted to according to up_to_days and up_to_timestamp
    """
    gas_station_path = os.path.join(GAS_PRICE_PATH,
                                    "{}.csv".format(gas_station_id))
    # On the CI server, read the gas station data from the fixed test data path to save bandwidth
    if os.environ.get('CI', False):
        gas_station_path = os.path.join(TESTS_DATA_PATH,
                                        "{}.csv".format(gas_station_id))
    gas_stations_df = pd.read_csv(GAS_STATIONS_PATH, sep=',')
    gas_station_state = gas_stations_df[gas_stations_df["id"] ==
                                        gas_station_id]["State"].iloc[0]

    df_gas_station = pd.read_csv(gas_station_path,
                                 names=['Timestamp', 'Price'],
                                 sep=';')
    df_holidays = get_holidays_df_from_state(gas_station_state)
    df_vacations = get_vacations_df_from_state(gas_station_state)

    holidays_df = pd.concat((df_holidays, df_vacations))
    m = Prophet(holidays=holidays_df)
    # Prophet expects a 'ds' (timestamp) column and a 'y' (target) column.
    df_fb = df_gas_station.copy()
    df_fb['y'] = df_fb['Price']
    df_fb['ds'] = df_fb['Timestamp'].apply(
        lambda x: get_datetime_from_string(str(x)))
    df_fb.drop(['Timestamp', 'Price'], inplace=True, axis=1)
    # Split off the most recent data as a held-out "future" frame.
    if up_to_days is not None and up_to_days > 0:
        start_future = df_fb.iloc[-1, :]['ds'] - datetime.timedelta(
            days=up_to_days)
        df_past = df_fb[df_fb['ds'] < start_future]
        df_future = df_fb[df_fb['ds'] >= start_future]
    elif up_to_timestamp is not None:
        df_past = df_fb[df_fb['ds'] < up_to_timestamp]
        df_future = df_fb[df_fb['ds'] >= up_to_timestamp]
    else:
        df_past = df_fb
        df_future = pd.DataFrame(columns=['y'])
    m.fit(df_past)
    if cache:
        pickle.dump(m,
                    open(MODEL_PATH.format(gas_station_id), "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
    return m, df_future
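A hedged usage sketch of train() together with the fitted Prophet model; the module name and station id are placeholders. Prophet's predict() takes a DataFrame with a 'ds' column, which the held-out df_future already provides:

# Hypothetical usage; module name and station id are placeholders.
from gas_price_prediction import train

m, df_future = train(gas_station_id=3, up_to_days=30, cache=False)
forecast = m.predict(df_future[['ds']])   # forecast for the held-out 30 days
print(forecast[['ds', 'yhat']].head())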
Example #6
    # Keep rows with an observed delay or the encoded status "c"; the filtered
    # frames are needed again below to build the test labels.
    ar_test = test.loc[~test["ar_delay"].isna() |
                       (test["ar_cs"] == status_encoder["ar"]["c"])]
    dp_test = test.loc[~test["dp_delay"].isna() |
                       (test["dp_cs"] == status_encoder["dp"]["c"])]
    del test

    # Drop the label columns from the feature matrices.
    ar_test_x = ar_test.drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"])
    dp_test_x = dp_test.drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"])

    for model_number in CLASSES_TO_COMPUTE:
        model_name = f"ar_{model_number}"
        print("test_results for model {}".format(model_name))
        # Arrival model: positive if delay <= model_number minutes and status is not "c".
        test_y = (ar_test["ar_delay"] <= model_number) & (
            ar_test["ar_cs"] != status_encoder["ar"]["c"])
        model = pickle.load(open(MODEL_PATH.format(model_name), "rb"))
        test_model(model, ar_test_x, test_y, model_name)

        dp_number = model_number + 1
        model_name = f"dp_{dp_number}"
        print("test_results for model {}".format(model_name))
        # Departure model: positive if delay >= dp_number minutes and status is not "c".
        test_y = (dp_test["dp_delay"] >= dp_number) & (
            dp_test["dp_cs"] != status_encoder["dp"]["c"])
        model = pickle.load(open(MODEL_PATH.format(model_name), "rb"))
        test_model(model, dp_test_x, test_y, model_name)
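The snippet calls a test_model helper that is not shown. A minimal sketch of what such a helper could look like, assuming scikit-learn-style classifiers; the original project's version may compute and store different metrics:

from sklearn.metrics import accuracy_score, roc_auc_score

def test_model(model, test_x, test_y, model_name):
    # Evaluate one per-class binary model on the held-out test split.
    predictions = model.predict(test_x)
    probabilities = model.predict_proba(test_x)[:, 1]
    print(f"{model_name}: accuracy={accuracy_score(test_y, predictions):.3f}, "
          f"roc_auc={roc_auc_score(test_y, probabilities):.3f}")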