def train_models(**load_parameters):
    rtd_ray = RtdRay()
    train = rtd_ray.load_for_ml_model(**load_parameters).compute()

    # Encoders mapping the raw arrival/departure status codes to their
    # encoded values ("c" marks a cancelled stop).
    status_encoder = {}
    status_encoder["ar"] = pickle.load(
        open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
    status_encoder["dp"] = pickle.load(
        open(ENCODER_PATH.format(encoder="dp_cs"), "rb"))

    # Keep only rows with a known delay or a cancellation.
    ar_train = train.loc[~train["ar_delay"].isna()
                         | (train["ar_cs"] == status_encoder["ar"]["c"])]
    dp_train = train.loc[~train["dp_delay"].isna()
                         | (train["dp_cs"] == status_encoder["dp"]["c"])]
    del train

    # One binary label per delay class: arrival models learn "at most `label`
    # minutes late", departure models learn "at least `label` + 1 minutes
    # late"; cancelled stops never count as positives.
    ar_labels = {}
    dp_labels = {}
    for label in CLASSES_TO_COMPUTE:
        ar_labels[label] = (ar_train["ar_delay"] <= label) & (
            ar_train["ar_cs"] != status_encoder["ar"]["c"])
        dp_labels[label + 1] = (dp_train["dp_delay"] >= (label + 1)) & (
            dp_train["dp_cs"] != status_encoder["dp"]["c"])

    # Drop the label columns from the feature sets.
    for column in ["ar_delay", "dp_delay", "ar_cs", "dp_cs"]:
        del ar_train[column]
        del dp_train[column]

    os.makedirs("cache/models", exist_ok=True)

    parameters = pickle.load(open(CACHE_PATH + "/hyperparameters.pkl", "rb"))

    for label in CLASSES_TO_COMPUTE:
        model_name = f"ar_{label}"
        print("training", model_name)
        pickle.dump(
            train_model(ar_train, ar_labels[label], **parameters[label]),
            open(MODEL_PATH.format(model_name), "wb"),
        )

        dp_label = label + 1
        model_name = f"dp_{dp_label}"
        print("training", model_name)
        pickle.dump(
            train_model(dp_train, dp_labels[dp_label], **parameters[label]),
            open(MODEL_PATH.format(model_name), "wb"),
        )
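# `train_model` is called above but not defined in this excerpt. A minimal
# sketch of what it might look like, assuming an XGBoost binary classifier;
# the estimator choice and signature are assumptions, not the project's
# confirmed implementation:
from xgboost import XGBClassifier

def train_model(train_x, train_y, **model_parameters):
    # Fit one binary classifier for a single delay class.
    model = XGBClassifier(**model_parameters)
    model.fit(train_x, train_y)
    return model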
def __init__(self):
    # Encoders for the categorical features used by the models.
    self.cat_encoders = {}
    for cat in ["o", "c", "n", "station", "pp"]:
        self.cat_encoders[cat] = pickle.load(
            open(ENCODER_PATH.format(encoder=cat), "rb")
        )

    # Load one arrival and one departure model per delay class; the file
    # names must match those written by train_models.
    self.ar_models = []
    self.dp_models = []
    for model in range(40):
        self.ar_models.append(
            pickle.load(open(MODEL_PATH.format("ar_" + str(model)), "rb")))
        self.dp_models.append(
            pickle.load(open(MODEL_PATH.format("dp_" + str(model)), "rb")))
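# A hypothetical sketch of how the per-class models loaded above could be
# turned into per-class probabilities; the helper name and the sklearn-style
# predict_proba interface are assumptions:
import numpy as np

def predict_classes(models, features):
    # Each model answers one binary question per delay class; stacking the
    # positive-class probabilities yields one column per class.
    return np.stack(
        [model.predict_proba(features)[:, 1] for model in models], axis=1)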
def train_and_predict(gas_station_id=DEFAULT_GAS_STATION_ID, start_time=None,
                      end_time=None, up_to_days=DEFAULT_UP_TO_DAYS, plot=False,
                      use_cached=False, cache=False):
    """
    Train the model for gas_station_id and return the prediction for the next up_to_days days

    :param gas_station_id: Internal identifier of the gas station
    :param up_to_days: Number of days that should be predicted; ignored if start_time and end_time are not None
    :param start_time: Timestamp of the beginning of the forecast
    :param end_time: Timestamp of the end of the forecast
    :param plot: Whether to plot the forecast
    :param use_cached: Whether to load the serialized model if it exists
    :param cache: Whether to persist the model that was fitted on the possibly partial dataset
    :return: fitted model, DataFrame containing the true future prices,
             DataFrame containing the predicted prices
    """
    model_loaded = False
    if use_cached:
        model_path = MODEL_PATH.format(gas_station_id)
        try:
            if not os.path.isfile(model_path):
                raise ValueError("No model was found at {}".format(model_path))
            model = pickle.load(open(model_path, "rb"))
            df_future = None
            model_loaded = True
        except Exception as e:
            print(e)

    if not model_loaded:
        model, df_future = train(gas_station_id=gas_station_id,
                                 up_to_days=up_to_days, cache=cache)

    df_forecast = predict(model, start_time=start_time, end_time=end_time,
                          up_to_days=up_to_days, plot=plot)
    return model, df_future, df_forecast
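# Illustrative usage of train_and_predict; the horizon is a placeholder value:
model, df_future, df_forecast = train_and_predict(
    gas_station_id=DEFAULT_GAS_STATION_ID, up_to_days=7,
    use_cached=True, cache=True)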
def test_model_caching(self):
    model_path = MODEL_PATH.format(GAS_STATION_ID)
    if os.path.isfile(model_path):
        os.remove(model_path)

    # The first call finds no cached model, trains one and persists it
    # (cache=True); the second call must load that serialized model from disk.
    model_new, df_future_new, df_forecast_new = train_and_predict(
        gas_station_id=GAS_STATION_ID, use_cached=True, cache=True,
        up_to_days=31)
    model_cached, df_future_cached, df_forecast_cached = train_and_predict(
        gas_station_id=GAS_STATION_ID, use_cached=True, up_to_days=31)

    assert sum(
        df_forecast_new["yhat"] - df_forecast_cached["yhat"]
    ) == 0, "Predictions of freshly trained and serialized model are not equal"
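# A more robust equality check could use pandas' testing helper instead of
# summing the differences (a suggestion, not part of the original test):
import pandas.testing as pdt

pdt.assert_series_equal(df_forecast_new["yhat"], df_forecast_cached["yhat"])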
def train(gas_station_id=DEFAULT_GAS_STATION_ID, up_to_days=None,
          up_to_timestamp=None, cache=True):
    """
    Train Prophet on the prices of the given gas station up to a specified point in time

    :param gas_station_id: Internal identifier of the gas station
    :param up_to_days: Number of trailing days to exclude from training
    :param up_to_timestamp: Data at or after this timestamp will be excluded from training
    :param cache: Whether to persist the model
    :return: fitted model, DataFrame of the rows the model was not fitted to
             according to up_to_days or up_to_timestamp
    """
    gas_station_path = os.path.join(GAS_PRICE_PATH, "{}.csv".format(gas_station_id))
    # If we're on the CI server, replace the path to the gas station data
    # with a fixed test file to save bandwidth.
    if os.environ.get("CI", False):
        gas_station_path = os.path.join(TESTS_DATA_PATH, "{}.csv".format(gas_station_id))

    gas_stations_df = pd.read_csv(GAS_STATIONS_PATH, sep=",")
    gas_station_state = gas_stations_df[
        gas_stations_df["id"] == gas_station_id]["State"].iloc[0]
    df_gas_station = pd.read_csv(gas_station_path, names=["Timestamp", "Price"], sep=";")

    # Feed public holidays and school vacations of the station's state to Prophet.
    df_holidays = get_holidays_df_from_state(gas_station_state)
    df_vacations = get_vacations_df_from_state(gas_station_state)
    holidays_df = pd.concat((df_holidays, df_vacations))
    m = Prophet(holidays=holidays_df)

    # Prophet expects the time series as columns 'ds' (timestamp) and 'y' (value).
    df_fb = df_gas_station.copy()
    df_fb["y"] = df_fb["Price"]
    df_fb["ds"] = df_fb["Timestamp"].apply(
        lambda x: get_datetime_from_string(str(x)))
    df_fb.drop(["Timestamp", "Price"], inplace=True, axis=1)

    # Split off the hold-out period that the model must not see.
    if up_to_days is not None and up_to_days > 0:
        start_future = df_fb.iloc[-1, :]["ds"] - datetime.timedelta(days=up_to_days)
        df_past = df_fb[df_fb["ds"] < start_future]
        df_future = df_fb[df_fb["ds"] >= start_future]
    elif up_to_timestamp is not None:
        df_past = df_fb[df_fb["ds"] < up_to_timestamp]
        df_future = df_fb[df_fb["ds"] >= up_to_timestamp]
    else:
        df_past = df_fb
        df_future = pd.DataFrame(columns=["y"])

    m.fit(df_past)
    if cache:
        pickle.dump(m, open(MODEL_PATH.format(gas_station_id), "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
    return m, df_future
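# The `predict` function used by train_and_predict is not shown in this
# excerpt. A minimal sketch assuming the standard Prophet API; the hourly
# frequency and the start/end filtering are assumptions:
def predict(model, start_time=None, end_time=None, up_to_days=None, plot=False):
    # Extend the history by the requested horizon and forecast it.
    future = model.make_future_dataframe(periods=(up_to_days or 1) * 24, freq="H")
    if start_time is not None and end_time is not None:
        future = future[(future["ds"] >= start_time) & (future["ds"] <= end_time)]
    forecast = model.predict(future)
    if plot:
        model.plot(forecast)
    return forecast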
# Build the test sets: keep rows that either have a known delay or were
# cancelled, and keep the unfiltered frames around for computing the labels.
ar_test = test.loc[~test["ar_delay"].isna()
                   | (test["ar_cs"] == status_encoder["ar"]["c"])]
dp_test = test.loc[~test["dp_delay"].isna()
                   | (test["dp_cs"] == status_encoder["dp"]["c"])]
del test
ar_test_x = ar_test.drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"])
dp_test_x = dp_test.drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"])

for model_number in CLASSES_TO_COMPUTE:
    model_name = f"ar_{model_number}"
    print("test_results for model {}".format(model_name))
    test_y = (ar_test["ar_delay"] <= model_number) & (
        ar_test["ar_cs"] != status_encoder["ar"]["c"])
    model = pickle.load(open(MODEL_PATH.format(model_name), "rb"))
    test_model(model, ar_test_x, test_y, model_name)

    dp_number = model_number + 1
    model_name = f"dp_{dp_number}"
    print("test_results for model {}".format(model_name))
    test_y = (dp_test["dp_delay"] >= dp_number) & (
        dp_test["dp_cs"] != status_encoder["dp"]["c"])
    model = pickle.load(open(MODEL_PATH.format(model_name), "rb"))
    test_model(model, dp_test_x, test_y, model_name)
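# `test_model` is not defined in this excerpt. A minimal sketch of such an
# evaluation helper, assuming sklearn-style classifiers; the metric choice is
# an assumption:
from sklearn.metrics import accuracy_score, roc_auc_score

def test_model(model, test_x, test_y, model_name):
    # Score the positive class and report threshold-based and ranking metrics.
    pred_proba = model.predict_proba(test_x)[:, 1]
    print(model_name, "accuracy:", accuracy_score(test_y, pred_proba > 0.5))
    print(model_name, "ROC AUC:", roc_auc_score(test_y, pred_proba))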