def forecasting_errors(ts, ts_idx):
    """Train a Lasso forecasting pipeline on *ts* and report its errors.

    :param ts: input time series structure (TsStruct-like)
    :param ts_idx: indices of the series used both as inputs and targets
    :return: tuple (train_mae, train_mape, test_mae, test_mape), each a
        per-series error vector as returned by RegMatrix.mae / RegMatrix.mape
    """
    # Create the regression matrix from the raw time series.
    data = regression_matrix.RegMatrix(ts, y_idx=ts_idx, x_idx=ts_idx)
    data.create_matrix(nsteps=1, norm_flag=True)

    frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.001)
    # Alternative forecasting models, kept for reference:
    # frc_model = frc_class.CustomModel(LSTM.LSTM, name="LSTM")
    # frc_model = frc_class.CustomModel(GatingEnsemble.GatingEnsemble,
    #                                   estimators=[LinearRegression() for i in range(N_EXPERTS)])

    # Split data for training and testing.
    data.train_test_split(TRAIN_TEST_RATIO)

    model = frc_class.PipelineModel(gen_mdl=None, sel_mdl=None, frc_mdl=frc_model)
    model, _, _, _ = model.train_model(data.trainX, data.trainY)  # model parameters are changed inside

    # Forecast all rows and overwrite the stored forecasts.
    data.forecast(model, replace=True)

    train_mae = data.mae(idx_rows=data.idx_train, idx_original=data.original_index)
    train_mape = data.mape(idx_rows=data.idx_train, idx_original=data.original_index)
    test_mae = data.mae(idx_rows=data.idx_test, idx_original=data.original_index)
    test_mape = data.mape(idx_rows=data.idx_test, idx_original=data.original_index)

    return train_mae, train_mape, test_mae, test_mape


# Backward-compatible alias: the function was originally (mis)spelled
# "forecating_errors"; keep the old name working for any existing callers.
forecating_errors = forecasting_errors
def competition_errors(model, names, y_idx=None):
    """ Returns MAPE, averaged over a set of multivariate time series, specified by names

    :param model: trained forecasting model
    :type model: PipelineModel
    :param names: (parts of) names of time series in the set
    :type names: list
    :param y_idx: indices of target series, forwarded to RegMatrix
    :type y_idx:
    :return: mean and standard deviation of per-series MAPE
    :rtype:
    """
    if isinstance(model, str):
        # A file name was passed instead of a trained model; load it from disk.
        model = frc_class.PipelineModel().load_model(model)

    def _series_mape(name_pattern):
        # Forecast the single series matched by name_pattern; return its MAPE.
        series = load_time_series.load_all_time_series(
            datasets=['EnergyWeather'], load_raw=False,
            name_pattern=name_pattern, verbose=False)[0]
        matrix = regression_matrix.RegMatrix(series, y_idx=y_idx)
        matrix.create_matrix()
        matrix.forecast(model)
        return matrix.mape()

    errors = [_series_mape(name) for name in names]
    return np.mean(errors), np.std(errors)
def test_identity(self):
    """The identity forecaster must reproduce its inputs exactly."""
    series = random_data.create_random_ts(n_ts=3, n_req=1, n_hist=2,
                                          max_length=200, min_length=200)
    mat = regression_matrix.RegMatrix(series)
    mat.create_matrix()
    mat.X = mat.Y  # for identity frc: features coincide with targets
    mat.train_test_split(0.25)

    pipeline = frc_class.PipelineModel(frc_mdl=frc_class.IdentityFrc())
    pipeline.train_model(mat.trainX, mat.trainY)  # model parameters are changed inside

    # First check that identity frc works on the training rows ...
    forecast, _ = mat.forecast(pipeline, idx_rows=mat.idx_train, replace=True)
    self.assertTrue((forecast == mat.trainY).all())

    # ... then once again on the test rows.
    forecast, _ = mat.forecast(pipeline, idx_rows=mat.idx_test, replace=True)
    self.assertTrue((forecast == mat.testY).all())

    # Finally, the resulting error metrics must be (numerically) zero.
    self.assertTrue((mat.mae() < TOL).all())
    self.assertTrue((mat.mape() < TOL).all())
def test_y_slicing_args(self):
    """ Check that individual forecasts are the same if sliced in init or at create_matrix """
    series = random_data.create_random_ts(n_ts=3, n_req=2, n_hist=13,
                                          max_length=500, min_length=200)

    def _fit_and_forecast(mat):
        # Split, fit a fresh martingale pipeline on mat and return its forecasts.
        mat.train_test_split(0.25)
        pipeline = frc_class.PipelineModel(frc_mdl=frc_class.MartingalFrc())
        pipeline.train_model(mat.trainX, mat.trainY)  # model parameters are changed inside
        forecasts, _ = mat.forecast(pipeline)
        return forecasts

    # Variant 1: include all ts explicitly at construction time.
    mat1 = regression_matrix.RegMatrix(series, y_idx=range(len(series.data)))
    mat1.create_matrix()
    frc1 = _fit_and_forecast(mat1)

    # Variant 2: let the model define y_idx.
    mat2 = regression_matrix.RegMatrix(series)
    mat2.create_matrix()
    frc2 = _fit_and_forecast(mat2)

    # Variant 3: include all ts explicitly at matrix-creation time.
    mat3 = regression_matrix.RegMatrix(series)
    mat3.create_matrix(y_idx=range(len(series.data)))
    frc3 = _fit_and_forecast(mat3)

    # All three slicing styles must yield identical forecasts.
    self.assertTrue((frc1 == frc2).all())
    self.assertTrue((frc3 == frc2).all())
    self.assertTrue((frc1 == frc3).all())
def test_frc_by_one_2(self):
    """ Check that individual forecasts do not depend on the rest of the matrix """
    input_ts = random_data.create_random_ts(n_ts=3, n_req=11, n_hist=23,
                                            max_length=200, min_length=200)
    # create the data object for all ts
    data = regression_matrix.RegMatrix(input_ts, y_idx=range(len(input_ts.data)))
    # then construct the matrix just for one ts:
    data.create_matrix(y_idx=0, x_idx=0)
    data.train_test_split(0.25)
    model = frc_class.PipelineModel(frc_mdl=frc_class.MartingalFrc())
    model.train_model(data.trainX, data.trainY)  # model parameters are changed inside
    frc0, idx_frc = data.forecast(model)

    # Remember the first ts:
    ts0 = input_ts.data[0]
    # BUG FIX: the original used `xrange`, which does not exist in Python 3
    # (the rest of this file uses Python-3-only syntax such as print(..., end="")).
    for i in range(5):
        # generate new data
        input_ts = random_data.create_random_ts(n_ts=3, n_req=11, n_hist=23,
                                                max_length=200, min_length=200)
        # keep the first ts the same, replace the others with fresh random ones
        new_ts = [ts0]
        new_ts.extend(input_ts.data[1:])
        input_ts.data = new_ts
        data = regression_matrix.RegMatrix(input_ts)
        data.create_matrix(y_idx=0, x_idx=0)
        data.train_test_split(0.25)
        model = frc_class.PipelineModel(frc_mdl=frc_class.MartingalFrc())
        model.train_model(data.trainX, data.trainY)  # model parameters are changed inside
        frc, idx_frc = data.forecast(model)
        # Forecast for ts0 must be unchanged regardless of the other series.
        self.assertTrue((frc0 == frc).all())
def test_frc_by_one(self):
    """Check that individual forecasts are the same as frc for a set of one ts"""
    all_series = random_data.create_random_ts(n_ts=5, n_req=11, n_hist=23,
                                              max_length=500, min_length=200)

    def _fit_and_forecast(mat):
        # Build the matrix, fit a martingale pipeline, return (forecast, Y).
        mat.create_matrix()
        mat.train_test_split(0.25)
        pipeline = frc_class.PipelineModel(frc_mdl=frc_class.MartingalFrc())
        pipeline.train_model(mat.trainX, mat.trainY)  # model parameters are changed inside
        forecast, _ = mat.forecast(pipeline)
        return forecast, mat.Y

    for i_ts, _ in enumerate(all_series.data):
        # Forecast series i_ts while it is part of the full set ...
        frc_joint, y_joint = _fit_and_forecast(
            regression_matrix.RegMatrix(all_series, y_idx=i_ts))

        # ... and as a stand-alone, single-series set.
        single = copy.deepcopy(all_series)
        single.data = all_series.data[i_ts:i_ts + 1]
        frc_single, y_single = _fit_and_forecast(
            regression_matrix.RegMatrix(single))

        self.assertTrue((frc_joint == frc_single).all())
        self.assertTrue((y_joint == y_single).all())
def main(file_name=None, line_indices="all", header=True, format_="date"):
    """
    Runs forecasting models and reports results in latex file

    :param file_name: file name (.csv) with data in IoT format
    :type file_name: str
    :param line_indices: indices of lines to read from file. Lines are
        enumerated from 1. If "all", read the whole file
    :param header: Specifies if the file contains a header row
    :type header: bool
    :param format_: "date" puts results into a per-day folder, anything else
        into a per-second folder
    :return: latex report
    :rtype: str
    """
    # Init string for latex results:
    latex_str = ""
    time_at_start = time.time()

    # Results folder is named by date (or full timestamp for format_ != "date").
    if format_ == "date":
        folder = os.path.join(
            "results",
            datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d'))
    else:
        folder = os.path.join(
            "results",
            datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d-%H-%M-%S'))
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Load data in IoT format
    try:
        data, metric_ids, host_ids, header_names = get_iot_data.get_data(
            file_name, line_indices, header)
    except BaseException as e:
        # BUG FIX: builtin exceptions have no `.message` attribute in Python 3;
        # format the exception object itself instead.
        print("{}. Line indices: {}. Filename {}".format(e, line_indices, file_name))
        return None

    # Select only data from first dataset in host_ids:
    dataset = list(host_ids.keys())[0]  # select the first dataset # FIXIT
    ts = load_time_series.from_iot_to_struct(
        data, host_ids[dataset], dataset)  # get all time series from dataset in TsStruct format
    ts.replace_nans()
    ts.align_time_series(max_history=50000)  # truncate time series to align starting and ending points
    latex_str += ts.summarize_ts(latex=True)

    # split time series into train and validation
    train, test = ts.train_test_split(train_test_ratio=0.75)  # split raw time series into train and test parts

    # Plot periodics:
    for i, tsi in enumerate(ts.data):
        save_to = os.path.join(folder, "decompose", "_".join(tsi.name.split(" ")))
        # infer periodicity and try to decompose ts into tend, seasonality and resid:
        try:
            period, msg = arima_model.decompose(tsi, nhist=500, folder=save_to, nsplits=50)
        except Exception as e:
            # BUG FIX: `e.message` -> `e` (see above).
            msg = "Failed to decompose, error message: \n{}".format(e)
        latex_str += my_plots.check_text_for_latex(tsi.name) + ": "
        latex_str += msg
        latex_str += arima_model.make_report(
            os.path.join(save_to), write=False)  # adds figures from "save_to" to latex_str

    # Declare models to compare:
    random_forest = frc_class.CustomModel(RandomForestRegressor, n_jobs=24, name="RandomForest")
    # mixture_experts = frc_class.CustomModel(GatingEnsemble.GatingEnsemble, name="Mixture",
    #                                         estimators=[RidgeCV(), LassoCV()])
    # lstm = frc_class.CustomModel(LSTM.LSTM, name="LSTM", n_epochs=50, plot_loss=True)
    lasso = frc_class.CustomModel(Lasso, name="Lasso", fit_intercept=True, alpha=2.0)
    lasso_model = frc_class.PipelineModel(frc_mdl=lasso)
    model_list = [lasso_model]  # random_forest, mixture_experts, lstm

    # Hyper-parameter grids, keyed by model name.
    params_range = {}
    params_range["RandomForest"] = {"n_estimators": [3000]}
    params_range["Mixture"] = {"n_hidden_units": [10, 20, 30, 50, 100]}
    params_range["LSTM"] = {"batch_size": [20, 30, 50, 100]}
    params_range["Lasso"] = {
        "alpha": [float(i) / 10000 for i in range(1, 11, 1)] + [0.01, 0.05]
    }  # [20, 30, 50, 100]} #[1.0, 1.25, 1.5, 1.75, 2.0]

    WINDOWS = [2, 5, 7, 10, 15, 20]
    N_FOLDS = 2
    for model in model_list:
        model_save_path = os.path.join(folder, model.name)
        if not os.path.exists(model_save_path):
            os.makedirs(model_save_path)

        # select number of trees and history parameter:
        # (history parameter is divisible by request)
        n_req, params, best_train_mse, plt = train_model_CV(
            train, model, n_fold=N_FOLDS, windows=WINDOWS,
            params=params_range[model.named_steps['frc'].name],
            plot=True)  # windows=[5, 10, 25, 50, 75, 100, 150])
        plt.savefig(os.path.join(model_save_path, "cv_optimization.png"))
        plt.clf()
        # n_req, nr_tree, best_train_mse = 10, 500, 0.00658112163657  # previously estimated

        opt_string = model.name + ". Best CV error: {0}, estimated parameters: history = {1}, {2} = {3} " \
                                  "\\\\ \n".format(best_train_mse, n_req,
                                                   my_plots.check_text_for_latex(list(params.keys())[0]),
                                                   list(params.values())[0])
        print(opt_string)
        latex_str += opt_string

        # use selected parameters to forecast trainning data:
        if not len(params) == 0:
            model.__setattr__(list(params.keys())[0], list(params.values())[0])
        data = regression_matrix.RegMatrix(ts)
        data.history = n_req * data.request
        data.create_matrix()
        data.train_test_split()
        model, frc, _, _ = model.train_model(data.trainX, data.trainY)
        if hasattr(frc, "msg"):
            latex_str += frc.msg
        if hasattr(frc, "fig"):
            frc.fig.savefig(os.path.join(model_save_path, "fitting.png"))

        train_frc, _ = data.forecast(model, idx_rows=data.idx_train)
        train_mse = mean_squared_error(train_frc, data.trainY)
        test_frc, _ = data.forecast(model, idx_rows=data.idx_test)
        test_mse = mean_squared_error(test_frc, data.testY)

        latex_str += my_plots.check_text_for_latex(model.name) + "\\\\ \n"
        latex_str += "Train error for estimated parameters: {0}, " \
                     "test error with estimated parameters {1} \\\\ \n".format(train_mse, test_mse)

        err_all = forecasting_errors(data, ts.original_index)
        column_names = [("MAE", "train"), ("MAPE", "train"),
                        ("MAE", "test"), ("MAPE", "test")]
        res_all = data_frame_res(err_all, column_names, ts)

        print(model.name)
        print(res_all)

        latex_str += res_all.to_latex()
        latex_str += "\\bigskip \n \\\\"

        data.plot_frc(n_frc=10, n_hist=10, folder=model_save_path)
        latex_str += my_plots.include_figures_from_folder(model_save_path)

    total_time = time.time() - time_at_start
    latex_str += "\n Total time: {0}\n \\".format(total_time)
    my_plots.print_to_latex(latex_str, check=False, file_name="IoT_example", folder=folder)

    return latex_str
def train_model_CV(data, model, n_fold=5, windows=(5, 10, 25, 50, 75, 100, 150),
                   params=None, f_horizon=1, plot=False):
    """Select the history window and one hyper-parameter by k-fold CV.

    :param data: raw time series structure; `data.history` is overwritten per window
    :param model: pipeline model with a 'frc' step whose hyper-parameter is tuned
    :param n_fold: number of cross-validation folds
    :param windows: candidate window sizes (history = window * data.request)
    :param params: dict with a single entry {par_name: list_of_values}; None/empty
        means no hyper-parameter search
    :param f_horizon: forecasting horizon passed to create_matrix
    :param plot: if True, also return a heatmap figure of the CV scores
    :return: (window_size, best_par_dict, mse) or, with plot=True,
        (window_size, best_par_dict, mse, plt)
    """
    # BUG FIX: the original used a mutable default argument (params={}).
    if params is None:
        params = {}
    if len(params) == 0:
        par_name, params_range = None, []
    else:
        # Single tunable parameter: its name and candidate values.
        # (The original re-read params[par_name] redundantly after unpacking.)
        par_name, params_range = list(params.items())[0]

    scores = np.zeros((len(windows), len(params_range), n_fold))
    for w_ind in range(0, len(windows)):
        # obtain the matrix from the time series data with a given window-size
        data.history = windows[w_ind] * data.request
        mat = regression_matrix.RegMatrix(data)
        mat.create_matrix(f_horizon)
        w_train = mat.X
        y_wtrain = mat.Y

        # cross-validation
        kf = KFold(n_splits=n_fold)
        kf.get_n_splits(w_train)
        for par_ind in range(0, len(params_range)):
            model.named_steps['frc'].__setattr__(par_name, params_range[par_ind])
            n = 0
            for train_index, val_index in kf.split(w_train):
                print("\rWindow size: {0}, {1} = {2}, kfold = {3}".format(
                    windows[w_ind], par_name, params_range[par_ind], n), end="")
                sys.stdout.flush()

                # getting training and validation data
                X_train, X_val = w_train[train_index, :], w_train[val_index, :]
                y_train, y_val = y_wtrain[train_index], y_wtrain[val_index]

                # train the model and predict the MSE
                try:
                    model.train_model(X_train, y_train)
                    pred_val = model.predict(X_val)
                    scores[w_ind, par_ind, n] = mean_squared_error(pred_val, y_val)
                except BaseException as e:
                    # Best-effort: on failure, reuse the previous fold's score
                    # (or 0 for the first fold) so the grid search can continue.
                    print(e)
                    if n > 0:
                        scores[w_ind, par_ind, n] = scores[w_ind, par_ind, n - 1]
                    else:
                        scores[w_ind, par_ind, n] = 0
                n += 1

    # Average over folds, then pick the (window, parameter) cell with lowest MSE.
    m_scores = np.average(scores, axis=2)
    mse = m_scores.min()
    b_w_ind, b_tree_ind = np.where(m_scores == mse)
    b_w_ind, b_tree_ind = b_w_ind[0], b_tree_ind[0]
    window_size, best_par = windows[b_w_ind], params_range[b_tree_ind]
    best_par = {par_name: best_par}

    if not plot:
        return window_size, best_par, mse

    plt = my_plots.imagesc(m_scores, xlabel=par_name, ylabel="n_req",
                           yticks=windows, xticks=params_range)
    return window_size, best_par, mse, plt
def main(frc_model=None, generator=None, selector=None):
    """Train a forecasting pipeline on the EnergyWeather datasets and report errors.

    :param frc_model: forecasting model (CustomModel); defaults to Lasso(alpha=0.01)
    :param generator: optional feature generation model
    :param selector: optional feature selection model
    :return: list of per-dataset pandas DataFrames with MAE/MAPE errors
    """
    # Experiment settings.
    TRAIN_TEST_RATIO = 0.75
    N_PREDICTIONS = 10  # plotting par

    # Load and prepare dataset.
    load_raw = True  # not os.path.exists(os.path.join("ProcessedData", "EnergyWeather_orig_train.pkl"))
    ts_struct_list = load_time_series.load_all_time_series(
        datasets='EnergyWeather', load_raw=load_raw, name_pattern="")

    if frc_model is None:
        frc_model = frc_class.CustomModel(
            Lasso, name="Lasso", alpha=0.01
        )  # LSTM.LSTM() #frc_class.IdenitityFrc() #LinearRegression()

    # Create regression model
    model = frc_class.PipelineModel(gen_mdl=generator, sel_mdl=selector, frc_mdl=frc_model)

    results = []
    res_text = []
    for ts in ts_struct_list:
        data = regression_matrix.RegMatrix(ts)  # Create regression matrix
        data.create_matrix(nsteps=1, norm_flag=True)

        # Split data for training and testing
        data.train_test_split(TRAIN_TEST_RATIO)

        model, frc, gen, sel = model.train_model(data.trainX, data.trainY)
        # model, frc, gen, sel = data.train_model(frc_model=frc_model, generator=generator, selector=selector)
        # model parameters are changed inside

        # Forecast both test and train rows, overwriting stored forecasts.
        data.forecast(model, data.idx_test, replace=True)
        data.forecast(model, data.idx_train, replace=True)

        train_mae = data.mae(idx_rows=data.idx_train, idx_original=data.original_index)
        train_mape = data.mape(idx_rows=data.idx_train, idx_original=data.original_index)
        test_mae = data.mae(idx_rows=data.idx_test, idx_original=data.original_index)
        test_mape = data.mape(idx_rows=data.idx_test, idx_original=data.original_index)

        # One column per (metric, split) pair, indexed by series name.
        res1 = pd.DataFrame(train_mae,
                            index=[t.name for t in ts.data],
                            columns=[("MAE", "train")])
        res2 = pd.DataFrame(train_mape,
                            index=[t.name for t in ts.data],
                            columns=[("MAPE", "train")])
        res3 = pd.DataFrame(test_mae,
                            index=[t.name for t in ts.data],
                            columns=[("MAE", "test")])
        res4 = pd.DataFrame(test_mape,
                            index=[t.name for t in ts.data],
                            columns=[("MAPE", "test")])
        res = pd.concat([res1, res2, res3, res4], axis=1)

        print(res)
        results.append(res)
        res_text.append(ts.name)
        # NOTE(review): the source formatting is ambiguous here; plot_frc is
        # assumed to run once per dataset inside the loop (matching demo_train's
        # per-dataset plotting) — confirm against the original file.
        data.plot_frc(n_frc=N_PREDICTIONS)

    my_plots.save_to_latex(results, df_names=res_text)

    return results
def _error_frame(data, ts):
    """Assemble per-series train/test MAE and MAPE into a single DataFrame."""
    index = [t.name for t in ts.data]
    metrics = [
        (data.mae(idx_rows=data.idx_train), ("MAE", "train")),
        (data.mape(idx_rows=data.idx_train), ("MAPE", "train")),
        (data.mae(idx_rows=data.idx_test), ("MAE", "test")),
        (data.mape(idx_rows=data.idx_test), ("MAPE", "test")),
    ]
    frames = [pd.DataFrame(values, index=index, columns=[name])
              for values, name in metrics]
    return pd.concat(frames, axis=1)


def main(file_name, line_indices, header):
    """
    Forecast data simultaneously and separately and compare errors

    :param file_name: file name (.csv) with data in IoT format
    :type file_name: str
    :param line_indices: indices of lines to read from file. Lines are
        enumerated from 1. If "all", read the whole file
    :param header: Specifies if the file contains a header row
    :type header: bool
    :return: forecasting errors
    :rtype: pandas.DataFrame
    """
    TRAIN_TEST_RATIO = 0.75
    N_PREDICTIONS = 10
    VERBOSE = True
    # Alternative forecasting models, kept for reference:
    # frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.001)
    # frc_model = frc_class.CustomModel(GatingEnsemble.GatingEnsemble,
    #                                   estimators=[LinearRegression() for i in range(4)])

    ts = utils_.safe_read_iot_data(file_name=file_name, line_indices=line_indices,
                                   header=header, default="poisson", verbose=VERBOSE)
    if VERBOSE:
        print(ts.summarize_ts())
        # my_plots.plot_multiple_ts(ts.data, shared_x=True)

    data = regression_matrix.RegMatrix(ts)  # Create regression matrix
    data.create_matrix(nsteps=1, norm_flag=True)
    # Split data for training and testing
    data.train_test_split(TRAIN_TEST_RATIO)

    # LSTM with hyper-parameter grid search over learning rate and unit count.
    hyperpars = {"learning_rate": [2e-6, 2e-5, 2e-4],
                 "n_lstm_units": [20, 30, 40, 50]}
    frc_model = frc_class.CustomModel(LSTM.LSTM, name="LSTM", n_epochs=20, plot_loss=True)
    model = frc_class.PipelineModel(frc_mdl=frc_model)
    model, frc, _, _ = model.train_model(
        data.trainX, data.trainY, hyperpars=hyperpars, n_cvs=5)  # model parameters are changed inside
    if hasattr(frc, "fig"):
        frc.fig.savefig("fitting_learn_rate_{}.png".format(frc.learning_rate))

    # data.forecast returns model obj, forecasted rows of Y matrix and a list
    # [nts] of "flat"/ts indices of forecasted points
    data.forecast(model, replace=True)
    res = _error_frame(data, ts)
    print("LSTM")
    print(res)
    data.plot_frc(n_frc=N_PREDICTIONS)

    # Lasso baseline on the same regression matrix.
    frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.001)
    model = frc_class.PipelineModel(frc_mdl=frc_model)
    model, _, _, _ = model.train_model(data.trainX, data.trainY)
    data.forecast(model, replace=True)
    res = _error_frame(data, ts)
    print("Lasso")
    print(res)

    return res
def demo_train(ts_struct_list, frc_model=None, fg_mdl=None, fs_mdl=None,
               verbose=False, return_model=False, rewrite=True):
    """
    Train and save the model.

    :param ts_struct_list: list of namedtuples tsStruct
    :param frc_model: forecasting model, instance of CustomModel
    :param fg_mdl: feature generation model, instance of FeatureGeneration
    :param fs_mdl: feature selection model, instance of FeatureSelection
    :param verbose: controls the output
    :param return_model: if True, also return the trained model object
    :param rewrite: forwarded to my_plots.save_to_latex
    :return: saved model file name, or (model, file name) if return_model
    """
    # Check arguments: fill in default generation/selection/forecasting models.
    if fg_mdl is None:
        fg_mdl = frc_class.IdentityGenerator(name="Identity generator", on=False)

    if fs_mdl is None:
        fs_mdl = gnt_class.FeatureGeneration()  # IdentityModel(name="Identity selector")

    if frc_model is None:
        frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.01)

    model = frc_class.PipelineModel(gen_mdl=fg_mdl, sel_mdl=fs_mdl, frc_mdl=frc_model)
    results = []
    res_text = []

    for ts in ts_struct_list:
        data = regression_matrix.RegMatrix(ts, x_idx=TS_IDX, y_idx=TS_IDX)  # Create regression matrix
        data.create_matrix(nsteps=N_STEPS, norm_flag=True)  # this creates data.Y, data.X and some other fields

        # Split data for training and testing
        data.train_test_split(TRAIN_TEST_RATIO)

        # train the model. This returns trained pipeline and its steps
        model, frc, gen, sel = model.train_model(data.trainX, data.trainY)

        selection_res = "\n Feature selection results: problem status {}, selected {} from {} \\\\ \n".\
            format(sel.status, len(sel.selected), sel.n_vars)

        frcY, _ = data.forecast(model)  # returns forecasted matrix of the same shape as data.Y
        # frcY, idx_frc = data.forecast(model, idx_rows=data.idx_test) # this would return forecasts only for data.testY

        data.plot_frc(n_frc=5, n_hist=10, folder=SAVE_DIR)  # this saves figures into SAVE_DIR

        train_mae = data.mae(idx_rows=data.idx_train, idx_original=data.original_index)
        train_mape = data.mape(idx_rows=data.idx_train, idx_original=data.original_index)
        test_mae = data.mae(idx_rows=data.idx_test, idx_original=data.original_index)
        test_mape = data.mape(idx_rows=data.idx_test, idx_original=data.original_index)

        # One column per (metric, split) pair, indexed by series name.
        index = [ts.data[i].name for i in TS_IDX]
        res1 = pd.DataFrame(train_mae, index=index, columns=[("MAE", "train")])
        res2 = pd.DataFrame(train_mape, index=index, columns=[("MAPE", "train")])
        res3 = pd.DataFrame(test_mae, index=index, columns=[("MAE", "test")])
        res4 = pd.DataFrame(test_mape, index=index, columns=[("MAPE", "test")])
        res = pd.concat([res1, res2, res3, res4], axis=1)

        configuration_str = "\n Time series {} forecasted with {} + '{}' feature generation model and " \
                            "'{}' feature selection model \\\\ \n".format(ts.name, frc.name, gen.name, sel.name)
        if verbose:
            print(configuration_str)
            print(selection_res)
            print(res)

        results.append(res)
        res_text.append(configuration_str)
        res_text.append(selection_res)

    saved_mdl_fname = model.save_model(file_name=FNAME_PREFIX, folder=SAVE_DIR)  # saving in not an option yet
    # model = frc_class.PipelineModel().load_model(file_name=fname)

    # write results into a latex file
    my_plots.save_to_latex(results, df_names=res_text, folder=SAVE_DIR, rewrite=rewrite)
    print("Results saved to folder {}".format(SAVE_DIR))

    if return_model:
        return model, saved_mdl_fname

    return saved_mdl_fname