def feature_generation_demo():
    """
    Compare feature-generation strategies on the energy-weather data.

    For every transformation name, a fresh pipeline (Lasso forecaster plus the
    "Katrutsa" feature-selection model) is trained with demo_train, evaluated
    with competition_errors on the train/test file sets, and the resulting MAPE
    summary is printed and appended to the latex report in SAVE_DIR.
    """
    ts_list = load_energy_weather_data(load_raw=False, fnames=TRAIN_FILE_NAMES)
    frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.0001)
    selector = sel_class.FeatureSelection(name="Katrutsa")

    rewrite = True  # overwrite the latex output only for the first generator
    for fg_name in feature_gnt_names[:-2]:  # :["all"]
        generator = gnt_class.FeatureGeneration(
            name=fg_name, replace=False, transformations=[fg_name], norm=True)

        model, _ = demo_train(ts_list, frc_model=frc_model, fg_mdl=generator,
                              fs_mdl=selector, verbose=True, return_model=True,
                              rewrite=rewrite)
        rewrite = False  # every later result is appended, not overwritten

        train_error, train_std = competition_errors(
            model=model, names=TRAIN_FILE_NAMES, y_idx=TS_IDX)
        test_error, test_std = competition_errors(
            model=model, names=TEST_FILE_NAMES, y_idx=TS_IDX)

        res_text = "\n Average MAPE across time series: train = {} with std {}, test = {} with std {} \\\\ \n".\
            format(train_error, train_std, test_error, test_std)
        print(res_text)
        my_plots.save_to_latex(text=res_text, folder=SAVE_DIR, rewrite=rewrite)
def forecasting_errors(ts, ts_idx):
    """
    Fit a Lasso forecasting pipeline on *ts* and return its errors.

    :param ts: time-series structure to forecast
    :param ts_idx: indices of the series used both as inputs (x) and targets (y)
    :return: tuple (train_mae, train_mape, test_mae, test_mape), each indexed
        by the original time-series index
    """
    data = regression_matrix.RegMatrix(ts, y_idx=ts_idx, x_idx=ts_idx)  # Create regression matrix
    data.create_matrix(nsteps=1, norm_flag=True)

    frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.001)

    # Split data for training and testing
    data.train_test_split(TRAIN_TEST_RATIO)

    model = frc_class.PipelineModel(gen_mdl=None, sel_mdl=None, frc_mdl=frc_model)
    model, _, _, _ = model.train_model(
        data.trainX, data.trainY)  # model parameters are changed inside
    data.forecast(model, replace=True)

    train_mae = data.mae(idx_rows=data.idx_train, idx_original=data.original_index)
    train_mape = data.mape(idx_rows=data.idx_train, idx_original=data.original_index)
    test_mae = data.mae(idx_rows=data.idx_test, idx_original=data.original_index)
    test_mape = data.mape(idx_rows=data.idx_test, idx_original=data.original_index)
    return train_mae, train_mape, test_mae, test_mape


# Backward-compatible alias: the function was originally (mis)spelled
# "forecating_errors"; keep the old name working for any existing callers.
forecating_errors = forecasting_errors
def main(file_name=None, line_indices="all", header=True, format_="date"):
    """
    Runs forecasting models and reports results in latex file

    :param file_name: file name (.csv) with data in IoT format
    :type file_name: str
    :param line_indices: indices of lines to read from file. Lines are
        enumerated from 1. If "all", read the whole file
    :param header: Specifies if the file contains a header row
    :type header: bool
    :param format_: "date" names the results folder by date only; any other
        value appends the time as well
    :return: latex report
    :rtype: str
    """
    # Init string for latex results:
    latex_str = ""
    time_at_start = time.time()

    if format_ == "date":
        folder = os.path.join(
            "results",
            datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d'))
    else:
        folder = os.path.join(
            "results",
            datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d-%H-%M-%S'))
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Load data in IoT format
    try:
        data, metric_ids, host_ids, header_names = get_iot_data.get_data(
            file_name, line_indices, header)
    except BaseException as e:
        # NOTE: `e.message` does not exist on Python 3 exceptions; format the
        # exception object itself instead.
        print("{}. Line indices: {}. Filename {}".format(
            e, line_indices, file_name))
        return None

    # Select only data from first dataset in host_ids:
    dataset = list(host_ids.keys())[0]  # select the first dataset # FIXIT
    ts = load_time_series.from_iot_to_struct(
        data, host_ids[dataset], dataset)  # get all time series from dataset in TsStruct format
    ts.replace_nans()
    ts.align_time_series(
        max_history=50000)  # truncate time series to align starting and ending points
    latex_str += ts.summarize_ts(latex=True)

    # split time series into train and validation
    train, test = ts.train_test_split(
        train_test_ratio=0.75)  # split raw time series into train and test parts

    # Plot periodics:
    for tsi in ts.data:
        save_to = os.path.join(folder, "decompose", "_".join(tsi.name.split(" ")))
        # infer periodicity and try to decompose ts into tend, seasonality and resid:
        try:
            period, msg = arima_model.decompose(tsi, nhist=500, folder=save_to,
                                                nsplits=50)
        except Exception as e:
            # Same Python 3 fix as above: no `.message` attribute.
            msg = "Failed to decompose, error message: {}".format(e)
        latex_str += my_plots.check_text_for_latex(tsi.name) + ": "
        latex_str += msg
        latex_str += arima_model.make_report(
            save_to, write=False)  # adds figures from "save_to" to latex_str

    # Declare models to compare:
    random_forest = frc_class.CustomModel(RandomForestRegressor, n_jobs=24,
                                          name="RandomForest")
    lasso = frc_class.CustomModel(Lasso, name="Lasso", fit_intercept=True,
                                  alpha=2.0)
    lasso_model = frc_class.PipelineModel(frc_mdl=lasso)
    model_list = [lasso_model]  # random_forest, mixture_experts, lstm

    # Hyperparameter grids per model name.
    params_range = {
        "RandomForest": {"n_estimators": [3000]},
        "Mixture": {"n_hidden_units": [10, 20, 30, 50, 100]},
        "LSTM": {"batch_size": [20, 30, 50, 100]},
        "Lasso": {
            "alpha": [float(i) / 10000 for i in range(1, 11, 1)] + [0.01, 0.05]
        },
    }
    WINDOWS = [2, 5, 7, 10, 15, 20]
    N_FOLDS = 2

    for model in model_list:
        model_save_path = os.path.join(folder, model.name)
        if not os.path.exists(model_save_path):
            os.makedirs(model_save_path)

        # select number of trees and history parameter:
        # (history parameter is divisible by request)
        n_req, params, best_train_mse, plt = train_model_CV(
            train, model, n_fold=N_FOLDS, windows=WINDOWS,
            params=params_range[model.named_steps['frc'].name],
            plot=True)  # windows=[5, 10, 25, 50, 75, 100, 150]
        plt.savefig(os.path.join(model_save_path, "cv_optimization.png"))
        plt.clf()

        opt_string = model.name + ". Best CV error: {0}, estimated parameters: history = {1}, {2} = {3} " \
            "\\\\ \n".format(best_train_mse, n_req,
                             my_plots.check_text_for_latex(list(params.keys())[0]),
                             list(params.values())[0])
        print(opt_string)
        latex_str += opt_string

        # use selected parameters to forecast trainning data:
        if not len(params) == 0:
            model.__setattr__(list(params.keys())[0], list(params.values())[0])

        data = regression_matrix.RegMatrix(ts)
        data.history = n_req * data.request
        data.create_matrix()
        data.train_test_split()
        model, frc, _, _ = model.train_model(data.trainX, data.trainY)
        if hasattr(frc, "msg"):
            latex_str += frc.msg
        if hasattr(frc, "fig"):
            frc.fig.savefig(os.path.join(model_save_path, "fitting.png"))

        train_frc, _ = data.forecast(model, idx_rows=data.idx_train)
        train_mse = mean_squared_error(train_frc, data.trainY)
        test_frc, _ = data.forecast(model, idx_rows=data.idx_test)
        test_mse = mean_squared_error(test_frc, data.testY)

        latex_str += my_plots.check_text_for_latex(model.name) + "\\\\ \n"
        latex_str += "Train error for estimated parameters: {0}, " \
            "test error with estimated parameters {1} \\\\ \n".format(train_mse, test_mse)

        err_all = forecasting_errors(data, ts.original_index)
        column_names = [("MAE", "train"), ("MAPE", "train"),
                        ("MAE", "test"), ("MAPE", "test")]
        res_all = data_frame_res(err_all, column_names, ts)
        print(model.name)
        print(res_all)

        latex_str += res_all.to_latex()
        latex_str += "\\bigskip \n \\\\"
        data.plot_frc(n_frc=10, n_hist=10, folder=model_save_path)
        latex_str += my_plots.include_figures_from_folder(model_save_path)

    total_time = time.time() - time_at_start
    latex_str += "\n Total time: {0}\n \\".format(total_time)
    my_plots.print_to_latex(latex_str, check=False, file_name="IoT_example",
                            folder=folder)
    return latex_str
def main(frc_model=None, generator=None, selector=None):
    """
    Train a forecasting pipeline on each EnergyWeather dataset and report errors.

    :param frc_model: forecasting model (CustomModel); defaults to Lasso
    :param generator: optional feature-generation step of the pipeline
    :param selector: optional feature-selection step of the pipeline
    :return: list of per-dataset pandas DataFrames with MAE/MAPE errors
    """
    # Experiment settings.
    TRAIN_TEST_RATIO = 0.75
    N_PREDICTIONS = 10  # plotting par

    # Load and prepare dataset.
    load_raw = True  # not os.path.exists(os.path.join("ProcessedData", "EnergyWeather_orig_train.pkl"))
    ts_struct_list = load_time_series.load_all_time_series(
        datasets='EnergyWeather', load_raw=load_raw, name_pattern="")

    if frc_model is None:
        # LSTM.LSTM() #frc_class.IdenitityFrc() #LinearRegression()
        frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.01)

    # Create regression model: generation -> selection -> forecasting.
    model = frc_class.PipelineModel(gen_mdl=generator, sel_mdl=selector,
                                    frc_mdl=frc_model)

    results, res_text = [], []
    for ts in ts_struct_list:
        data = regression_matrix.RegMatrix(ts)  # Create regression matrix
        data.create_matrix(nsteps=1, norm_flag=True)
        # Split data for training and testing
        data.train_test_split(TRAIN_TEST_RATIO)

        # model parameters are changed inside
        model, frc, gen, sel = model.train_model(data.trainX, data.trainY)

        data.forecast(model, data.idx_test, replace=True)
        data.forecast(model, data.idx_train, replace=True)

        row_names = [t.name for t in ts.data]
        error_specs = [
            (data.mae(idx_rows=data.idx_train, idx_original=data.original_index),
             ("MAE", "train")),
            (data.mape(idx_rows=data.idx_train, idx_original=data.original_index),
             ("MAPE", "train")),
            (data.mae(idx_rows=data.idx_test, idx_original=data.original_index),
             ("MAE", "test")),
            (data.mape(idx_rows=data.idx_test, idx_original=data.original_index),
             ("MAPE", "test")),
        ]
        # One column per (metric, split) pair, one row per time series.
        res = pd.concat([pd.DataFrame(values, index=row_names, columns=[col])
                         for values, col in error_specs], axis=1)
        print(res)

        results.append(res)
        res_text.append(ts.name)
        data.plot_frc(n_frc=N_PREDICTIONS)

    my_plots.save_to_latex(results, df_names=res_text)
    return results
def main(file_name, line_indices, header):
    """
    Forecast data simultaneously and separately and compare errors

    :param file_name: file name (.csv) with data in IoT format
    :type file_name: str
    :param line_indices: indices of lines to read from file. Lines are
        enumerated from 1. If "all", read the whole file
    :param header: Specifies if the file contains a header row
    :type header: bool
    :return: forecasting errors
    :rtype: pandas.DataFrame
    """
    TRAIN_TEST_RATIO = 0.75
    N_PREDICTIONS = 10
    VERBOSE = True

    def _error_table(data, names):
        # Collect train/test MAE and MAPE into one table, one row per series.
        # (Was duplicated verbatim for the LSTM and Lasso runs below.)
        columns = [("MAE", "train"), ("MAPE", "train"),
                   ("MAE", "test"), ("MAPE", "test")]
        values = [data.mae(idx_rows=data.idx_train),
                  data.mape(idx_rows=data.idx_train),
                  data.mae(idx_rows=data.idx_test),
                  data.mape(idx_rows=data.idx_test)]
        return pd.concat([pd.DataFrame(v, index=names, columns=[c])
                          for v, c in zip(values, columns)], axis=1)

    ts = utils_.safe_read_iot_data(file_name=file_name,
                                   line_indices=line_indices,
                                   header=header, default="poisson",
                                   verbose=VERBOSE)
    if VERBOSE:
        print(ts.summarize_ts())
        # my_plots.plot_multiple_ts(ts.data, shared_x=True)

    data = regression_matrix.RegMatrix(ts)  # Create regression matrix
    data.create_matrix(nsteps=1, norm_flag=True)
    # Split data for training and testing
    data.train_test_split(TRAIN_TEST_RATIO)
    series_names = [t.name for t in ts.data]

    # --- LSTM, tuned by cross-validation over the hyperparameter grid ---
    hyperpars = {"learning_rate": [2e-6, 2e-5, 2e-4],
                 "n_lstm_units": [20, 30, 40, 50]}
    frc_model = frc_class.CustomModel(LSTM.LSTM, name="LSTM", n_epochs=20,
                                      plot_loss=True)
    model = frc_class.PipelineModel(frc_mdl=frc_model)
    model, frc, _, _ = model.train_model(
        data.trainX, data.trainY, hyperpars=hyperpars,
        n_cvs=5)  # model parameters are changed inside
    if hasattr(frc, "fig"):
        frc.fig.savefig("fitting_learn_rate_{}.png".format(frc.learning_rate))

    # data.forecast returns model obj, forecasted rows of Y matrix and a list
    # [nts] of "flat"/ts indices of forecasted points
    data.forecast(model, replace=True)
    res = _error_table(data, series_names)
    print("LSTM")
    print(res)
    data.plot_frc(n_frc=N_PREDICTIONS)

    # --- Lasso baseline with fixed regularization ---
    frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.001)
    model = frc_class.PipelineModel(frc_mdl=frc_model)
    model, _, _, _ = model.train_model(data.trainX, data.trainY)
    data.forecast(model, replace=True)
    res = _error_table(data, series_names)
    print("Lasso")
    print(res)
    return res
def main():
    """
    Provides an example of usage of the system.

    The model consists of three main components: feature generation, feature
    selection and forecasting model. Feature generation and selection may be
    empty:
        generation = None
        selection = None
    which is the same as
        generator = gnt_class.FeatureGeneration(name="Identity generator")
        selector = sel_class.FeatureSelection(name="Identity selector", on=False)

    Other options for feature generation:
        generator = gnt_class.FeatureGeneration(name="univariate", replace=False, norm=True
                    transformations=["univariate_transformation", "centroids"])
        generator = gnt_class.Nonparametric()
        generator = gnt_class.Monotone()

    Examples of using sklearn solutions:
        frc_class.CustomModel(PCA, name="Randomized PCA", svd_solver="randomized")
        frc_class.CustomModel(PCA, name="PCA")

    Examples of custom models:
    * Mixture of experts:
        frc_model = frc_class.CustomModel(GatingEnsemble, name="Mixture",
                    estimators=[Lasso(alpha=0.01), Lasso(alpha=0.001)])
    * LSTM network:
        frc_model = frc_class.CustomModel(LSTM.LSTM, name="LSTM")
    """
    # Load and prepare dataset.
    ts_list = load_energy_weather_data()

    # Feature generation step; gnt_class.Monotone() is another option.
    generator = gnt_class.FeatureGeneration(transformations='centroids')

    # Feature selection model can be defined in the same way.
    # If you don't use any, just leave as is.
    selector = sel_class.FeatureSelection(on=False)

    # First argument is your model class, then follow optional parameters as
    # keyword arguments; e.g. frc_class.CustomModel(Lasso, name="Lasso", alpha=0.001).
    frc_model = frc_class.CustomModel(RandomForestRegressor, name="RF")

    # Train the pipeline on the training file set.
    model = demo_train(ts_list, frc_model=frc_model, fg_mdl=generator,
                       fs_mdl=selector, verbose=VERBOSE)

    # Evaluate errors on the train and test sets.
    train_error, train_std = competition_errors(model=model,
                                                names=TRAIN_FILE_NAMES,
                                                y_idx=TS_IDX)
    test_error, test_std = competition_errors(model=model,
                                              names=TEST_FILE_NAMES,
                                              y_idx=TS_IDX)

    summary = ("Average MAPE across time series: train = {} with std {}, "
               "test = {} with std {}").format(train_error, train_std,
                                               test_error, test_std)
    print(summary)
    return train_error, test_error
def demo_train(ts_struct_list, frc_model=None, fg_mdl=None, fs_mdl=None,
               verbose=False, return_model=False, rewrite=True):
    """
    Train and save the model.

    :param ts_struct_list: list of namedtuples tsStruct
    :param frc_model: forecasting model, instance of CustomModel
    :param fg_mdl: feature generation model, instance of FeatureGeneration
    :param fs_mdl: feature selection model, instance of FeatureSelection
    :param verbose: controls the output
    :param return_model: if True, return (model, saved file name) instead of
        only the saved file name
    :param rewrite: passed through to my_plots.save_to_latex
    :return: saved model file name, optionally preceded by the trained model
    """
    # Check arguments, falling back to identity/default components:
    if fg_mdl is None:
        fg_mdl = frc_class.IdentityGenerator(name="Identity generator", on=False)
    if fs_mdl is None:
        # NOTE(review): a FeatureGeneration instance is used here as the
        # *selection* step — presumably it behaves as an identity selector
        # ("IdentityModel(name='Identity selector')"); confirm intent.
        fs_mdl = gnt_class.FeatureGeneration()
    if frc_model is None:
        frc_model = frc_class.CustomModel(Lasso, name="Lasso", alpha=0.01)

    model = frc_class.PipelineModel(gen_mdl=fg_mdl, sel_mdl=fs_mdl,
                                    frc_mdl=frc_model)

    results, res_text = [], []
    for ts in ts_struct_list:
        # Create regression matrix; this fills data.Y, data.X and some other fields.
        data = regression_matrix.RegMatrix(ts, x_idx=TS_IDX, y_idx=TS_IDX)
        data.create_matrix(nsteps=N_STEPS, norm_flag=True)
        # Split data for training and testing.
        data.train_test_split(TRAIN_TEST_RATIO)

        # Train the model. This returns the trained pipeline and its steps.
        model, frc, gen, sel = model.train_model(data.trainX, data.trainY)
        selection_res = "\n Feature selection results: problem status {}, selected {} from {} \\\\ \n".\
            format(sel.status, len(sel.selected), sel.n_vars)

        # Returns forecasted matrix of the same shape as data.Y.
        # frcY, idx_frc = data.forecast(model, idx_rows=data.idx_test) would
        # return forecasts only for data.testY.
        frcY, _ = data.forecast(model)
        data.plot_frc(n_frc=5, n_hist=10, folder=SAVE_DIR)  # saves figures into SAVE_DIR

        index = [ts.data[i].name for i in TS_IDX]
        error_specs = [
            (data.mae(idx_rows=data.idx_train, idx_original=data.original_index),
             ("MAE", "train")),
            (data.mape(idx_rows=data.idx_train, idx_original=data.original_index),
             ("MAPE", "train")),
            (data.mae(idx_rows=data.idx_test, idx_original=data.original_index),
             ("MAE", "test")),
            (data.mape(idx_rows=data.idx_test, idx_original=data.original_index),
             ("MAPE", "test")),
        ]
        res = pd.concat([pd.DataFrame(values, index=index, columns=[col])
                         for values, col in error_specs], axis=1)

        configuration_str = "\n Time series {} forecasted with {} + '{}' feature generation model and " \
                            "'{}' feature selection model \\\\ \n".format(ts.name, frc.name, gen.name, sel.name)
        if verbose:
            print(configuration_str)
            print(selection_res)
            print(res)

        results.append(res)
        res_text.append(configuration_str)
        res_text.append(selection_res)

    saved_mdl_fname = model.save_model(
        file_name=FNAME_PREFIX, folder=SAVE_DIR)  # saving is not an option yet
    # model = frc_class.PipelineModel().load_model(file_name=fname)

    # Write results into a latex file.
    my_plots.save_to_latex(results, df_names=res_text, folder=SAVE_DIR,
                           rewrite=rewrite)
    print("Results saved to folder {}".format(SAVE_DIR))

    if return_model:
        return model, saved_mdl_fname
    return saved_mdl_fname