def get_time_series():
    """ Function returns time series for time series forecasting task """
    len_forecast = 100
    synthetic_ts = generate_synthetic_data(length=1000)
    train_data = synthetic_ts[:-len_forecast]
    test_data = synthetic_ts[-len_forecast:]

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data)),
                            features=train_data,
                            target=train_data,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_data)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_data,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, test_data
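# NOTE (added usage sketch, not part of the original sources): this shows how the
# train/predict InputData returned by get_time_series() could be fed through a simple
# lagged -> ridge pipeline, mirroring the fit/predict pattern used in the snippets below.
# The import paths assume the Pipeline-based FEDOT API seen elsewhere in this file;
# older Chain-based versions would use different modules.
import numpy as np
from sklearn.metrics import mean_absolute_error

from fedot.core.pipelines.node import PrimaryNode, SecondaryNode
from fedot.core.pipelines.pipeline import Pipeline


def example_simple_ts_pipeline_forecast():
    train_input, predict_input, test_data = get_time_series()

    # Lagged transformation followed by ridge regression
    node_lagged = PrimaryNode('lagged')
    pipeline = Pipeline(SecondaryNode('ridge', nodes_from=[node_lagged]))

    pipeline.fit(train_input)
    predicted = pipeline.predict(predict_input)
    forecast = np.ravel(np.array(predicted.predict))

    print('MAE:', mean_absolute_error(test_data, forecast))
    return forecast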
def prepare_input_data(forecast_length, horizon):
    ts = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                   17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 101])

    # Forecast for forecast_length elements ahead
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    # To avoid data leak, cut the last horizon elements off the training part
    ts_train = ts[:-horizon]
    train_input = InputData(idx=np.arange(0, len(ts_train)),
                            features=ts_train,
                            target=ts_train,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(ts_train)
    end_forecast = start_forecast + forecast_length
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=ts,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input
def get_synthetic_ts_data_period(n_steps=1000, forecast_length=1, max_window_size=50):
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001

    periodicity = np.sin(x1 / 50)
    simulated_data = simulated_data + periodicity

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length,
                                    max_window_size=max_window_size,
                                    return_all_steps=False))

    data = InputData(idx=np.arange(0, n_steps),
                     features=np.asarray([x1, x2]).T,
                     target=simulated_data,
                     task=task,
                     data_type=DataTypesEnum.ts)

    return train_test_data_setup(data)
def test_lagged_with_invalid_params_fit_correctly():
    """ The function defines a chain with incorrect parameters in the lagged
    transformation. During the training of the chain, the 'window_size'
    parameter is corrected
    """
    window_size = 600
    len_forecast = 50

    # The length of the time series is 500 elements
    project_root_path = str(project_root())
    file_path = os.path.join(project_root_path, 'test/data/short_time_series.csv')
    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(time_series)),
                            features=time_series,
                            target=time_series,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Get chain with lagged transformation in it
    chain = get_ts_chain(window_size)

    # Fit it
    chain.fit(train_input)

    is_chain_was_fitted = True
    assert is_chain_was_fitted
def run_ts_forecasting_example(with_plot=True, with_pipeline_vis=True, timeout=None):
    train_data_path = f'{fedot_project_root()}/examples/data/salaries.csv'

    target = pd.read_csv(train_data_path)['target']

    # Define forecast length and task parameters
    forecast_length = 30
    task_parameters = TsForecastingParams(forecast_length=forecast_length)

    # Init model for the time series forecasting
    model = Fedot(problem='ts_forecasting',
                  task_params=task_parameters,
                  timeout=timeout)

    # Run AutoML model design
    pipeline = model.fit(features=train_data_path, target='target')
    if with_pipeline_vis:
        pipeline.show()

    # Use model to obtain forecast
    forecast = model.predict(features=train_data_path)

    print(model.get_metrics(metric_names=['rmse', 'mae', 'mape'], target=target))

    # Plot forecasting result
    if with_plot:
        model.plot_prediction()

    return forecast
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, is_visualise=False,
                                     timeout=5):
    # Prepare data for train and test
    ssh_history, ws_history, ssh_obs = prepare_input_data(train_file_path, test_file_path)

    historical_data = {
        'ws': ws_history,  # additional variable
        'ssh': ssh_history,  # target variable
    }

    fedot = Fedot(problem='ts_forecasting',
                  task_params=TsForecastingParams(forecast_length=forecast_length),
                  timeout=timeout, verbose_level=4)

    pipeline = fedot.fit(features=historical_data, target=ssh_history)
    fedot.forecast(historical_data, forecast_length=forecast_length)
    metric = fedot.get_metrics(target=ssh_obs)

    if is_visualise:
        pipeline.show()
        fedot.plot_prediction()

    return metric
def __chain_fit_predict(self, timeseries_train: np.array, len_gap: int):
    """
    The method makes a prediction as a sequence of elements based on a
    training sample. There are two main parts: fit model and predict.

    :param timeseries_train: part of the time series for training the model
    :param len_gap: number of elements in the gap
    :return: array without gaps
    """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_gap))

    input_data = InputData(idx=np.arange(0, len(timeseries_train)),
                           features=timeseries_train,
                           target=timeseries_train,
                           task=task,
                           data_type=DataTypesEnum.ts)

    # Making predictions for the missing part in the time series
    self.chain.fit_from_scratch(input_data)

    # "Test data" for making prediction for a specific length
    start_forecast = len(timeseries_train)
    end_forecast = start_forecast + len_gap
    idx_test = np.arange(start_forecast, end_forecast)
    test_data = InputData(idx=idx_test,
                          features=timeseries_train,
                          target=None,
                          task=task,
                          data_type=DataTypesEnum.ts)

    predicted_values = self.chain.predict(test_data)
    predicted_values = np.ravel(np.array(predicted_values.predict))
    return predicted_values
def get_synthetic_ts_data_period(n_steps=6000, forecast_length=1,
                                 max_window_size=50,
                                 with_exog: bool = True) -> InputData:
    x1 = np.arange(0, n_steps) / 10
    x2 = np.arange(0, n_steps) + 1
    x1_exog = np.arange(0, n_steps + forecast_length) / 10
    x2_exog = np.arange(0, n_steps + forecast_length) + 1

    simulated_data = x1 * 0.005 - x2 * 0.001
    periodicity = np.sin(x1 * 0.4)
    random = np.random.normal(0, 0.1, n_steps)
    simulated_data = simulated_data + periodicity + random

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length,
                                    max_window_size=max_window_size,
                                    return_all_steps=False,
                                    make_future_prediction=True))

    exog_features = np.asarray([x1_exog, x2_exog]).T
    if not with_exog:
        # Without exogenous variables the target itself is used as features
        exog_features = None

    input_data = InputData(idx=np.arange(0, n_steps),
                           features=exog_features,
                           target=simulated_data,
                           task=task,
                           data_type=DataTypesEnum.ts)
    return input_data
def prepare_input_data(len_forecast, train_data_features, train_data_target,
                       test_data_features):
    """ Function returns prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_data_features: time series which can be used as predictors for train
    :param train_data_target: time series which can be used as target for train
    :param test_data_features: time series which can be used as predictors for prediction

    :return train_input: InputData for fit
    :return predict_input: InputData for predict
    :return task: time series forecasting task with parameters
    """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data_features)),
                            features=train_data_features,
                            target=train_data_target,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Determine indices for forecast
    start_forecast = len(train_data_features)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=test_data_features,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
def test_multivariate_ts():
    forecast_length = 1

    file_path_train = 'cases/data/metocean/metocean_data_train.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = 'cases/data/metocean/metocean_data_test.csv'
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)

    target_history, add_history, obs = prepare_input_data(full_path_train, full_path_test)

    historical_data = {
        'ws': add_history,  # additional variable
        'ssh': target_history,  # target variable
    }

    fedot = Fedot(problem='ts_forecasting',
                  composer_params=composer_params,
                  task_params=TsForecastingParams(forecast_length=forecast_length))

    fedot.fit(features=historical_data, target=target_history)
    forecast = fedot.forecast(historical_data, forecast_length=forecast_length)

    assert forecast is not None
def prepare_train_test_input(train_part, len_forecast):
    """ Function returns prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_part: time series which can be used as predictors for train

    :return train_input: InputData for fit
    :return predict_input: InputData for predict
    :return task: time series forecasting task with parameters
    """
    # Specify the task to solve
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_part)),
                            features=train_part,
                            target=train_part,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_part)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_part,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
def test_pipeline_with_wrong_data():
    pipeline = Pipeline(PrimaryNode('linear'))
    data_seq = np.arange(0, 10)
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=10))

    data = InputData(idx=data_seq, features=data_seq, target=data_seq,
                     data_type=DataTypesEnum.ts, task=task)

    with pytest.raises(ValueError):
        pipeline.fit(data)
def test_api_forecast_correct(task_type: str = 'ts_forecasting'):
    # The forecast length must be equal to 12
    forecast_length = 12
    train_data, test_data, _ = get_dataset(task_type)

    model = Fedot(problem='ts_forecasting',
                  composer_params=composer_params,
                  task_params=TsForecastingParams(forecast_length=forecast_length))

    model.fit(features=train_data)
    ts_forecast = model.predict(features=train_data)
    metric = model.get_metrics(target=test_data.target, metric_names='rmse')

    assert len(ts_forecast) == forecast_length
    assert metric['rmse'] >= 0
def test_chain_with_wrong_data():
    chain = Chain(PrimaryNode('linear'))
    data_seq = np.arange(0, 10)
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=10,
                                    max_window_size=len(data_seq) + 1,
                                    return_all_steps=False))

    data = InputData(idx=data_seq, features=data_seq, target=data_seq,
                     data_type=DataTypesEnum.ts, task=task)

    with pytest.raises(ValueError):
        chain.fit(data)
def test_api_forecast_numpy_input_with_static_model_correct(task_type: str = 'ts_forecasting'):
    forecast_length = 10
    train_data, test_data, _ = get_dataset(task_type)
    model = Fedot(problem='ts_forecasting',
                  task_params=TsForecastingParams(forecast_length=forecast_length))

    # Define chain for prediction
    node_lagged = PrimaryNode('lagged')
    chain = Chain(SecondaryNode('linear', nodes_from=[node_lagged]))

    model.fit(features=train_data.features,
              target=train_data.target,
              predefined_model=chain)
    ts_forecast = model.predict(features=train_data)
    metric = model.get_metrics(target=test_data.target, metric_names='rmse')

    assert len(ts_forecast) == forecast_length
    assert metric['rmse'] >= 0
def make_forecast(df, len_forecast: int, time_series_label: str):
    """
    Function for making a time series forecast with the FEDOT framework

    :param df: dataframe to process
    :param len_forecast: forecast length
    :param time_series_label: name of the time series to process

    :return predicted_values: forecast
    :return model_name: name of the model (always 'FEDOT')
    """
    # Define parameters
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    # Init model for the time series forecasting
    model = Fedot(problem='ts_forecasting',
                  task_params=task.task_params,
                  composer_params={'timeout': 1,
                                   'preset': 'ultra_light_tun'},
                  preset='ultra_light_tun')

    input_data = InputData(idx=np.arange(0, len(df)),
                           features=np.array(df[time_series_label]),
                           target=np.array(df[time_series_label]),
                           task=task,
                           data_type=DataTypesEnum.ts)

    start_forecast = len(df)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=np.array(df[time_series_label]),
                              target=np.array(df[time_series_label]),
                              task=task,
                              data_type=DataTypesEnum.ts)

    # Run AutoML model design
    pipeline = model.fit(features=input_data)
    predicted_values = model.predict(predict_input)

    model_name = 'FEDOT'
    return predicted_values, model_name
def synthetic_with_exogenous_ts():
    """ Method returns InputData for time series forecasting task with
    exogenous variable """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    # Time series with exogenous variable
    ts_train = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130])
    ts_exog = np.array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])

    ts_test = np.array([140, 150, 160, 170])
    ts_test_exog = np.array([24, 25, 26, 27])

    # Indices for forecast
    start_forecast = len(ts_train)
    end_forecast = start_forecast + forecast_length

    # Input for source time series
    train_source_ts = InputData(idx=np.arange(0, len(ts_train)),
                                features=ts_train,
                                target=ts_train,
                                task=task,
                                data_type=DataTypesEnum.ts)
    predict_source_ts = InputData(idx=np.arange(start_forecast, end_forecast),
                                  features=ts_train,
                                  target=None,
                                  task=task,
                                  data_type=DataTypesEnum.ts)

    # Input for exogenous variable
    train_exog_ts = InputData(idx=np.arange(0, len(ts_train)),
                              features=ts_exog,
                              target=ts_train,
                              task=task,
                              data_type=DataTypesEnum.ts)
    predict_exog_ts = InputData(idx=np.arange(start_forecast, end_forecast),
                                features=ts_test_exog,
                                target=None,
                                task=task,
                                data_type=DataTypesEnum.ts)

    return train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test
def prepare_input_data(train_file_path, test_file_path, forecast_length):
    """ Function for preparing InputData for train and test algorithm

    :param train_file_path: path to the csv file for training
    :param test_file_path: path to the csv file for validation
    :param forecast_length: forecast length for prediction

    :return dataset_to_train: InputData for train
    :return dataset_to_validate: InputData for validation
    """
    # Specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length))

    # Load train and test dataframes
    full_path_train = os.path.join(str(project_root()), train_file_path)
    full_path_test = os.path.join(str(project_root()), test_file_path)
    df_train = pd.read_csv(full_path_train)
    df_test = pd.read_csv(full_path_test)

    # Get idx and series for train
    train_feature_ts = np.ravel(np.array(df_train['wind_speed']))
    train_target_ts = np.ravel(np.array(df_train['sea_height']))
    idx_train = np.arange(0, len(train_feature_ts))
    dataset_to_train = InputData(idx=idx_train,
                                 features=train_feature_ts,
                                 target=train_target_ts,
                                 task=task_to_solve,
                                 data_type=DataTypesEnum.ts)

    start_forecast = len(idx_train)
    end_forecast = start_forecast + forecast_length
    idx_test = np.arange(start_forecast, end_forecast)

    test_target_ts = np.ravel(np.array(df_test['sea_height']))
    test_target_ts = test_target_ts[:forecast_length]
    dataset_to_validate = InputData(idx=idx_test,
                                    features=train_feature_ts,
                                    target=test_target_ts,
                                    task=task_to_solve,
                                    data_type=DataTypesEnum.ts)

    return dataset_to_train, dataset_to_validate
def get_synthetic_ts_data_linear(n_steps=1000, forecast_length=1, max_window_size=50):
    simulated_data = np.asarray([float(_) for _ in np.arange(0, n_steps)])

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length,
                                    max_window_size=max_window_size,
                                    return_all_steps=False,
                                    make_future_prediction=False))

    data = InputData(idx=np.arange(0, n_steps),
                     features=simulated_data,
                     target=simulated_data,
                     task=task,
                     data_type=DataTypesEnum.ts)

    return train_test_data_setup(data, shuffle_flag=False)
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=32,
                                     is_visualise=False):
    # Specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain_simple = TsForecastingChain(PrimaryNode('linear'))
    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        chain_simple.predict(dataset_to_validate), dataset_to_validate,
        f'full-simple_{forecast_length}',
        is_visualise=is_visualise)
    print(f'RMSE simple: {rmse_on_valid_simple}')

    chain_composite_lstm = get_composite_chain()
    chain_composite_lstm.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        chain_composite_lstm.predict(dataset_to_validate), dataset_to_validate,
        f'full-lstm-only_{forecast_length}',
        is_visualise=is_visualise)
    print(f'RMSE LSTM composite: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
def get_synthetic_ts_data_period(n_steps=1000, forecast_length=5):
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001

    periodicity = np.sin(x1 / 50)
    simulated_data = simulated_data + periodicity

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    data = InputData(idx=np.arange(0, n_steps),
                     features=simulated_data,
                     target=simulated_data,
                     task=task,
                     data_type=DataTypesEnum.ts)

    train_data, test_data = train_test_data_setup(data)
    return train_data, test_data
def get_ts_data(n_steps=80, forecast_length=5):
    """ Prepare data from csv file with time series and take needed number of elements

    :param n_steps: number of elements in time series to take
    :param forecast_length: the length of forecast
    """
    project_root_path = str(fedot_project_root())
    file_path = os.path.join(project_root_path, 'test/data/simple_time_series.csv')
    df = pd.read_csv(file_path)

    time_series = np.array(df['sea_height'])[:n_steps]
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    data = InputData(idx=np.arange(0, len(time_series)),
                     features=time_series,
                     target=time_series,
                     task=task,
                     data_type=DataTypesEnum.ts)
    return train_test_data_setup(data)
def test_api_cv_correct():
    """ Checks if the composer works correctly when using cross validation for
    time series through api """
    folds = 2
    _, forecast_len, validation_blocks, time_series = configure_experiment()
    composer_params = {'max_depth': 1,
                       'max_arity': 2,
                       'timeout': 0.05,
                       'preset': 'ultra_light',
                       'cv_folds': folds,
                       'validation_blocks': validation_blocks}
    task_parameters = TsForecastingParams(forecast_length=forecast_len)

    model = Fedot(problem='ts_forecasting',
                  composer_params=composer_params,
                  task_params=task_parameters,
                  verbose_level=2)
    fedot_model = model.fit(features=time_series)
    is_succeeded = True
    assert is_succeeded
def _chain_fit_predict(self, timeseries_train: np.array, len_gap: int, max_window_size: int):
    """
    The method makes a prediction as a sequence of elements based on a
    training sample. There are two main parts: fit model and predict.

    :param timeseries_train: part of the time series for training the model
    :param len_gap: number of elements in the gap
    :param max_window_size: window length
    :return: array without gaps
    """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_gap,
                                    max_window_size=max_window_size,
                                    return_all_steps=False,
                                    make_future_prediction=True))

    input_data = InputData(idx=np.arange(0, len(timeseries_train)),
                           features=None,
                           target=timeseries_train,
                           task=task,
                           data_type=DataTypesEnum.ts)

    # Making predictions for the missing part in the time series
    self.chain.fit_from_scratch(input_data)

    # "Test data" for making prediction for a specific length
    test_data = InputData(idx=np.arange(0, len_gap),
                          features=None,
                          target=None,
                          task=task,
                          data_type=DataTypesEnum.ts)

    predicted_values = self.chain.forecast(initial_data=input_data,
                                           supplementary_data=test_data).predict
    return predicted_values
def get_synthetic_ts_data_custom(n_steps=6000, forecast_length=2,
                                 max_window_size=2,
                                 with_exog: bool = True) -> InputData:
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length,
                                    max_window_size=max_window_size,
                                    return_all_steps=False,
                                    make_future_prediction=True))

    exog_features = np.asarray([10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0])
    if not with_exog:
        # Without exogenous variables the target itself is used as features
        exog_features = None

    input_data = InputData(idx=np.arange(0, n_steps),
                           features=exog_features,
                           target=np.asarray([0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
                           task=task,
                           data_type=DataTypesEnum.ts)
    return input_data
def test_ts_single_chain_model_without_multiotput_support():
    time_series = generate_synthetic_data(10)
    len_forecast = 2
    train_part = time_series[:-len_forecast]
    test_part = time_series[-len_forecast:]

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast,
                                    max_window_size=2,
                                    return_all_steps=False,
                                    make_future_prediction=True))

    train_data = InputData(idx=np.arange(0, len(train_part)),
                           features=None,
                           target=train_part,
                           task=task,
                           data_type=DataTypesEnum.ts)

    for model_id in ['xgbreg', 'gbr', 'adareg', 'svr', 'sgdr']:
        chain = TsForecastingChain(PrimaryNode(model_id))

        # making predictions for the missing part in the time series
        chain.fit_from_scratch(train_data)

        # data for making prediction for a specific length
        test_data = InputData(idx=np.arange(0, len_forecast),
                              features=None,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

        predicted_values = chain.forecast(initial_data=train_data,
                                          supplementary_data=test_data).predict

        mae = mean_absolute_error(test_part, predicted_values)
        assert mae < 50
def test_ts_single_pipeline_model_without_multiotput_support():
    time_series = generate_synthetic_data(20)
    len_forecast = 2
    train_part = time_series[:-len_forecast]
    test_part = time_series[-len_forecast:]

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_data = InputData(idx=np.arange(0, len(train_part)),
                           features=train_part,
                           target=train_part,
                           task=task,
                           data_type=DataTypesEnum.ts)

    start_forecast = len(train_part)
    end_forecast = start_forecast + len_forecast
    idx_for_predict = np.arange(start_forecast, end_forecast)

    # Data for making prediction for a specific length
    test_data = InputData(idx=idx_for_predict,
                          features=train_part,
                          target=test_part,
                          task=task,
                          data_type=DataTypesEnum.ts)

    for model_id in ['xgbreg', 'gbr', 'adareg', 'svr', 'sgdr']:
        pipeline = get_simple_ts_pipeline(model_root=model_id, window_size=2)

        # making predictions for the missing part in the time series
        pipeline.fit_from_scratch(train_data)
        predicted_values = pipeline.predict(test_data)
        pipeline_forecast = np.ravel(np.array(predicted_values.predict))

        test_part = np.ravel(np.array(test_part))
        mae = mean_absolute_error(test_part, pipeline_forecast)
        assert mae < 50
def synthetic_univariate_ts():
    """ Method returns InputData for classical time series forecasting task """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    # Simple time series to process
    ts_train = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130])
    ts_test = np.array([140, 150, 160, 170])

    # Prepare train data
    train_input = InputData(idx=np.arange(0, len(ts_train)),
                            features=ts_train,
                            target=ts_train,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(ts_train)
    end_forecast = start_forecast + forecast_length
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=ts_train,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, ts_test
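# NOTE (added usage sketch, not part of the original sources): a hedged example of how
# the synthetic_univariate_ts fixture might be consumed in a test. It reuses the
# get_simple_ts_pipeline helper and the module-level forecast_length that the surrounding
# snippets already rely on; no new API is assumed beyond what appears in this file.
def example_test_forecast_on_synthetic_univariate_ts():
    train_input, predict_input, ts_test = synthetic_univariate_ts()

    pipeline = get_simple_ts_pipeline(model_root='ridge', window_size=2)
    pipeline.fit(train_input)

    predicted = pipeline.predict(predict_input)
    forecast = np.ravel(np.array(predicted.predict))

    # The forecast must cover exactly the held-out tail of the series
    assert len(forecast) == len(ts_test)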
def _chain_fit_predict(self, timeseries_train: np.array, len_gap: int, max_window_size: int):
    """
    The method makes a prediction as a sequence of elements based on a
    training sample. There are two main parts: fit model and predict.

    :param timeseries_train: part of the time series for training the model
    :param len_gap: number of elements in the gap
    :param max_window_size: window length
    :return: array without gaps
    """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_gap,
                                    max_window_size=max_window_size,
                                    return_all_steps=False,
                                    make_future_prediction=True))

    input_data = InputData(idx=np.arange(0, len(timeseries_train)),
                           features=None,
                           target=timeseries_train,
                           task=task,
                           data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)

    available_model_types_primary = ['linear', 'lasso', 'ridge',
                                     'trend_data_model', 'residual_data_model']
    available_model_types_secondary = ['rfr', 'linear', 'knnreg',
                                       'gbr', 'ridge', 'lasso', 'svr']

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_primary,
        secondary=available_model_types_secondary,
        max_arity=3, max_depth=4,
        pop_size=5, num_of_generations=5,
        crossover_prob=0.1, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=20))

    builder = FixedStructureComposerBuilder(task=task).with_requirements(composer_requirements) \
        .with_metrics(metric_function).with_initial_chain(self.chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=input_data, is_visualise=False)

    # Making predictions for the missing part in the time series
    obtained_chain.__class__ = TsForecastingChain
    obtained_chain.fit_from_scratch(input_data)
    print(f'\n Size of the obtained chain: {len(obtained_chain.nodes)} \n')

    # "Test data" for making prediction for a specific length
    test_data = InputData(idx=np.arange(0, len_gap),
                          features=None,
                          target=None,
                          task=task,
                          data_type=DataTypesEnum.ts)

    predicted_values = obtained_chain.forecast(initial_data=input_data,
                                               supplementary_data=test_data).predict
    return predicted_values
def forecasting_accuracy(path, prediction_len, vis=True):
    mapes_per_model = []
    models = []
    files = []
    for file_name in ['Synthetic.csv', 'Sea_hour.csv', 'Sea_10_240.csv']:
        # Source file with gaps
        gap_path = os.path.join(path, file_name)
        gap_df = pd.read_csv(gap_path)
        gap_df['Date'] = pd.to_datetime(gap_df['Date'])

        # Simple gap-filling methods
        linear_path = os.path.join(os.path.join(path, 'linear'), file_name)
        linear_df = pd.read_csv(linear_path)

        local_poly_path = os.path.join(os.path.join(path, 'poly'), file_name)
        local_poly_df = pd.read_csv(local_poly_path)

        batch_poly_path = os.path.join(os.path.join(path, 'batch_poly'), file_name)
        batch_poly_df = pd.read_csv(batch_poly_path)

        # Gap-filling methods implemented in R
        kalman_path = os.path.join(os.path.join(path, 'kalman'), file_name)
        kalman_df = pd.read_csv(kalman_path)

        ma_path = os.path.join(os.path.join(path, 'ma'), file_name)
        ma_df = pd.read_csv(ma_path)

        spline_path = os.path.join(os.path.join(path, 'spline'), file_name)
        spline_df = pd.read_csv(spline_path)

        # FEDOT gap-filling methods
        fedot_ridge_30_path = os.path.join(os.path.join(path, 'fedot_ridge_30'), file_name)
        fedot_ridge_30_df = pd.read_csv(fedot_ridge_30_path)

        fedot_ridge_100_path = os.path.join(os.path.join(path, 'fedot_ridge_100'), file_name)
        fedot_ridge_100_df = pd.read_csv(fedot_ridge_100_path)

        fedot_compose = os.path.join(os.path.join(path, 'fedot_composing'), file_name)
        fedot_compose_df = pd.read_csv(fedot_compose)

        # Source time series without gaps
        arr_parameter = np.array(gap_df['Height'])
        # Time series with gaps
        arr_mask = np.array(gap_df['gap'])
        ids_gaps = np.ravel(np.argwhere(arr_mask == -100.0))
        array_gaps = np.ma.masked_where(arr_mask == -100.0, arr_mask)

        if vis:
            plt.plot(gap_df['Date'], arr_parameter, c='red', alpha=0.2)
            for index in ids_gaps:
                plt.plot([gap_df['Date'][index], gap_df['Date'][index]],
                         [min(arr_parameter), arr_parameter[index]],
                         c='red', alpha=0.05)
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.show()

        withoutgap_arr_linear = np.array(linear_df['gap'])
        withoutgap_arr_local = np.array(local_poly_df['gap'])
        withoutgap_arr_batch = np.array(batch_poly_df['gap'])
        withoutgap_arr_kalman = np.array(kalman_df['gap'])
        withoutgap_arr_ma = np.array(ma_df['gap'])
        withoutgap_arr_spline = np.array(spline_df['gap'])
        withoutgap_arr_ridge_30 = np.array(fedot_ridge_30_df['gap'])
        withoutgap_arr_ridge_100 = np.array(fedot_ridge_100_df['gap'])
        withoutgap_arr_compose = np.array(fedot_compose_df['gap'])

        if vis:
            plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                     label='Actual values')
            plt.plot(gap_df['Date'], withoutgap_arr_linear, c='red', alpha=0.5,
                     label='Linear interpolation')
            plt.plot(gap_df['Date'], withoutgap_arr_local, c='orange', alpha=0.5,
                     label='Local polynomial approximation')
            plt.plot(gap_df['Date'], withoutgap_arr_batch, c='purple', alpha=0.5,
                     label='Batch polynomial approximation')
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.legend(fontsize=15)
            plt.show()

            plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                     label='Actual values')
            plt.plot(gap_df['Date'], withoutgap_arr_kalman, c='red', alpha=0.5,
                     label='Kalman filtering')
            plt.plot(gap_df['Date'], withoutgap_arr_ma, c='orange', alpha=0.5,
                     label='Moving average')
            plt.plot(gap_df['Date'], withoutgap_arr_spline, c='purple', alpha=0.5,
                     label='Spline interpolation')
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.legend(fontsize=15)
            plt.show()

            plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                     label='Actual values')
            plt.plot(gap_df['Date'], withoutgap_arr_batch, c='red', alpha=0.5,
                     label='Batch polynomial approximation')
            plt.plot(gap_df['Date'], withoutgap_arr_kalman, c='orange', alpha=0.5,
                     label='Kalman filtering')
            plt.plot(gap_df['Date'], withoutgap_arr_ridge_30, c='purple', alpha=0.5,
                     label='Ridge 30 ws')
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.legend(fontsize=15)
            plt.show()

        train_part = arr_parameter[:-prediction_len]
        test_part = arr_parameter[-prediction_len:]

        # Prepare the parts of the time series with the restored values
        train_part_linear = withoutgap_arr_linear[:-prediction_len]
        train_part_local = withoutgap_arr_local[:-prediction_len]
        train_part_batch = withoutgap_arr_batch[:-prediction_len]
        train_part_kalman = withoutgap_arr_kalman[:-prediction_len]
        train_part_ma = withoutgap_arr_ma[:-prediction_len]
        train_part_stine = withoutgap_arr_spline[:-prediction_len]
        train_part_ridge_30 = withoutgap_arr_ridge_30[:-prediction_len]
        train_part_ridge_100 = withoutgap_arr_ridge_100[:-prediction_len]
        train_part_compose = withoutgap_arr_compose[:-prediction_len]

        if file_name == 'Hour_data_m.csv':
            max_window_size = 50
        else:
            max_window_size = 500

        for sample, model in zip([train_part, train_part_linear, train_part_local,
                                  train_part_batch, train_part_kalman, train_part_ma,
                                  train_part_stine, train_part_ridge_30,
                                  train_part_ridge_100, train_part_compose],
                                 ['Original', 'Linear interpolation',
                                  'Local polynomial approximation',
                                  'Batch polynomial approximation',
                                  'Kalman filtering', 'Moving average',
                                  'Spline interpolation', 'Ridge forward 30 ws',
                                  'Ridge forward 100 ws', 'Chain compose']):
            node_first = PrimaryNode('ridge')
            node_second = PrimaryNode('ridge')
            node_trend_model = SecondaryNode('linear', nodes_from=[node_first])
            node_residual_model = SecondaryNode('linear', nodes_from=[node_second])
            node_final = SecondaryNode('svr', nodes_from=[node_trend_model,
                                                          node_residual_model])
            chain = TsForecastingChain(node_final)

            task = Task(TaskTypesEnum.ts_forecasting,
                        TsForecastingParams(forecast_length=prediction_len,
                                            max_window_size=max_window_size,
                                            return_all_steps=False,
                                            make_future_prediction=True))

            input_data = InputData(idx=np.arange(0, len(sample)),
                                   features=None,
                                   target=sample,
                                   task=task,
                                   data_type=DataTypesEnum.ts)

            chain.fit_from_scratch(input_data)

            # "Test data" for making prediction for a specific length
            test_data = InputData(idx=np.arange(0, prediction_len),
                                  features=None,
                                  target=None,
                                  task=task,
                                  data_type=DataTypesEnum.ts)

            predicted_values = chain.forecast(initial_data=input_data,
                                              supplementary_data=test_data).predict

            print(model)
            MAE = mean_absolute_error(test_part, predicted_values)
            print('Mean absolute error -', round(MAE, 4))

            RMSE = (mean_squared_error(test_part, predicted_values)) ** 0.5
            print('RMSE -', round(RMSE, 4))

            MedianAE = median_absolute_error(test_part, predicted_values)
            print('Median absolute error -', round(MedianAE, 4))

            mape = mean_absolute_percentage_error(test_part, predicted_values)
            print('MAPE -', round(mape, 4), '\n')

            if file_name == 'Sea_10_240.csv':
                plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                         label='Actual values')
                plt.plot(gap_df['Date'][:-prediction_len], sample, c='blue',
                         label='Restored series')
                plt.plot(gap_df['Date'][-prediction_len:], predicted_values,
                         c='red', alpha=0.5, label='Model forecast')
                plt.ylabel('Sea level, m', fontsize=15)
                plt.xlabel('Date', fontsize=15)
                plt.grid()
                plt.title(model, fontsize=15)
                plt.legend(fontsize=15)
                plt.show()

            models.append(model)
            mapes_per_model.append(mape)
            files.append(file_name)

    local_df = pd.DataFrame({'MAPE': mapes_per_model,
                             'Model': models,
                             'File': files})
    for model in local_df['Model'].unique():
        local_local_df = local_df[local_df['Model'] == model]
        mape_arr = np.array(local_local_df['MAPE'])
        print(f'Mean error value for the {model} model - {np.mean(mape_arr)}')

        for file in local_local_df['File'].unique():
            l_local_local_df = local_local_df[local_local_df['File'] == file]
            print(f'{model}, {file}, MAPE - {float(l_local_local_df["MAPE"])}')