def get_composite_chain():
    chain = TsForecastingChain()
    node_trend = PrimaryNode('trend_data_model')
    node_model_trend = SecondaryNode('linear', nodes_from=[node_trend])
    node_residual = PrimaryNode('residual_data_model')
    node_model_residual = SecondaryNode('linear', nodes_from=[node_residual])
    node_final = SecondaryNode('linear',
                               nodes_from=[node_model_residual, node_model_trend])
    chain.add_node(node_final)
    return chain

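# Topology of the chain built by get_composite_chain() (a descriptive sketch,
# not part of the original source): two decomposition branches, each passed
# through its own 'linear' model, merged by a final 'linear' ensemble node.
#
#   trend_data_model    -> linear --\
#                                    --> linear (final)
#   residual_data_model -> linear --/
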
def run_onestep_linear_example(n_steps=1000, is_visualise: bool = True):
    window_size = 16

    dataset = get_synthetic_ts_data_period(n_steps=n_steps,
                                           forecast_length=1,
                                           max_window_size=window_size,
                                           with_exog=True)
    # regression forecasting
    chain = TsForecastingChain(PrimaryNode('linear'))

    # one-step regression
    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'Linear model, {dataset.task.task_params.forecast_length} step prediction with exog')

    dataset = get_synthetic_ts_data_period(n_steps=n_steps,
                                           forecast_length=1,
                                           max_window_size=window_size,
                                           with_exog=False)

    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'Linear model, {dataset.task.task_params.forecast_length} step prediction without exog')

def run_multistep_composite_example(n_steps=20000, is_visualise: bool = True):
    # composite forecasting with ensemble
    node_first = PrimaryNode('linear')
    node_second = PrimaryNode('ridge')
    node_final = SecondaryNode('linear', nodes_from=[node_first, node_second])
    chain = TsForecastingChain(node_final)

    dataset = get_synthetic_ts_data_period(n_steps=n_steps,
                                           forecast_length=64,
                                           max_window_size=512,
                                           with_exog=False)

    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'Composite model, {dataset.task.task_params.forecast_length} step prediction without exog')

    dataset = get_synthetic_ts_data_period(n_steps=n_steps,
                                           forecast_length=64,
                                           max_window_size=64,
                                           with_exog=True)

    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'Composite model, {dataset.task.task_params.forecast_length} step prediction with exog')

def run_multistep_custom_example(n_steps=6, is_visualise: bool = True):
    chain = TsForecastingChain(PrimaryNode('ridge'))

    dataset = get_synthetic_ts_data_custom(n_steps=n_steps,
                                           forecast_length=2,
                                           max_window_size=2,
                                           with_exog=False)
    # multi-step regression
    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'Ridge model, {dataset.task.task_params.forecast_length} step prediction without exog')

    dataset = get_synthetic_ts_data_custom(n_steps=n_steps,
                                           forecast_length=2,
                                           max_window_size=2,
                                           with_exog=True)

    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'Ridge model, {dataset.task.task_params.forecast_length} step prediction with exog')

def run_forecasting(chain: TsForecastingChain, data: InputData,
                    is_visualise: bool, desc: str):
    train_data, test_data = train_test_data_setup(data,
                                                  shuffle_flag=False,
                                                  split_ratio=0.9)
    data.task.task_params.make_future_prediction = True
    chain.fit_from_scratch(train_data)

    # drop the target from the test data to avoid a data leak
    test_data_for_pred = copy(test_data)
    test_data_for_pred.target = None

    full_prediction = chain.forecast(initial_data=train_data,
                                     supplementary_data=test_data_for_pred).predict

    if is_visualise:
        plot_prediction(full_prediction, test_data, desc)

def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=32,
                                     is_visualise=False):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for the final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain_simple = TsForecastingChain(PrimaryNode('linear'))
    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        chain_simple.predict(dataset_to_validate), dataset_to_validate,
        f'full-simple_{forecast_length}', is_visualise=is_visualise)
    print(f'RMSE simple: {rmse_on_valid_simple}')

    chain_composite = get_composite_chain()
    chain_composite.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_composite = calculate_validation_metric(
        chain_composite.predict(dataset_to_validate), dataset_to_validate,
        f'full-composite_{forecast_length}', is_visualise=is_visualise)
    print(f'RMSE composite: {rmse_on_valid_composite}')

    return rmse_on_valid_simple

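# A hypothetical entry point for the example above; the dataset paths are an
# assumption for illustration and may differ in the actual repository.
if __name__ == '__main__':
    run_metocean_forecasting_problem(
        train_file_path='cases/data/metocean/metocean_data_train.csv',
        test_file_path='cases/data/metocean/metocean_data_test.csv',
        forecast_length=1,
        is_visualise=True)
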
def test_ts_single_chain_model_without_multioutput_support():
    time_series = generate_synthetic_data(10)
    len_forecast = 2
    train_part = time_series[:-len_forecast]
    test_part = time_series[-len_forecast:]

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast,
                                    max_window_size=2,
                                    return_all_steps=False,
                                    make_future_prediction=True))

    train_data = InputData(idx=np.arange(0, len(train_part)),
                           features=None,
                           target=train_part,
                           task=task,
                           data_type=DataTypesEnum.ts)

    for model_id in ['xgbreg', 'gbr', 'adareg', 'svr', 'sgdr']:
        chain = TsForecastingChain(PrimaryNode(model_id))

        # making predictions for the missing part of the time series
        chain.fit_from_scratch(train_data)

        # data for making a prediction of a specific length
        test_data = InputData(idx=np.arange(0, len_forecast),
                              features=None,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

        predicted_values = chain.forecast(initial_data=train_data,
                                          supplementary_data=test_data).predict

        mae = mean_absolute_error(test_part, predicted_values)
        assert mae < 50

def run_multistep_lstm_example(n_steps=6000, is_visualise: bool = True):
    # LSTM forecasting
    dataset = get_synthetic_ts_data_period(n_steps=n_steps,
                                           forecast_length=64,
                                           max_window_size=64,
                                           with_exog=False)

    chain = TsForecastingChain(PrimaryNode('lstm'))

    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'LSTM model, {dataset.task.task_params.forecast_length} step prediction without exog')

    return True

def test_gapfilling_forward_ridge_correct():
    arr_with_gaps, real_values = get_array_with_gaps()

    # find the indices of all gap elements in the array
    id_gaps = np.ravel(np.argwhere(arr_with_gaps == -100.0))

    ridge_chain = TsForecastingChain(PrimaryNode('ridge'))
    gapfiller = ModelGapFiller(gap_value=-100.0,
                               chain=ridge_chain,
                               max_window_size=150)
    without_gap = gapfiller.forward_filling(arr_with_gaps)

    # compare only the values in the gaps
    predicted_values = without_gap[id_gaps]
    true_values = real_values[id_gaps]

    rmse_test = mean_squared_error(true_values, predicted_values, squared=False)

    # the RMSE must be below twice the standard deviation of the random noise
    assert rmse_test < 0.2

def run_gapfilling_example():
    """
    This function runs an example of filling in gaps in synthetic data.

    :return arrays_dict: dictionary with keys 'ridge', 'local_poly',
    'batch_poly' and 'linear' mapping to the arrays with filled gaps
    :return gap_data: an array with gaps
    :return real_data: an array with the actual values in the gaps
    """

    # get a synthetic time series
    gap_data, real_data = get_array_with_gaps()

    # filling in gaps using a chain from FEDOT
    ridge_chain = TsForecastingChain(PrimaryNode('ridge'))
    ridge_gapfiller = ModelGapFiller(gap_value=-100.0,
                                     chain=ridge_chain,
                                     max_window_size=150)
    without_gap_arr_ridge = ridge_gapfiller.forward_inverse_filling(gap_data)

    # filling in gaps using simple methods such as polynomial approximation
    simple_gapfill = SimpleGapFiller(gap_value=-100.0)
    without_gap_local_poly = simple_gapfill.local_poly_approximation(gap_data, 4, 150)
    without_gap_batch_poly = simple_gapfill.batch_poly_approximation(gap_data, 4, 150)
    without_gap_linear = simple_gapfill.linear_interpolation(gap_data)

    arrays_dict = {'ridge': without_gap_arr_ridge,
                   'local_poly': without_gap_local_poly,
                   'batch_poly': without_gap_batch_poly,
                   'linear': without_gap_linear}
    return arrays_dict, gap_data, real_data

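# A minimal usage sketch (an assumption, not part of the original example):
# compare each gap-filling method against the known values inside the gaps,
# reusing the -100.0 gap marker and the RMSE metric from the tests above.
if __name__ == '__main__':
    arrays_dict, gap_data, real_data = run_gapfilling_example()

    id_gaps = np.ravel(np.argwhere(gap_data == -100.0))
    for method, restored in arrays_dict.items():
        rmse = mean_squared_error(real_data[id_gaps],
                                  restored[id_gaps],
                                  squared=False)
        print(f'{method}: RMSE in gaps = {rmse:.4f}')
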
def run_multistep_multiscale_example(n_steps=10000, is_visualise: bool = True):
    dataset = get_synthetic_ts_data_period(n_steps=n_steps,
                                           forecast_length=64,
                                           max_window_size=512,
                                           with_exog=False)

    # composite forecasting with decomposition
    node_first = PrimaryNode('trend_data_model')
    node_second = PrimaryNode('residual_data_model')
    node_trend_model = SecondaryNode('ridge', nodes_from=[node_first])
    node_residual_model = SecondaryNode('linear', nodes_from=[node_second])

    node_final = SecondaryNode('linear',
                               nodes_from=[node_trend_model, node_residual_model])
    chain = TsForecastingChain(node_final)

    run_forecasting(chain=chain, data=dataset, is_visualise=is_visualise,
                    desc=f'Multiscale model, {dataset.task.task_params.forecast_length} step prediction without exog')

plt.legend(fontsize=15)
plt.show()


folder_to_save = './iccs_article/fedot_composing'
if __name__ == '__main__':
    # fill in the gaps and check the results
    for file in ['Synthetic.csv', 'Sea_hour.csv', 'Sea_10_240.csv']:
        print(file)
        data = pd.read_csv(f'./data/{file}')
        data['Date'] = pd.to_datetime(data['Date'])
        dataframe = data.copy()

        chain = TsForecastingChain()
        node_trend = PrimaryNode('trend_data_model')
        node_trend.labels = ["fixed"]
        node_lstm_trend = SecondaryNode('linear', nodes_from=[node_trend])
        node_residual = PrimaryNode('residual_data_model')
        node_ridge_residual = SecondaryNode('linear', nodes_from=[node_residual])
        node_final = SecondaryNode('linear',
                                   nodes_from=[node_ridge_residual, node_lstm_trend])
        node_final.labels = ["fixed"]
        chain.add_node(node_final)
        print(f'Size of the initial chain: {len(chain.nodes)}')

        # filling in the gaps
def forecasting_accuracy(path, prediction_len, vis=True):
    mapes_per_model = []
    models = []
    files = []
    for file_name in ['Synthetic.csv', 'Sea_hour.csv', 'Sea_10_240.csv']:
        # the source file with gaps
        gap_path = os.path.join(path, file_name)
        gap_df = pd.read_csv(gap_path)
        gap_df['Date'] = pd.to_datetime(gap_df['Date'])

        # simple methods
        linear_path = os.path.join(path, 'linear', file_name)
        linear_df = pd.read_csv(linear_path)
        local_poly_path = os.path.join(path, 'poly', file_name)
        local_poly_df = pd.read_csv(local_poly_path)
        batch_poly_path = os.path.join(path, 'batch_poly', file_name)
        batch_poly_df = pd.read_csv(batch_poly_path)

        # gap-filling methods implemented in R
        kalman_path = os.path.join(path, 'kalman', file_name)
        kalman_df = pd.read_csv(kalman_path)
        ma_path = os.path.join(path, 'ma', file_name)
        ma_df = pd.read_csv(ma_path)
        spline_path = os.path.join(path, 'spline', file_name)
        spline_df = pd.read_csv(spline_path)

        # FEDOT gap-filling methods
        fedot_ridge_30_path = os.path.join(path, 'fedot_ridge_30', file_name)
        fedot_ridge_30_df = pd.read_csv(fedot_ridge_30_path)
        fedot_ridge_100_path = os.path.join(path, 'fedot_ridge_100', file_name)
        fedot_ridge_100_df = pd.read_csv(fedot_ridge_100_path)
        fedot_compose_path = os.path.join(path, 'fedot_composing', file_name)
        fedot_compose_df = pd.read_csv(fedot_compose_path)

        # the source time series without gaps
        arr_parameter = np.array(gap_df['Height'])
        # the time series with gaps
        arr_mask = np.array(gap_df['gap'])
        ids_gaps = np.ravel(np.argwhere(arr_mask == -100.0))
        array_gaps = np.ma.masked_where(arr_mask == -100.0, arr_mask)

        if vis:
            plt.plot(gap_df['Date'], arr_parameter, c='red', alpha=0.2)
            for index in ids_gaps:
                plt.plot([gap_df['Date'][index], gap_df['Date'][index]],
                         [min(arr_parameter), arr_parameter[index]],
                         c='red', alpha=0.05)
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.show()

        withoutgap_arr_linear = np.array(linear_df['gap'])
        withoutgap_arr_local = np.array(local_poly_df['gap'])
        withoutgap_arr_batch = np.array(batch_poly_df['gap'])
        withoutgap_arr_kalman = np.array(kalman_df['gap'])
        withoutgap_arr_ma = np.array(ma_df['gap'])
        withoutgap_arr_spline = np.array(spline_df['gap'])
        withoutgap_arr_ridge_30 = np.array(fedot_ridge_30_df['gap'])
        withoutgap_arr_ridge_100 = np.array(fedot_ridge_100_df['gap'])
        withoutgap_arr_compose = np.array(fedot_compose_df['gap'])

        if vis:
            plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                     label='Actual values')
            plt.plot(gap_df['Date'], withoutgap_arr_linear, c='red', alpha=0.5,
                     label='Linear interpolation')
            plt.plot(gap_df['Date'], withoutgap_arr_local, c='orange', alpha=0.5,
                     label='Local polynomial approximation')
            plt.plot(gap_df['Date'], withoutgap_arr_batch, c='purple', alpha=0.5,
                     label='Batch polynomial approximation')
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.legend(fontsize=15)
            plt.show()

            plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                     label='Actual values')
            plt.plot(gap_df['Date'], withoutgap_arr_kalman, c='red', alpha=0.5,
                     label='Kalman filtering')
            plt.plot(gap_df['Date'], withoutgap_arr_ma, c='orange', alpha=0.5,
                     label='Moving average')
            plt.plot(gap_df['Date'], withoutgap_arr_spline, c='purple', alpha=0.5,
                     label='Spline interpolation')
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.legend(fontsize=15)
            plt.show()

            plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                     label='Actual values')
            plt.plot(gap_df['Date'], withoutgap_arr_batch, c='red', alpha=0.5,
                     label='Batch polynomial approximation')
            plt.plot(gap_df['Date'], withoutgap_arr_kalman, c='orange', alpha=0.5,
                     label='Kalman filtering')
            plt.plot(gap_df['Date'], withoutgap_arr_ridge_30, c='purple', alpha=0.5,
                     label='Ridge 30 ws')
            plt.plot(gap_df['Date'], array_gaps, c='blue', alpha=1.0)
            plt.ylabel('Sea level, m', fontsize=15)
            plt.xlabel('Date', fontsize=15)
            plt.grid()
            plt.legend(fontsize=15)
            plt.show()

        train_part = arr_parameter[:-prediction_len]
        test_part = arr_parameter[-prediction_len:]

        # prepare the parts of the time series with the restored values
        train_part_linear = withoutgap_arr_linear[:-prediction_len]
        train_part_local = withoutgap_arr_local[:-prediction_len]
        train_part_batch = withoutgap_arr_batch[:-prediction_len]
        train_part_kalman = withoutgap_arr_kalman[:-prediction_len]
        train_part_ma = withoutgap_arr_ma[:-prediction_len]
        train_part_spline = withoutgap_arr_spline[:-prediction_len]
        train_part_ridge_30 = withoutgap_arr_ridge_30[:-prediction_len]
        train_part_ridge_100 = withoutgap_arr_ridge_100[:-prediction_len]
        train_part_compose = withoutgap_arr_compose[:-prediction_len]

        if file_name == 'Hour_data_m.csv':
            max_window_size = 50
        else:
            max_window_size = 500

        for sample, model in zip([train_part, train_part_linear, train_part_local,
                                  train_part_batch, train_part_kalman, train_part_ma,
                                  train_part_spline, train_part_ridge_30,
                                  train_part_ridge_100, train_part_compose],
                                 ['Original', 'Linear interpolation',
                                  'Local polynomial approximation',
                                  'Batch polynomial approximation',
                                  'Kalman filtering', 'Moving average',
                                  'Spline interpolation', 'Ridge forward 30 ws',
                                  'Ridge forward 100 ws', 'Chain compose']):
            node_first = PrimaryNode('ridge')
            node_second = PrimaryNode('ridge')
            node_trend_model = SecondaryNode('linear', nodes_from=[node_first])
            node_residual_model = SecondaryNode('linear', nodes_from=[node_second])
            node_final = SecondaryNode('svr',
                                       nodes_from=[node_trend_model, node_residual_model])
            chain = TsForecastingChain(node_final)

            task = Task(TaskTypesEnum.ts_forecasting,
                        TsForecastingParams(forecast_length=prediction_len,
                                            max_window_size=max_window_size,
                                            return_all_steps=False,
                                            make_future_prediction=True))

            input_data = InputData(idx=np.arange(0, len(sample)),
                                   features=None,
                                   target=sample,
                                   task=task,
                                   data_type=DataTypesEnum.ts)

            chain.fit_from_scratch(input_data)

            # "test data" for making a prediction of a specific length
            test_data = InputData(idx=np.arange(0, prediction_len),
                                  features=None,
                                  target=None,
                                  task=task,
                                  data_type=DataTypesEnum.ts)

            predicted_values = chain.forecast(initial_data=input_data,
                                              supplementary_data=test_data).predict

            print(model)
            mae = mean_absolute_error(test_part, predicted_values)
            print('Mean absolute error -', round(mae, 4))
            rmse = (mean_squared_error(test_part, predicted_values)) ** 0.5
            print('RMSE -', round(rmse, 4))
            median_ae = median_absolute_error(test_part, predicted_values)
            print('Median absolute error -', round(median_ae, 4))
            mape = mean_absolute_percentage_error(test_part, predicted_values)
            print('MAPE -', round(mape, 4), '\n')

            if file_name == 'Sea_10_240.csv':
                plt.plot(gap_df['Date'], arr_parameter, c='green', alpha=0.5,
                         label='Actual values')
                plt.plot(gap_df['Date'][:-prediction_len], sample, c='blue',
                         label='Restored series')
                plt.plot(gap_df['Date'][-prediction_len:], predicted_values,
                         c='red', alpha=0.5, label='Model forecast')
                plt.ylabel('Sea level, m', fontsize=15)
                plt.xlabel('Date', fontsize=15)
                plt.grid()
                plt.title(model, fontsize=15)
                plt.legend(fontsize=15)
                plt.show()

            models.append(model)
            mapes_per_model.append(mape)
            files.append(file_name)

    local_df = pd.DataFrame({'MAPE': mapes_per_model,
                             'Model': models,
                             'File': files})

    for model in local_df['Model'].unique():
        local_local_df = local_df[local_df['Model'] == model]
        mape_arr = np.array(local_local_df['MAPE'])
        print(f'Mean error for the {model} model - {np.mean(mape_arr)}')
        for file in local_local_df['File'].unique():
            l_local_local_df = local_local_df[local_local_df['File'] == file]
            print(f'{model}, {file}, MAPE - {l_local_local_df["MAPE"].values[0]}')

plt.show()


folder_to_save = './iccs_article/fedot_ridge_two_way_80'
if __name__ == '__main__':
    # fill in the gaps and check the results
    for file in ['Synthetic.csv', 'Sea_hour.csv', 'Sea_10_240.csv']:
        print(file)
        data = pd.read_csv(f'./data/{file}')
        data['Date'] = pd.to_datetime(data['Date'])
        dataframe = data.copy()

        # a chain with a single model
        chain = TsForecastingChain(PrimaryNode('ridge'))

        # filling in the gaps
        gapfiller = ModelGapFiller(gap_value=-100.0, chain=chain)
        with_gap_array = np.array(data['gap'])
        withoutgap_arr = gapfiller.forward_inverse_filling(with_gap_array,
                                                           max_window_size=80)
        dataframe['gap'] = withoutgap_arr

        validate(parameter='Height',
                 mask='gap',
                 data=data,
                 withoutgap_arr=withoutgap_arr)

        save_path = os.path.join(folder_to_save, file)
        # create the folder if it does not exist
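        # A plausible completion of the truncated step above (an assumption
        # based on the preceding comment and on save_path, not original code):
        # create the output folder and save the restored series.
        if not os.path.exists(folder_to_save):
            os.makedirs(folder_to_save)
        dataframe.to_csv(save_path, index=False)
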
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     with_visualisation=True):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size,
                                             return_all_steps=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for the final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)

    time_limit_min = 10
    available_model_types = ['linear', 'ridge', 'lasso', 'rfr', 'dtreg',
                             'knnreg', 'svr']

    if max_window_size == 1:
        # unit-test configuration
        available_model_types = ['linear', 'ridge']
        time_limit_min = 0.001

    # each possible single-model chain
    for model in available_model_types:
        chain = TsForecastingChain(PrimaryNode(model))
        chain.fit(input_data=dataset_to_train, verbose=False)
        calculate_validation_metric(chain.predict(dataset_to_validate),
                                    dataset_to_validate,
                                    is_visualise=with_visualisation,
                                    label=model)

    # static multiscale chain
    multiscale_chain = get_composite_multiscale_chain()
    multiscale_chain.fit(input_data=dataset_to_train, verbose=False)
    calculate_validation_metric(multiscale_chain.predict(dataset_to_validate),
                                dataset_to_validate,
                                is_visualise=with_visualisation,
                                label='Fixed multiscale')

    # static all-in-one ensemble chain
    ens_chain = get_ensemble_chain()
    ens_chain.fit(input_data=dataset_to_train, verbose=False)
    calculate_validation_metric(ens_chain.predict(dataset_to_validate),
                                dataset_to_validate,
                                is_visualise=with_visualisation,
                                label='Ensemble composite')

    # optimized ensemble chain
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types, max_arity=5,
        max_depth=2, pop_size=10, num_of_generations=10,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=time_limit_min),
        add_single_model_chains=False)

    builder = GPComposerBuilder(task=task_to_solve).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    chain = composer.compose_chain(data=dataset_to_train, is_visualise=False)
    chain.fit_from_scratch(input_data=dataset_to_train, verbose=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    calculate_validation_metric(chain.predict(dataset_to_validate),
                                dataset_to_validate,
                                is_visualise=with_visualisation,
                                label='Automated ensemble')

    # optimized multiscale chain
    available_model_types_primary = ['trend_data_model', 'residual_data_model']
    available_model_types_secondary = ['linear', 'ridge', 'lasso', 'rfr',
                                       'dtreg', 'knnreg', 'svr']
    available_model_types_all = (available_model_types_primary +
                                 available_model_types_secondary)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_all,
        secondary=available_model_types_secondary, max_arity=5,
        max_depth=2, pop_size=10, num_of_generations=30,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=time_limit_min))

    builder = GPComposerBuilder(task=task_to_solve).with_requirements(
        composer_requirements).with_metrics(
        metric_function).with_initial_chain(multiscale_chain)
    composer = builder.build()

    chain = composer.compose_chain(data=dataset_to_train, is_visualise=False)
    chain.fit_from_scratch(input_data=dataset_to_train, verbose=False)

    if with_visualisation:
        visualiser = ChainVisualiser()
        visualiser.visualise(chain)

    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate), dataset_to_validate,
        is_visualise=with_visualisation,
        label='Automated multiscale')

    return rmse_on_valid