Esempio n. 1
0
def run_forecasting(chain: TsForecastingChain, data: InputData,
                    is_visualise: bool, desc: str):
    """Fit ``chain`` on the head of ``data`` and forecast the held-out tail.

    The data is split without shuffling at a 0.9 ratio; the chain is fitted
    from scratch on the first part and then asked to forecast over a copy of
    the second part whose target has been removed.

    :param chain: forecasting chain to fit and query
    :param data: full input data; its task parameters are mutated
        (``make_future_prediction`` is set to ``True``)
    :param is_visualise: when true, the forecast is passed to
        ``plot_prediction`` together with the untouched held-out part
    :param desc: description forwarded to ``plot_prediction``
    """
    fit_part, holdout_part = train_test_data_setup(
        data, shuffle_flag=False, split_ratio=0.9)

    data.task.task_params.make_future_prediction = True
    chain.fit_from_scratch(fit_part)

    # The chain must not see the true values of the period it forecasts,
    # so the target is cleared on a copy (the original is kept for plotting).
    blind_holdout = copy(holdout_part)
    blind_holdout.target = None

    # Set again after fitting, exactly as before the fit.
    data.task.task_params.make_future_prediction = True

    forecast_output = chain.forecast(initial_data=fit_part,
                                     supplementary_data=blind_holdout)
    predicted = forecast_output.predict

    if not is_visualise:
        return
    plot_prediction(predicted, holdout_part, desc)
Esempio n. 2
0
def test_ts_single_chain_model_without_multiotput_support():
    """Check that single-node chains can forecast two steps ahead.

    A synthetic series is split into a training head and a two-point tail.
    For every listed model a one-node ``TsForecastingChain`` is fitted on the
    head, asked to forecast the tail, and the mean absolute error against the
    real tail must stay below 50.
    """
    horizon = 2
    series = generate_synthetic_data(10)
    history, expected_tail = series[:-horizon], series[-horizon:]

    forecasting_params = TsForecastingParams(forecast_length=horizon,
                                             max_window_size=2,
                                             return_all_steps=False,
                                             make_future_prediction=True)
    task = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    train_data = InputData(idx=np.arange(0, len(history)),
                           features=None,
                           target=history,
                           task=task,
                           data_type=DataTypesEnum.ts)

    single_model_ids = ('xgbreg', 'gbr', 'adareg', 'svr', 'sgdr')
    for model_id in single_model_ids:
        chain = TsForecastingChain(PrimaryNode(model_id))
        chain.fit_from_scratch(train_data)

        # Target-less placeholder that only tells the chain how many
        # steps to predict; rebuilt for every model.
        forecast_request = InputData(idx=np.arange(0, horizon),
                                     features=None,
                                     target=None,
                                     task=task,
                                     data_type=DataTypesEnum.ts)

        forecast_output = chain.forecast(initial_data=train_data,
                                         supplementary_data=forecast_request)

        mae = mean_absolute_error(expected_tail, forecast_output.predict)
        assert mae < 50
Esempio n. 3
0
# Gap-filling methods under comparison as (report label, sub-directory of
# ``path``). The order defines both the CSV reading order and the order in
# which the methods are evaluated and reported.
_GAP_FILLING_METHODS = (
    # Simple methods
    ('Linear interpolation', 'linear'),
    ('Local polynomial approximation', 'poly'),
    ('Batch polynomial approximation', 'batch_poly'),
    # Gap-filling methods implemented in R
    ('Kalman filtering', 'kalman'),
    ('Moving average', 'ma'),
    ('Spline interpolation', 'spline'),
    # FEDOT gap-filling methods
    ('Ridge forward 30 ws', 'fedot_ridge_30'),
    ('Ridge forward 100 ws', 'fedot_ridge_100'),
    ('Chain compose', 'fedot_composing'),
)


def _plot_gap_overview(dates, actual, observed, gap_ids):
    """Show the actual series, a faint vertical marker per gap and the observed (masked) series."""
    plt.plot(dates, actual, c='red', alpha=0.2)
    if gap_ids.size:
        # Loop-invariant baseline of the vertical markers, computed once.
        floor = min(actual)
        for index in gap_ids:
            plt.plot([dates[index], dates[index]], [floor, actual[index]],
                     c='red', alpha=0.05)
    plt.plot(dates, observed, c='blue', alpha=1.0)
    plt.ylabel('Sea level, m', fontsize=15)
    plt.xlabel('Date', fontsize=15)
    plt.grid()
    plt.show()


def _plot_restored_series(dates, actual, observed, restored):
    """Show the actual series against up to three ``(values, legend)`` restored series."""
    plt.plot(dates, actual, c='green', alpha=0.5, label='Actual values')
    for (values, legend), colour in zip(restored, ('red', 'orange', 'purple')):
        plt.plot(dates, values, c=colour, alpha=0.5, label=legend)
    plt.plot(dates, observed, c='blue', alpha=1.0)
    plt.ylabel('Sea level, m', fontsize=15)
    plt.xlabel('Date', fontsize=15)
    plt.grid()
    plt.legend(fontsize=15)
    plt.show()


def _fit_and_forecast(sample, prediction_len, max_window_size):
    """Fit a fresh ridge -> linear -> svr chain on ``sample`` and return its forecast."""
    node_first = PrimaryNode('ridge')
    node_second = PrimaryNode('ridge')
    node_trend_model = SecondaryNode('linear', nodes_from=[node_first])
    node_residual_model = SecondaryNode('linear', nodes_from=[node_second])

    node_final = SecondaryNode('svr', nodes_from=[node_trend_model,
                                                  node_residual_model])
    chain = TsForecastingChain(node_final)

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=prediction_len,
                                    max_window_size=max_window_size,
                                    return_all_steps=False,
                                    make_future_prediction=True))

    input_data = InputData(idx=np.arange(0, len(sample)),
                           features=None,
                           target=sample,
                           task=task,
                           data_type=DataTypesEnum.ts)

    chain.fit_from_scratch(input_data)

    # "Test data" for making prediction for a specific length
    test_data = InputData(idx=np.arange(0, prediction_len),
                          features=None,
                          target=None,
                          task=task,
                          data_type=DataTypesEnum.ts)

    return chain.forecast(initial_data=input_data,
                          supplementary_data=test_data).predict


def forecasting_accuracy(path, prediction_len, vis=True):
    """Compare how different gap-filling methods affect forecasting accuracy.

    For each of the three hard-coded CSV files the function reads

    * ``<path>/<file>`` with the columns ``Date``, ``Height`` (the series
      without gaps) and ``gap`` (the same series where ``-100.0`` marks a
      gap), and
    * ``<path>/<method dir>/<file>`` for every entry of
      ``_GAP_FILLING_METHODS``, whose ``gap`` column holds the series restored
      by that method.

    The last ``prediction_len`` points of ``Height`` are held out. A chain is
    fitted on the remaining part of the original series and of every restored
    series, and its forecast is scored against the held-out points. MAE, RMSE,
    median absolute error and MAPE are printed for each fit; at the end the
    mean MAPE per method and the MAPE per method and file are printed.

    :param path: directory containing the source files and method sub-folders
    :param prediction_len: number of trailing points to hold out and forecast;
        must be positive, because the training slice ``[:-prediction_len]``
        is empty for 0
    :param vis: when true, the gap overview and the restored-series
        comparisons are plotted for every file
    :return: None; results are printed (and plotted)
    """
    mapes_per_model = []
    models = []
    files = []

    for file_name in ['Synthetic.csv', 'Sea_hour.csv', 'Sea_10_240.csv']:
        # Source file with gaps
        gap_df = pd.read_csv(os.path.join(path, file_name))
        gap_df['Date'] = pd.to_datetime(gap_df['Date'])
        dates = gap_df['Date']

        # Output of every gap-filling method for the same file
        restored_dfs = [(label, pd.read_csv(os.path.join(path, folder, file_name)))
                        for label, folder in _GAP_FILLING_METHODS]

        # Original time series without gaps
        arr_parameter = np.array(gap_df['Height'])
        # Time series with gaps
        arr_mask = np.array(gap_df['gap'])
        ids_gaps = np.ravel(np.argwhere(arr_mask == -100.0))

        array_gaps = np.ma.masked_where(arr_mask == -100.0, arr_mask)

        if vis:
            _plot_gap_overview(dates, arr_parameter, array_gaps, ids_gaps)

        restored = {label: np.array(df['gap']) for label, df in restored_dfs}

        if vis:
            _plot_restored_series(dates, arr_parameter, array_gaps, [
                (restored['Linear interpolation'], 'Linear interpolation'),
                (restored['Local polynomial approximation'], 'Local polynomial approximation'),
                (restored['Batch polynomial approximation'], 'Batch polynomial approximation'),
            ])
            _plot_restored_series(dates, arr_parameter, array_gaps, [
                (restored['Kalman filtering'], 'Kalman filtering'),
                (restored['Moving average'], 'Moving average'),
                (restored['Spline interpolation'], 'Spline interpolation'),
            ])
            _plot_restored_series(dates, arr_parameter, array_gaps, [
                (restored['Batch polynomial approximation'], 'Batch polynomial approximation'),
                (restored['Kalman filtering'], 'Kalman filtering'),
                (restored['Ridge forward 30 ws'], 'Ridge 30 ws'),
            ])

        test_part = arr_parameter[-prediction_len:]

        # Prepare the training part of the original and of every restored series
        samples = [('Original', arr_parameter[:-prediction_len])]
        for label, _ in _GAP_FILLING_METHODS:
            samples.append((label, restored[label][:-prediction_len]))

        # NOTE(review): 'Hour_data_m.csv' is not among the files iterated
        # above, so the 50-point window is never selected and 500 is always
        # used. Kept as-is; confirm which file (if any) was meant.
        if file_name == 'Hour_data_m.csv':
            max_window_size = 50
        else:
            max_window_size = 500

        for model, sample in samples:
            predicted_values = _fit_and_forecast(sample, prediction_len,
                                                 max_window_size)

            print(model)
            mae = mean_absolute_error(test_part, predicted_values)
            print('Mean absolute error -', round(mae, 4))

            rmse = (mean_squared_error(test_part, predicted_values)) ** 0.5
            print('RMSE -', round(rmse, 4))

            median_ae = median_absolute_error(test_part, predicted_values)
            print('Median absolute error -', round(median_ae, 4))

            mape = mean_absolute_percentage_error(test_part, predicted_values)
            print('MAPE -', round(mape, 4), '\n')

            # NOTE(review): these forecast plots are shown regardless of
            # ``vis``; behaviour kept unchanged - confirm whether intended.
            if file_name == 'Sea_10_240.csv':
                plt.plot(dates, arr_parameter, c='green', alpha=0.5, label='Actual values')
                plt.plot(dates[:-prediction_len], sample, c='blue', label='Restored series')
                plt.plot(dates[-prediction_len:], predicted_values, c='red', alpha=0.5, label='Model forecast')
                plt.ylabel('Sea level, m', fontsize=15)
                plt.xlabel('Date', fontsize=15)
                plt.grid()
                plt.title(model, fontsize=15)
                plt.legend(fontsize=15)
                plt.show()

            models.append(model)
            mapes_per_model.append(mape)
            files.append(file_name)

    local_df = pd.DataFrame({'MAPE': mapes_per_model,
                             'Model': models,
                             'File': files})

    for model in local_df['Model'].unique():
        model_df = local_df[local_df['Model'] == model]
        mape_arr = np.array(model_df['MAPE'])

        print(f'Среднее значение ошибки для модели {model} - {np.mean(mape_arr)}')
        for file in model_df['File'].unique():
            file_df = model_df[model_df['File'] == file]
            # Every (model, file) pair was evaluated once, so there is exactly
            # one row. float() on a Series is deprecated since pandas 2.1, so
            # the scalar is taken explicitly.
            print(f'{model}, {file}, MAPE - {float(file_df["MAPE"].iloc[0])}')