Example #1
def test_forecast_with_exog():
    train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test = synthetic_with_exogenous_ts()

    # Source data for lagged node
    node_lagged = PrimaryNode('lagged')
    # Set window size for lagged transformation
    # (window_size is a module-level constant in the source test)
    node_lagged.custom_params = {'window_size': window_size}
    # Exogenous variable for exog node
    node_exog = PrimaryNode('exog_ts_data_source')

    node_final = SecondaryNode('linear', nodes_from=[node_lagged, node_exog])
    pipeline = Pipeline(node_final)

    pipeline.fit(input_data=MultiModalData({
        'exog_ts_data_source': train_exog_ts,
        'lagged': train_source_ts
    }))

    forecast = pipeline.predict(
        input_data=MultiModalData({
            'exog_ts_data_source': predict_exog_ts,
            'lagged': predict_source_ts
        }))
    prediction = np.ravel(np.array(forecast.predict))

    assert tuple(prediction) == tuple(ts_test)
Example #2
def _train_test_multi_modal_data_setup(data: MultiModalData, split_ratio=0.8,
                                       shuffle_flag=False) -> Tuple[MultiModalData, MultiModalData]:
    train_data = MultiModalData()
    test_data = MultiModalData()
    for node in data.keys():
        data_part = data[node]
        train_data_part, test_data_part = train_test_data_setup(data_part, split_ratio, shuffle_flag)
        train_data[node] = train_data_part
        test_data[node] = test_data_part

    return train_data, test_data
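
A minimal usage sketch for this helper, assuming the FEDOT import paths used
across these examples and small synthetic arrays (all data below is
illustrative):

import numpy as np
from fedot.core.data.data import InputData
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum

task = Task(TaskTypesEnum.classification)
target = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
table_part = InputData(idx=np.arange(10), features=np.random.rand(10, 3),
                       target=target, data_type=DataTypesEnum.table, task=task)
text_part = InputData(idx=np.arange(10), features=np.array(['doc'] * 10),
                      target=target, data_type=DataTypesEnum.text, task=task)

data = MultiModalData({'data_source_table': table_part,
                       'data_source_text': text_part})
train, test = _train_test_multi_modal_data_setup(data, split_ratio=0.8)
# each modality is split with the same ratio, so the parts stay aligned
assert set(train.keys()) == set(test.keys())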
Example #3
    def multimodal_data_preparing(name):
        pipeline = pipelines[name]['model']
        train_dict = {}
        predict_dict = {}
        for key in pipelines[name]['tr_nodes_data'].keys():
            train_dict[key] = deepcopy(pipelines[name]['tr_nodes_data'][key])
        for key in pipelines[name]['pr_nodes_data'].keys():
            predict_dict[key] = deepcopy(pipelines[name]['pr_nodes_data'][key])

        train_dataset = MultiModalData(train_dict)
        predict_dataset = MultiModalData(predict_dict)
        return pipeline, train_dataset, predict_dataset
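
For context, a structural sketch of the pipelines registry this nested helper
closes over; the key layout is inferred from the lookups above, and
make_input_parts is a hypothetical factory for the InputData parts:

from copy import deepcopy  # also required by the helper above
from fedot.core.pipelines.node import PrimaryNode
from fedot.core.pipelines.pipeline import Pipeline

train_ts, predict_ts = make_input_parts()  # hypothetical InputData factory
pipelines = {
    'exog_model': {
        'model': Pipeline(PrimaryNode('lagged')),
        'tr_nodes_data': {'lagged': train_ts},   # node name -> fit data
        'pr_nodes_data': {'lagged': predict_ts}  # node name -> predict data
    }
}
model, train_dataset, predict_dataset = multimodal_data_preparing('exog_model')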
Example #4
def test_multi_modal_data():
    num_samples = 5
    target = np.asarray([0, 0, 1, 0, 1])
    img_data = InputData(
        idx=range(num_samples),
        features=None,  # the real data is not passed in this test
        target=target,
        data_type=DataTypesEnum.image,
        task=Task(TaskTypesEnum.classification))
    tbl_data = InputData(
        idx=range(num_samples),
        features=None,  # the real data is not passed in this test
        target=target,
        data_type=DataTypesEnum.table,
        task=Task(TaskTypesEnum.classification))

    multi_modal = MultiModalData({
        'data_source_img': img_data,
        'data_source_table': tbl_data,
    })

    assert multi_modal.task.task_type == TaskTypesEnum.classification
    assert len(multi_modal.idx) == 5
    assert multi_modal.num_classes == 2
    assert np.array_equal(multi_modal.target, target)
Example #5
def test_forecast_with_sparse_lagged():
    train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test = synthetic_with_exogenous_ts()

    # Source data for lagged node
    node_lagged = PrimaryNode('sparse_lagged')
    # Set window size for lagged transformation
    # (window_size is a module-level constant in the source test)
    node_lagged.custom_params = {'window_size': window_size}

    node_final = SecondaryNode('linear', nodes_from=[node_lagged])
    pipeline = Pipeline(node_final)

    pipeline.fit(input_data=MultiModalData({'sparse_lagged': train_source_ts}))

    forecast = pipeline.predict(
        input_data=MultiModalData({'sparse_lagged': predict_source_ts}))
    # reaching this point means fit and predict completed without raising
    assert forecast is not None
Example #6
def make_forecast(pipeline, train: InputData, predict: InputData,
                  train_exog: InputData, predict_exog: InputData):
    """
    Function for predicting values in a time series

    :return predicted_values: numpy array, forecast of model
    """

    # Fit it
    start_time = timeit.default_timer()

    second_node_name = 'exog_ts_data_source'

    if train_exog is None:
        second_node_name = 'lagged/2'
        train_exog = train
        predict_exog = predict

    train_dataset = MultiModalData({
        'lagged/1': train,
        second_node_name: train_exog,
    })

    predict_dataset = MultiModalData({
        'lagged/1': predict,
        second_node_name: predict_exog,
    })

    pipeline.fit_from_scratch(train_dataset)
    amount_of_seconds = timeit.default_timer() - start_time

    print(f'\nIt took {amount_of_seconds:.2f} seconds to train the pipeline\n')

    # Predict
    predicted_values = pipeline.predict(predict_dataset)
    predicted_values = predicted_values.predict

    return predicted_values
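
The pipeline passed to make_forecast must contain primary nodes named exactly
like the MultiModalData keys used above. A minimal matching two-branch
pipeline is sketched below ('ridge' as the root model is an illustrative
choice, and train/predict are assumed to be InputData objects prepared by the
caller):

from fedot.core.pipelines.node import PrimaryNode, SecondaryNode
from fedot.core.pipelines.pipeline import Pipeline

node_lagged_1 = PrimaryNode('lagged/1')
node_lagged_2 = PrimaryNode('lagged/2')
node_final = SecondaryNode('ridge', nodes_from=[node_lagged_1, node_lagged_2])
two_branch_pipeline = Pipeline(node_final)

# with train_exog=None both branches receive the source time series
predicted = make_forecast(two_branch_pipeline, train, predict,
                          train_exog=None, predict_exog=None)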
Example #7
def generate_initial_pipeline_and_data(images_size,
                                       train_num, test_num,
                                       train_img, test_img,
                                       train_text, test_text):
    # image
    ds_image = PrimaryNode('data_source_img/1')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table
    ds_table = PrimaryNode('data_source_table/2')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text
    ds_text = PrimaryNode('data_source_text/3')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node]))

    fit_data = MultiModalData({
        'data_source_img/1': train_img,
        'data_source_table/2': train_num,
        'data_source_text/3': train_text
    })
    predict_data = MultiModalData({
        'data_source_img/1': test_img,
        'data_source_table/2': test_num,
        'data_source_text/3': test_text
    })

    return pipeline, fit_data, predict_data
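
A hedged usage sketch; the six data parts are assumed to come from a splitting
helper such as prepare_multi_modal_data (used in the next example) with
with_split=True:

pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(
    images_size=(128, 128),
    train_num=train_num, test_num=test_num,
    train_img=train_img, test_img=test_img,
    train_text=train_text, test_text=test_text)
pipeline.fit(fit_data)
prediction = pipeline.predict(predict_data)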
Example #8
def test_multi_modal_pipeline():
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)

    train_num, _, train_img, _, train_text, _ = \
        prepare_multi_modal_data(path, task, images_size, with_split=False)

    # image
    ds_image = PrimaryNode('data_source_img')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table
    ds_table = PrimaryNode('data_source_table')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text
    ds_text = PrimaryNode('data_source_text')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node]))

    fit_data = MultiModalData({
        'data_source_img': train_img,
        'data_source_table': train_num,
        'data_source_text': train_text
    })

    pipeline.fit(fit_data)
    prediction = pipeline.predict(fit_data)

    assert prediction is not None
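
With with_split=True the same helper would also return hold-out parts matching
the signature of generate_initial_pipeline_and_data from the previous example
(a sketch, assuming the flag splits every modality consistently):

train_num, test_num, train_img, test_img, train_text, test_text = \
    prepare_multi_modal_data(path, task, images_size, with_split=True)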
Example #9
train_input, predict_input, task = prepare_input_data(
    len_forecast=len_forecast,
    train_data_features=train_data,
    train_data_target=train_data,
    test_data_features=train_data)

# Exogenous time series
train_input_exog, predict_input_exog, _ = prepare_input_data(
    len_forecast=len_forecast,
    train_data_features=train_data_exog,
    train_data_target=train_data,
    test_data_features=test_data_exog)

pipeline = get_arima_pipeline()
train_dataset = MultiModalData({
    'arima': deepcopy(train_input),
})
predict_dataset = MultiModalData({
    'arima': deepcopy(predict_input),
})
pipeline.fit_from_scratch(train_dataset)
predicted_values = pipeline.predict(predict_dataset)
predicted_values = predicted_values.predict

predicted = np.ravel(np.array(predicted_values))
test_data = np.ravel(np.array(test_data))

rmse_before = mean_squared_error(test_data, predicted, squared=False)
mae_before = mean_absolute_error(test_data, predicted)
mape_before = mean_absolute_percentage_error(test_data, predicted)
print(f'ARIMA RMSE - {rmse_before:.4f}')
print(f'ARIMA MAE - {mae_before:.4f}')
print(f'ARIMA MAPE - {mape_before:.4f}')
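
get_arima_pipeline is imported from the example's helper module in the source;
a plausible minimal version is shown below (an assumption, the real helper may
wrap the ARIMA node with extra preprocessing):

from fedot.core.pipelines.node import PrimaryNode
from fedot.core.pipelines.pipeline import Pipeline

def get_arima_pipeline():
    """Single-node pipeline with an 'arima' model as its root (a sketch)."""
    node_arima = PrimaryNode('arima')
    return Pipeline(node_arima)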
Example #10
def run_oil_forecasting(path_to_file, path_to_file_crm, len_forecast,
                        len_forecast_full, ax, well_id, timeout):
    if timeout is None:
        timeout = 1
    df = pd.read_csv(path_to_file, sep=' *, *', engine='python')
    df_crm = pd.read_csv(path_to_file_crm, sep=' *, *', engine='python')

    len_forecast_for_split = len_forecast_full
    (dates, target_train, data_fit, data_predict, input_data_fit,
     input_data_predict, test_data, train_data,
     time_series) = prepare_dataset(df, len_forecast, len_forecast_for_split, well_id)

    (dates, target_train_crm, data_fit_crm, data_predict_crm, input_data_fit_crm,
     input_data_predict_crm, test_data_crm, train_data,
     time_series) = prepare_dataset(df_crm, len_forecast, len_forecast_for_split, well_id)

    task_parameters = TsForecastingParams(forecast_length=len_forecast)

    if not os.path.exists(f'pipeline_{well_id}/pipeline_{well_id}.json'):
        model = Fedot(problem='ts_forecasting',
                      task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light',
                      verbose_level=4)

        # run AutoML model design in the same way
        pipeline = model.fit(features=data_fit, target=target_train)
        pipeline.save(f'pipeline_{well_id}')  # , datetime_in_path=False)
    else:
        pipeline = Pipeline()
        pipeline.load(f'pipeline_{well_id}/pipeline_{well_id}.json')

    if not os.path.exists(
            f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json'):
        model = Fedot(problem='ts_forecasting',
                      task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light',
                      verbose_level=4)

        # run AutoML model design in the same way
        pipeline_crm = model.fit(features=data_fit_crm,
                                 target=target_train_crm)
        pipeline_crm.save(
            f'pipeline_crm_{well_id}')  # , datetime_in_path=False)
    else:
        pipeline_crm = Pipeline()
        pipeline_crm.load(
            f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json')

    sources = {f'data_source_ts/{data_part_key}': data_part
               for data_part_key, data_part in input_data_predict.items()}
    input_data_predict_mm = MultiModalData(sources)

    sources_crm = {f'data_source_ts/{data_part_key}': data_part
                   for data_part_key, data_part in input_data_predict_crm.items()}
    input_data_predict_mm_crm = MultiModalData(sources_crm)

    forecast = in_sample_ts_forecast(pipeline,
                                     input_data_predict_mm,
                                     horizon=len_forecast_full)
    forecast_crm = in_sample_ts_forecast(pipeline_crm,
                                         input_data_predict_mm_crm,
                                         horizon=len_forecast_full)

    predicted = np.ravel(np.array(forecast))
    predicted_crm = np.ravel(np.array(forecast_crm))
    predicted_only_crm = np.asarray(
        df_crm[f'crm_{well_id}'][-len_forecast_full:])

    test_data = np.ravel(test_data)

    print('CRM')
    predicted_only_crm[np.isnan(predicted_only_crm)] = 0
    mse_before = mean_squared_error(test_data,
                                    predicted_only_crm,
                                    squared=False)
    mae_before = mean_absolute_error(test_data, predicted_only_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('ML')
    mse_before = mean_squared_error(test_data, predicted, squared=False)
    mae_before = mean_absolute_error(test_data, predicted)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('AutoML+CRM')
    mse_before = mean_squared_error(test_data, predicted_crm, squared=False)
    mae_before = mean_absolute_error(test_data, predicted_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    if ax:
        x_for = range(len(train_data), len(time_series))
        ax.plot(x_for,
                time_series[-len_forecast_full:],
                label='Actual time series',
                linewidth=0.5)
        ax.plot(x_for, predicted_crm, label='AutoML+CRM', linewidth=0.5)
        ax.plot(x_for, predicted_only_crm, label='CRM', linewidth=0.5)

        ci_crm = t_conf_interval(np.std(predicted_crm), 0.975,
                                 len(predicted_crm)) * 1.96
        ax.fill_between(x_for, (predicted_crm - ci_crm),
                        (predicted_crm + ci_crm),
                        color='orange',
                        alpha=.5)

        ci_crmonly = t_conf_interval(np.std(predicted_only_crm), 0.975,
                                     len(predicted_only_crm)) * 1.96
        ax.fill_between(x_for, (predicted_only_crm - ci_crmonly),
                        (predicted_only_crm + ci_crmonly),
                        color='green',
                        alpha=.5)

        ax.set(xlabel='Days from 2013.06.01', ylabel='Oil volume, m3')
        if well_id == '5351':
            ax.legend()
        ax.set_title(well_id)
        ax.plot()
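
A hedged driver sketch for this function (the CSV paths are illustrative
placeholders; well id '5351' is the one the legend branch above expects):

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
run_oil_forecasting(path_to_file='oil_data.csv',
                    path_to_file_crm='oil_data_crm.csv',
                    len_forecast=100, len_forecast_full=400,
                    ax=ax, well_id='5351', timeout=1)
plt.show()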
Example #11
def _define_data(ml_task: Task,
                 features: Union[str, np.ndarray, pd.DataFrame, InputData,
                                 dict],
                 target: Union[str, np.ndarray, pd.Series] = None,
                 is_predict=False):
    if type(features) == InputData:
        # native FEDOT format for input data
        data = features
        data.task = ml_task
    elif type(features) == pd.DataFrame:
        # pandas format for input data
        if target is None:
            target = np.array([])

        if isinstance(target, str) and target in features.columns:
            target_array = features[target]
            del features[target]
        else:
            target_array = target

        data = array_to_input_data(features_array=np.asarray(features),
                                   target_array=np.asarray(target_array),
                                   task=ml_task)
    elif type(features) == np.ndarray:
        # numpy format for input data
        if target is None:
            target = np.array([])

        if isinstance(target, str):
            # note: resolving a column name against a plain numpy array only
            # works for structured arrays; `del` on an ndarray raises an error
            target_array = features[target]
            del features[target]
        else:
            target_array = target

        data = array_to_input_data(features_array=features,
                                   target_array=target_array,
                                   task=ml_task)
    elif type(features) == tuple:
        data = array_to_input_data(features_array=features[0],
                                   target_array=features[1],
                                   task=ml_task)
    elif type(features) == str:
        # CSV files as input data, by default - table data
        if target is None:
            target = 'target'

        data_type = DataTypesEnum.table
        if ml_task.task_type == TaskTypesEnum.ts_forecasting:
            # For time series forecasting format - time series
            data = InputData.from_csv_time_series(task=ml_task,
                                                  file_path=features,
                                                  target_column=target,
                                                  is_predict=is_predict)
        else:
            # Make a default features table from the CSV file
            data = InputData.from_csv(features,
                                      task=ml_task,
                                      target_columns=target,
                                      data_type=data_type)
    elif type(features) == dict:
        if target is None:
            target = np.array([])
        target_array = target

        data_part_transformation_func = partial(array_to_input_data,
                                                target_array=target_array,
                                                task=ml_task)

        # create labels for the data sources
        sources = {f'data_source_ts/{data_part_key}':
                   data_part_transformation_func(features_array=data_part)
                   for data_part_key, data_part in features.items()}
        data = MultiModalData(sources)
    else:
        raise ValueError('Please specify features as a path to a CSV file, '
                         'a numpy array, a pandas DataFrame, an InputData '
                         'instance, a tuple or a dict')

    return data
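
A usage sketch for the dict branch, which is the one that yields a
MultiModalData; the series names and forecast length are illustrative:

import numpy as np
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams

ts_task = Task(TaskTypesEnum.ts_forecasting,
               TsForecastingParams(forecast_length=10))
series = {'main': np.sin(np.arange(100) / 5),
          'exog': np.cos(np.arange(100) / 5)}
data = _define_data(ml_task=ts_task, features=series,
                    target=np.sin(np.arange(100) / 5))
# the keys become 'data_source_ts/main' and 'data_source_ts/exog'
assert isinstance(data, MultiModalData)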