コード例 #1
0
def run_oil_forecasting_problem(train_file_path,
                                train_file_path_crm,
                                forecast_length,
                                max_window_size,
                                is_visualise=False,
                                well_id='Unknown'):
    """
    Fit three forecasting chains on sliding windows of well data and report
    the validation metrics.

    For each of four window shifts the function fits a single-node 'lstm'
    chain on the plain series, a single-node 'lstm' chain on the CRM series
    and the chain returned by ``get_comp_chain()`` on the CRM series, predicts
    on the window shifted one ``depth`` ahead and merges the per-step
    predictions. The merged predictions are then passed to
    ``calculate_validation_metric``.

    :param train_file_path: path (relative to the project root) of the CSV
        with the plain series; it is used for both training and validation.
    :param train_file_path_crm: path (relative to the project root) of the
        CSV with the CRM series; validation uses a copy of the loaded data.
    :param forecast_length: forwarded to ``TsForecastingParams``.
    :param max_window_size: forwarded to ``TsForecastingParams``.
    :param is_visualise: forwarded to ``calculate_validation_metric``.
    :param well_id: identifier that is printed and forwarded to
        ``calculate_validation_metric``.
    :return: the value returned by ``calculate_validation_metric``; its items
        0..7 are printed, rounded, under RMSE/DTW labels.
    """
    forecasting_task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False,
                            make_future_prediction=False))

    def _load_series(relative_path):
        # Resolve the path against the project root and read it as a
        # comma-delimited time series for the forecasting task.
        absolute_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(absolute_path,
                                  task=forecasting_task,
                                  data_type=DataTypesEnum.ts,
                                  delimiter=',')

    dataset_to_train = _load_series(train_file_path)
    # the final validation of the composed model reads the same file again
    dataset_to_validate = _load_series(train_file_path)
    dataset_to_train_crm = _load_series(train_file_path_crm)
    dataset_to_validate_crm = copy(dataset_to_train_crm)

    merged_ml = None
    merged_ml_crm = None
    merged_evo_crm = None

    window_shifts = 4
    depth = 100

    for step in range(window_shifts):
        # The training window spans three `depth` units starting at the
        # current shift; the validation window is the same span moved one
        # `depth` forward.
        train_from = depth * step
        train_to = depth * 2 + depth * (step + 1)
        valid_from = train_from + depth
        valid_to = train_to + depth

        train_window = dataset_to_train.subset(train_from, train_to)
        train_window_crm = dataset_to_train_crm.subset(train_from, train_to)
        valid_window = dataset_to_validate.subset(valid_from, valid_to)
        valid_window_crm = dataset_to_validate_crm.subset(valid_from,
                                                          valid_to)

        # (chain, data to fit on, data to predict on) in the order the chains
        # are built, fitted and queried.
        experiments = (
            (Chain(PrimaryNode('lstm')), train_window, valid_window),
            (Chain(PrimaryNode('lstm')), train_window_crm, valid_window_crm),
            (get_comp_chain(), train_window_crm, valid_window_crm),
        )

        # Fit every chain before asking any of them for a prediction.
        for chain, fit_data, _ in experiments:
            chain.fit_from_scratch(input_data=fit_data, verbose=False)

        step_ml, step_ml_crm, step_evo_crm = [
            chain.predict(predict_data)
            for chain, _, predict_data in experiments
        ]

        merged_ml = merge_datasets(merged_ml, step_ml, step)
        merged_ml_crm = merge_datasets(merged_ml_crm, step_ml_crm, step)
        merged_evo_crm = merge_datasets(merged_evo_crm, step_evo_crm, step)

    metrics = calculate_validation_metric(
        merged_ml, merged_ml_crm, merged_evo_crm,
        dataset_to_validate, well_id, is_visualise)

    print(well_id)
    metric_labels = (
        'RMSE CRM',
        'RMSE ML',
        'RMSE ML with CRM',
        'Evo RMSE ML with CRM',
        'DTW CRM',
        'DTW ML',
        'DTW ML with CRM',
        'DTW RMSE ML with CRM',
    )
    for position, label in enumerate(metric_labels):
        print(f'{label}: {round(metrics[position])}')

    return metrics
コード例 #2
0
def synthetic_benchmark_dataset(samples_amount: int,
                                features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """
    Generates a binary classification benchmark dataset that was obtained using
    the (TODO: add. reference) proposed fitting schema.

    The schema implemented here is: generate a sample, label it with the
    chain's predictions, call ``fit_from_scratch`` on the chain with those
    labels, then generate a second sample and label it with the refitted
    chain. The second labelled sample is returned.

    :param samples_amount: Total amount of samples in the resulted dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset.
    :param features_options: features options in key-value suitable for classification_dataset.
    :param fitted_chain: Chain with separately fitted models.
    If None then 3-level balanced tree were fitted and taken as a default.
    Note that ``fit_from_scratch`` is called on this chain.
    :return: Benchmark dataset that is ready to be used by Chain.
    :raises NotImplementedError: if ``classes_amount`` is not 2.
    """
    # Reject unsupported requests first, so that no default chain is built
    # for a call that is going to fail anyway.
    if classes_amount != 2:
        raise NotImplementedError(
            'Only binary classification tasks are supported')

    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    task = Task(TaskTypesEnum.classification)

    def _generate_chain_labelled_data() -> InputData:
        # Draw a fresh sample and replace its target with labels derived
        # from the current state of the chain.
        features, target = classification_dataset(
            samples_amount=samples_amount,
            features_amount=features_amount,
            classes_amount=classes_amount,
            features_options=features_options)
        # add a trailing axis to the generated target before wrapping it
        target = np.expand_dims(target, axis=1)

        generated = InputData(idx=np.arange(0, samples_amount),
                              features=features,
                              target=target,
                              task=task,
                              data_type=DataTypesEnum.table)

        synth_target = fitted_chain.predict(input_data=generated).predict
        synth_labels = _to_labels(synth_target)
        return InputData(idx=np.arange(0, samples_amount),
                         features=features,
                         target=synth_labels,
                         task=task,
                         data_type=DataTypesEnum.table)

    data_synth_train = _generate_chain_labelled_data()

    # TODO: fix preproc issues

    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    # The returned dataset comes from a second, independent generation that
    # is labelled by the chain after it was refitted above.
    data_synth_final = _generate_chain_labelled_data()

    return data_synth_final