def run_oil_forecasting_problem(train_file_path,
                                train_file_path_crm,
                                forecast_length, max_window_size,
                                is_visualise=False,
                                well_id='Unknown'):
    """
    Fit three forecasting chains on sliding windows and report their metrics.

    For each of 4 window shifts (step of 100 rows) the function fits
    a single-node 'lstm' chain on the plain series, a single-node 'lstm'
    chain on the CRM series and the chain returned by ``get_comp_chain()``
    on the CRM series. Each chain then predicts on a window shifted forward
    by one more step, and the per-window predictions are accumulated with
    ``merge_datasets``.

    :param train_file_path: path of the plain series CSV, relative to
        ``project_root()``. The validation dataset is read from this same file.
    :param train_file_path_crm: path of the CRM series CSV, relative to
        ``project_root()``.
    :param forecast_length: forwarded to ``TsForecastingParams``.
    :param max_window_size: forwarded to ``TsForecastingParams``.
    :param is_visualise: forwarded to ``calculate_validation_metric``.
    :param well_id: printed before the metrics and forwarded to
        ``calculate_validation_metric``.
    :return: the value returned by ``calculate_validation_metric``; its items
        0..7 are printed (rounded) under the RMSE/DTW labels below.
    """
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False,
                            make_future_prediction=False))

    def _read_series(relative_path):
        # Resolve the path against the project root and load it as a time series.
        absolute_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(absolute_path, task=task_to_solve,
                                  data_type=DataTypesEnum.ts, delimiter=',')

    ml_series_fit = _read_series(train_file_path)
    # a dataset for a final validation of the composed model
    # (loaded from the same file as the training series)
    ml_series_check = _read_series(train_file_path)
    crm_series_fit = _read_series(train_file_path_crm)
    crm_series_check = copy(crm_series_fit)

    merged_ml = None
    merged_crm = None
    merged_crm_opt = None

    shifts_total = 4
    step_size = 100

    for shift_idx in range(shifts_total):
        # Fitting window; the prediction window is the same span moved
        # forward by one step.
        fit_from = step_size * shift_idx
        fit_to = step_size * 2 + step_size * (shift_idx + 1)
        check_from = fit_from + step_size
        check_to = fit_to + step_size

        ml_fit_part = ml_series_fit.subset(fit_from, fit_to)
        crm_fit_part = crm_series_fit.subset(fit_from, fit_to)
        ml_check_part = ml_series_check.subset(check_from, check_to)
        crm_check_part = crm_series_check.subset(check_from, check_to)

        ml_chain = Chain(PrimaryNode('lstm'))
        crm_chain = Chain(PrimaryNode('lstm'))
        crm_opt_chain = get_comp_chain()

        ml_chain.fit_from_scratch(input_data=ml_fit_part, verbose=False)
        crm_chain.fit_from_scratch(input_data=crm_fit_part, verbose=False)
        crm_opt_chain.fit_from_scratch(input_data=crm_fit_part,
                                       verbose=False)

        ml_forecast = ml_chain.predict(ml_check_part)
        crm_forecast = crm_chain.predict(crm_check_part)
        crm_opt_forecast = crm_opt_chain.predict(crm_check_part)

        merged_ml = merge_datasets(merged_ml, ml_forecast, shift_idx)
        merged_crm = merge_datasets(merged_crm, crm_forecast, shift_idx)
        merged_crm_opt = merge_datasets(merged_crm_opt, crm_opt_forecast,
                                        shift_idx)

    rmse_on_valid_simple = calculate_validation_metric(
        merged_ml, merged_crm, merged_crm_opt,
        ml_series_check, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(rmse_on_valid_simple[0])}')
    print(f'RMSE ML: {round(rmse_on_valid_simple[1])}')
    print(f'RMSE ML with CRM: {round(rmse_on_valid_simple[2])}')
    print(f'Evo RMSE ML with CRM: {round(rmse_on_valid_simple[3])}')

    print(f'DTW CRM: {round(rmse_on_valid_simple[4])}')
    print(f'DTW ML: {round(rmse_on_valid_simple[5])}')
    print(f'DTW ML with CRM: {round(rmse_on_valid_simple[6])}')
    print(f'DTW RMSE ML with CRM: {round(rmse_on_valid_simple[7])}')

    return rmse_on_valid_simple
def synthetic_benchmark_dataset(samples_amount: int, features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """
    Generates a binary classification benchmark dataset whose labels are
    produced by a chain (TODO: add a reference to the fitting schema).

    The chain first labels one generated sample, is refitted from scratch on
    those labels, and then labels a second generated sample, which is returned.

    :param samples_amount: Total amount of samples in the resulted dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset.
        Only the value 2 is supported.
    :param features_options: features options in key-value
        suitable for classification_dataset.
    :param fitted_chain: Chain with separately fitted models.
        If None then the chain returned by ``_default_chain`` is used.
        NOTE: a chain passed here is refitted in place via
        ``fit_from_scratch``.
    :return: Benchmark dataset that is ready to be used by Chain.
    :raises NotImplementedError: if ``classes_amount`` is not 2.
    """
    # Validate before any expensive work: previously the default chain was
    # built for an unsupported class count before this check was reached.
    if classes_amount != 2:
        raise NotImplementedError(
            'Only binary classification tasks are supported')

    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    task = Task(TaskTypesEnum.classification)

    def _sample_labelled_by_chain() -> InputData:
        # Generate a fresh sample and replace its target with the labels
        # derived from the current chain's prediction.
        features, target = classification_dataset(
            samples_amount=samples_amount,
            features_amount=features_amount,
            classes_amount=classes_amount,
            features_options=features_options)
        target = np.expand_dims(target, axis=1)
        samples_idxs = np.arange(0, samples_amount)

        generated = InputData(idx=samples_idxs, features=features,
                              target=target, task=task,
                              data_type=DataTypesEnum.table)
        synth_target = fitted_chain.predict(input_data=generated).predict
        synth_labels = _to_labels(synth_target)

        return InputData(idx=samples_idxs, features=features,
                         target=synth_labels, task=task,
                         data_type=DataTypesEnum.table)

    data_synth_train = _sample_labelled_by_chain()

    # TODO: fix preproc issues
    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    # The second sample is labelled by the refitted chain.
    data_synth_final = _sample_labelled_by_chain()

    return data_synth_final