def test_forecast_with_exog():
    """Pipeline with lagged + exogenous inputs should reproduce the test series exactly."""
    train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test = \
        synthetic_with_exogenous_ts()

    # Lagged transformation over the source series
    lagged_node = PrimaryNode('lagged')
    lagged_node.custom_params = {'window_size': window_size}
    # Exogenous variable enters through its own data source
    exog_node = PrimaryNode('exog_ts_data_source')

    root_node = SecondaryNode('linear', nodes_from=[lagged_node, exog_node])
    pipeline = Pipeline(root_node)

    fit_data = MultiModalData({
        'exog_ts_data_source': train_exog_ts,
        'lagged': train_source_ts})
    pipeline.fit(input_data=fit_data)

    predict_data = MultiModalData({
        'exog_ts_data_source': predict_exog_ts,
        'lagged': predict_source_ts})
    forecast = pipeline.predict(input_data=predict_data)

    prediction = np.ravel(np.array(forecast.predict))
    assert tuple(prediction) == tuple(ts_test)
def _train_test_multi_modal_data_setup(data: MultiModalData, split_ratio=0.8,
                                       shuffle_flag=False) -> Tuple[MultiModalData, MultiModalData]:
    """Split every data source of *data* into train and test parts.

    :param data: multimodal data whose parts are split independently
    :param split_ratio: fraction of samples that goes into the train part
    :param shuffle_flag: whether to shuffle samples before splitting
    :return: pair (train MultiModalData, test MultiModalData) with the same source keys
    """
    train_data = MultiModalData()
    test_data = MultiModalData()
    for source_name in data.keys():
        train_part, test_part = train_test_data_setup(data[source_name],
                                                      split_ratio, shuffle_flag)
        train_data[source_name] = train_part
        test_data[source_name] = test_part
    return train_data, test_data
def multimodal_data_preparing(name):
    """Return the pipeline and deep-copied train/predict datasets for fixture *name*.

    Deep copies are made so a caller cannot mutate the shared ``pipelines`` fixture.

    :param name: key into the module-level ``pipelines`` dict
    :return: (pipeline, train MultiModalData, predict MultiModalData)
    """
    pipeline = pipelines[name]['model']
    # dict comprehensions replace the manual key loops of the original
    train_dict = {key: deepcopy(value)
                  for key, value in pipelines[name]['tr_nodes_data'].items()}
    predict_dict = {key: deepcopy(value)
                    for key, value in pipelines[name]['pr_nodes_data'].items()}
    train_dataset = MultiModalData(train_dict)
    predict_dataset = MultiModalData(predict_dict)
    return pipeline, train_dataset, predict_dataset
def test_multi_modal_data():
    """MultiModalData exposes aggregated task, idx, num_classes and target."""
    num_samples = 5
    target = np.asarray([0, 0, 1, 0, 1])

    # in this test the real feature data is not needed and stays None
    img_data = InputData(idx=range(num_samples),
                         features=None,
                         target=target,
                         data_type=DataTypesEnum.text,
                         task=Task(TaskTypesEnum.classification))
    tbl_data = InputData(idx=range(num_samples),
                         features=None,
                         target=target,
                         data_type=DataTypesEnum.table,
                         task=Task(TaskTypesEnum.classification))

    multi_modal = MultiModalData({
        'data_source_img': img_data,
        'data_source_table': tbl_data,
    })

    assert multi_modal.task.task_type == TaskTypesEnum.classification
    assert len(multi_modal.idx) == 5
    assert multi_modal.num_classes == 2
    assert np.array_equal(multi_modal.target, target)
def test_forecast_with_sparse_lagged():
    """Pipeline with a sparse_lagged primary node can fit and produce a forecast."""
    train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test = \
        synthetic_with_exogenous_ts()

    # Source data for lagged node
    node_lagged = PrimaryNode('sparse_lagged')
    # Set window size for lagged transformation
    node_lagged.custom_params = {'window_size': window_size}

    node_final = SecondaryNode('linear', nodes_from=[node_lagged])
    pipeline = Pipeline(node_final)
    pipeline.fit(input_data=MultiModalData({'sparse_lagged': train_source_ts}))

    forecast = pipeline.predict(
        input_data=MultiModalData({'sparse_lagged': predict_source_ts}))

    # The original `is_forecasted = True; assert is_forecasted` could never fail;
    # assert on the produced forecast instead
    assert forecast is not None
    assert len(np.ravel(np.array(forecast.predict))) > 0
def make_forecast(pipeline, train: InputData, predict: InputData,
                  train_exog: InputData, predict_exog: InputData):
    """ Function for predicting values in a time series

    :return predicted_values: numpy array, forecast of model
    """
    # Fit it
    start_time = timeit.default_timer()

    # Without an exogenous series, the source series itself feeds a second lagged node
    if train_exog is None:
        second_node_name = 'lagged/2'
        train_exog, predict_exog = train, predict
    else:
        second_node_name = 'exog_ts_data_source'

    train_dataset = MultiModalData({
        'lagged/1': train,
        second_node_name: train_exog,
    })
    predict_dataset = MultiModalData({
        'lagged/1': predict,
        second_node_name: predict_exog,
    })

    pipeline.fit_from_scratch(train_dataset)

    amount_of_seconds = timeit.default_timer() - start_time
    print(f'\nIt takes {amount_of_seconds:.2f} seconds to train pipeline\n')

    # Predict
    predicted_values = pipeline.predict(predict_dataset).predict
    return predicted_values
def generate_initial_pipeline_and_data(images_size, train_num, test_num,
                                       train_img, test_img, train_text, test_text):
    """Build a three-branch (image/table/text) pipeline and its fit/predict datasets."""
    # image branch: data source -> CNN
    ds_image = PrimaryNode('data_source_img/1')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table branch: data source -> scaling -> random forest
    ds_table = PrimaryNode('data_source_table/2')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text branch: data source -> cleaning -> tf-idf
    ds_text = PrimaryNode('data_source_text/3')
    clean_node = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[clean_node])

    root = SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node])
    pipeline = Pipeline(root)

    fit_data = MultiModalData({
        'data_source_img/1': train_img,
        'data_source_table/2': train_num,
        'data_source_text/3': train_text
    })
    predict_data = MultiModalData({
        'data_source_img/1': test_img,
        'data_source_table/2': test_num,
        'data_source_text/3': test_text
    })

    return pipeline, fit_data, predict_data
def test_multi_modal_pipeline():
    """A three-source (image/table/text) pipeline fits and predicts on sample data."""
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)
    train_num, _, train_img, _, train_text, _ = \
        prepare_multi_modal_data(path, task, images_size, with_split=False)

    # image branch
    # NOTE: the original built a throwaway PrimaryNode('cnn') with its own
    # custom_params here and immediately overwrote it; that dead code is removed
    ds_image = PrimaryNode('data_source_img')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table branch
    ds_table = PrimaryNode('data_source_table')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text branch
    ds_text = PrimaryNode('data_source_text')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit',
                                      nodes_from=[numeric_node, image_node, text_node]))

    fit_data = MultiModalData({
        'data_source_img': train_img,
        'data_source_table': train_num,
        'data_source_text': train_text
    })

    pipeline.fit(fit_data)
    prediction = pipeline.predict(fit_data)

    assert prediction is not None
train_input, predict_input, task = prepare_input_data( len_forecast=len_forecast, train_data_features=train_data, train_data_target=train_data, test_data_features=train_data) # Exogenous time series train_input_exog, predict_input_exog, _ = prepare_input_data( len_forecast=len_forecast, train_data_features=train_data_exog, train_data_target=train_data, test_data_features=test_data_exog) pipeline = get_arima_pipeline() train_dataset = MultiModalData({ 'arima': deepcopy(train_input), }) predict_dataset = MultiModalData({ 'arima': deepcopy(predict_input), }) pipeline.fit_from_scratch(train_dataset) predicted_values = pipeline.predict(predict_dataset) predicted_values = predicted_values.predict predicted = np.ravel(np.array(predicted_values)) test_data = np.ravel(np.array(test_data)) mse_before = mean_squared_error(test_data, predicted, squared=False) mae_before = mean_absolute_error(test_data, predicted) mape_before = mean_absolute_percentage_error(test_data, predicted) print(f'ARIMA MSE - {mse_before:.4f}')
def run_oil_forecasting(path_to_file, path_to_file_crm, len_forecast,
                        len_forecast_full, ax, well_id, timeout):
    """Compare oil-production forecasts for one well: CRM only, AutoML, AutoML+CRM.

    Fits (or loads previously cached) FEDOT pipelines for the raw data and the
    CRM-augmented data, makes in-sample forecasts, prints RMSE/MAE for each
    variant and optionally plots the results.

    :param path_to_file: csv file with the raw well data
    :param path_to_file_crm: csv file with the CRM-augmented well data
    :param len_forecast: forecast horizon used when composing the pipelines
    :param len_forecast_full: full horizon used for the in-sample forecast/split
    :param ax: matplotlib-like axes to draw on, or a falsy value to skip plotting
    :param well_id: well identifier used in cache paths, column names and labels
    :param timeout: AutoML composing timeout (defaults to 1) - presumably
        minutes, TODO confirm against Fedot API
    """
    if timeout is None:
        timeout = 1
    # ' *, *' separator tolerates spaces around commas in the csv files
    df = pd.read_csv(path_to_file, sep=' *, *')
    df_crm = pd.read_csv(path_to_file_crm, sep=' *, *')

    len_forecast_for_split = len_forecast_full
    dates, target_train, data_fit, data_predict, input_data_fit, input_data_predict, test_data, \
        train_data, time_series = prepare_dataset(df, len_forecast, len_forecast_for_split, well_id)
    dates, target_train_crm, data_fit_crm, data_predict_crm, input_data_fit_crm, input_data_predict_crm, test_data_crm, \
        train_data, time_series = prepare_dataset(df_crm, len_forecast, len_forecast_for_split, well_id)

    task_parameters = TsForecastingParams(forecast_length=len_forecast)

    # Compose a new pipeline only when no cached pipeline json exists on disk
    if not os.path.exists(f'pipeline_{well_id}/pipeline_{well_id}.json'):
        model = Fedot(problem='ts_forecasting', task_params=task_parameters,
                      composer_params={'timeout': timeout}, preset='ultra_light',
                      verbose_level=4)
        # run AutoML model design in the same way
        pipeline = model.fit(features=data_fit, target=target_train)
        pipeline.save(f'pipeline_{well_id}')  # , datetime_in_path=False)
    else:
        pipeline = Pipeline()
        pipeline.load(f'pipeline_{well_id}/pipeline_{well_id}.json')

    # Same fit-or-load caching for the CRM-augmented pipeline
    if not os.path.exists(
            f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json'):
        model = Fedot(problem='ts_forecasting', task_params=task_parameters,
                      composer_params={'timeout': timeout}, preset='ultra_light',
                      verbose_level=4)
        # run AutoML model design in the same way
        pipeline_crm = model.fit(features=data_fit_crm, target=target_train_crm)
        pipeline_crm.save(
            f'pipeline_crm_{well_id}')  # , datetime_in_path=False)
    else:
        pipeline_crm = Pipeline()
        pipeline_crm.load(
            f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json')

    # Wrap each prediction data part as a 'data_source_ts/...' multimodal source
    sources = dict(
        (f'data_source_ts/{data_part_key}', data_part)
        for (data_part_key, data_part) in input_data_predict.items())
    input_data_predict_mm = MultiModalData(sources)

    sources_crm = dict(
        (f'data_source_ts/{data_part_key}', data_part) for
        (data_part_key, data_part) in input_data_predict_crm.items())
    input_data_predict_mm_crm = MultiModalData(sources_crm)

    forecast = in_sample_ts_forecast(pipeline, input_data_predict_mm,
                                     horizon=len_forecast_full)
    forecast_crm = in_sample_ts_forecast(pipeline_crm, input_data_predict_mm_crm,
                                         horizon=len_forecast_full)

    predicted = np.ravel(np.array(forecast))
    predicted_crm = np.ravel(np.array(forecast_crm))
    # CRM-only baseline is taken directly from the crm_<well_id> column
    predicted_only_crm = np.asarray(
        df_crm[f'crm_{well_id}'][-len_forecast_full:])
    test_data = np.ravel(test_data)

    print('CRM')
    # the CRM column may contain NaNs; zero them so the metrics are defined
    predicted_only_crm[np.isnan(predicted_only_crm)] = 0
    # squared=False makes mean_squared_error return RMSE despite the name
    mse_before = mean_squared_error(test_data, predicted_only_crm, squared=False)
    mae_before = mean_absolute_error(test_data, predicted_only_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('ML')
    mse_before = mean_squared_error(test_data, predicted, squared=False)
    mae_before = mean_absolute_error(test_data, predicted)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('AutoML+CRM')
    mse_before = mean_squared_error(test_data, predicted_crm, squared=False)
    mae_before = mean_absolute_error(test_data, predicted_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    if ax:
        x_for = range(len(train_data), len(time_series))
        ax.plot(x_for, time_series[-len_forecast_full:],
                label='Actual time series', linewidth=0.5)
        ax.plot(x_for, predicted_crm, label='AutoML+CRM', linewidth=0.5)
        ax.plot(x_for, predicted_only_crm, label='CRM', linewidth=0.5)
        # shaded band around each forecast from a t-based confidence interval
        ci_crm = t_conf_interval(np.std(predicted_crm), 0.975,
                                 len(predicted_crm)) * 1.96
        ax.fill_between(x_for, (predicted_crm - ci_crm), (predicted_crm + ci_crm),
                        color='orange', alpha=.5)
        ci_crmonly = t_conf_interval(np.std(predicted_only_crm), 0.975,
                                     len(predicted_only_crm)) * 1.96
        ax.fill_between(x_for, (predicted_only_crm - ci_crmonly),
                        (predicted_only_crm + ci_crmonly),
                        color='green', alpha=.5)
        ax.set(xlabel='Days from 2013.06.01', ylabel='Oil volume, m3')
        if well_id == '5351':
            ax.legend()
        ax.set_title(well_id)
        ax.plot()
def _define_data(ml_task: Task,
                 features: Union[str, np.ndarray, pd.DataFrame, InputData, dict],
                 target: Union[str, np.ndarray, pd.Series] = None,
                 is_predict=False):
    """Convert user-supplied features/target into InputData or MultiModalData.

    Dispatches on the exact type of *features*: InputData (used as is),
    pd.DataFrame, np.ndarray, tuple of (features, target), str (path to a csv
    file), or dict of named data parts (wrapped into MultiModalData).

    NOTE(review): the exact ``type(...) ==`` checks are kept deliberately;
    switching to ``isinstance`` would also match subclasses (e.g. dict-like
    project types) and change the dispatch - confirm before widening.

    :param ml_task: task definition attached to the resulting data
    :param features: input features in one of the supported formats
    :param target: target values, or the name of the target column
    :param is_predict: flag forwarded to InputData.from_csv_time_series
    :raises ValueError: if the type of *features* is not supported
    """
    if type(features) == InputData:
        # native FEDOT format for input data
        data = features
        data.task = ml_task
    elif type(features) == pd.DataFrame:
        # pandas format for input data
        if target is None:
            target = np.array([])
        if isinstance(target, str) and target in features.columns:
            # target given as a column name: extract it and drop the column
            target_array = features[target]
            del features[target]
        else:
            target_array = target
        data = array_to_input_data(features_array=np.asarray(features),
                                   target_array=np.asarray(target_array),
                                   task=ml_task)
    elif type(features) == np.ndarray:
        # numpy format for input data
        if target is None:
            target = np.array([])
        if isinstance(target, str):
            target_array = features[target]
            del features[target]
        else:
            target_array = target
        data = array_to_input_data(features_array=features,
                                   target_array=target_array,
                                   task=ml_task)
    elif type(features) == tuple:
        # pre-split pair (features, target)
        data = array_to_input_data(features_array=features[0],
                                   target_array=features[1],
                                   task=ml_task)
    elif type(features) == str:
        # CSV files as input data, by default - table data
        if target is None:
            target = 'target'
        data_type = DataTypesEnum.table
        if ml_task.task_type == TaskTypesEnum.ts_forecasting:
            # For time series forecasting format - time series
            data = InputData.from_csv_time_series(task=ml_task,
                                                  file_path=features,
                                                  target_column=target,
                                                  is_predict=is_predict)
        else:
            # Make default features table from the csv file
            # (the redundant second `if target is None` check was removed)
            data = InputData.from_csv(features, task=ml_task,
                                      target_columns=target,
                                      data_type=data_type)
    elif type(features) == dict:
        if target is None:
            target = np.array([])
        target_array = target
        # every data part shares the same target and task
        data_part_transformation_func = partial(array_to_input_data,
                                                target_array=target_array,
                                                task=ml_task)
        # create labels for data sources
        sources = dict(
            (f'data_source_ts/{data_part_key}',
             data_part_transformation_func(features_array=data_part))
            for (data_part_key, data_part) in features.items())
        data = MultiModalData(sources)
    else:
        raise ValueError(
            'Please specify a features as path to csv file or as Numpy array')
    return data