def get_split_data_paths():
    file_path_train = 'test/data/simple_regression_train.csv'
    file_path_test = 'test/data/simple_regression_test.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)
    return full_path_train, full_path_test
def prepare_multi_modal_data(files_path, task: Task, images_size=(128, 128), with_split=True):
    path = os.path.join(str(fedot_project_root()), files_path)
    unpack_archived_data(path)

    data = InputData.from_json_files(path, fields_to_use=['votes', 'year'],
                                     label='rating', task=task)

    class_labels = np.asarray([0 if t <= 7 else 1 for t in data.target])
    data.target = class_labels

    ratio = 0.5

    img_files_path = f'{files_path}/*.jpeg'
    img_path = os.path.join(str(fedot_project_root()), img_files_path)

    data_img = InputData.from_image(images=img_path, labels=class_labels,
                                    task=task, target_size=images_size)

    data_text = InputData.from_json_files(path, fields_to_use=['plot'],
                                          label='rating', task=task,
                                          data_type=DataTypesEnum.text)
    data_text.target = class_labels

    if with_split:
        train_num, test_num = train_test_data_setup(data, shuffle_flag=False, split_ratio=ratio)
        train_img, test_img = train_test_data_setup(data_img, shuffle_flag=False, split_ratio=ratio)
        train_text, test_text = train_test_data_setup(data_text, shuffle_flag=False, split_ratio=ratio)
    else:
        train_num, test_num = data, data
        train_img, test_img = data_img, data_img
        train_text, test_text = data_text, data_text

    return train_num, test_num, train_img, test_img, train_text, test_text
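def example_prepare_multi_modal_data():
    """ A minimal usage sketch for prepare_multi_modal_data, assuming a folder
    with the layout of 'test/data/multi_modal' (json files with 'votes',
    'year', 'plot' and 'rating' fields plus *.jpeg images), as used by
    test_multi_modal_pipeline below. Passing the relative path is enough,
    since the helper joins it with the project root itself. """
    task = Task(TaskTypesEnum.classification)
    train_num, test_num, train_img, test_img, train_text, test_text = \
        prepare_multi_modal_data(os.path.join('test', 'data', 'multi_modal'), task)
    return train_num, test_num, train_img, test_img, train_text, test_text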
def get_scoring_case_data_paths() -> Tuple[str, str]:
    train_file_path = os.path.join('cases', 'data', 'scoring', 'scoring_train.csv')
    test_file_path = os.path.join('cases', 'data', 'scoring', 'scoring_test.csv')
    full_train_file_path = os.path.join(str(fedot_project_root()), train_file_path)
    full_test_file_path = os.path.join(str(fedot_project_root()), test_file_path)
    return full_train_file_path, full_test_file_path
def test_multivariate_ts():
    forecast_length = 1

    file_path_train = 'cases/data/metocean/metocean_data_train.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = 'cases/data/metocean/metocean_data_test.csv'
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)

    target_history, add_history, obs = prepare_input_data(full_path_train, full_path_test)

    historical_data = {
        'ws': add_history,  # additional variable
        'ssh': target_history,  # target variable
    }

    fedot = Fedot(problem='ts_forecasting',
                  composer_params=composer_params,
                  task_params=TsForecastingParams(forecast_length=forecast_length))

    fedot.fit(features=historical_data, target=target_history)
    forecast = fedot.forecast(historical_data, forecast_length=forecast_length)
    assert forecast is not None
def test_evaluate_individuals():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)

    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(primary=available_model_types,
                                                   secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)
    composer = builder.build()

    pipelines_to_evaluate = [pipeline_first(), pipeline_second(),
                             pipeline_third(), pipeline_fourth()]

    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ratio_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data, test_data)
    adapter = PipelineAdapter()
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=0.001)
    params = GraphGenerationParams(adapter=PipelineAdapter(),
                                   advisor=PipelineChangeAdvisor())
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False, timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=5)
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False, timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
def test_credit_scoring_problem():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
    full_path_test = full_path_train

    roc_auc_test = run_credit_scoring_problem(full_path_train, full_path_test,
                                              timeout=timedelta(minutes=0.1))
    assert roc_auc_test > 0.5
def get_scoring_data():
    file_path_train = join('cases', 'data', 'scoring', 'scoring_train.csv')
    full_path_train = join(str(fedot_project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = join('cases', 'data', 'scoring', 'scoring_test.csv')
    full_path_test = join(str(fedot_project_root()), file_path_test)

    task = Task(TaskTypesEnum.classification)
    train = InputData.from_csv(full_path_train, task=task)
    test = InputData.from_csv(full_path_test, task=task)

    return train, test
def get_scoring_data():
    # the dataset was obtained from https://www.kaggle.com/c/GiveMeSomeCredit

    # a dataset that will be used as a train and test set during composition
    file_path_train = 'cases/data/scoring/scoring_train.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = 'cases/data/scoring/scoring_test.csv'
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)

    return full_path_train, full_path_test
def test_metocean_forecasting_problem():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path, 'test/data/simple_time_series.csv')
    full_path_test = full_path_train

    rmse = run_metocean_forecasting_problem(full_path_train, full_path_test,
                                            forecast_length=2, timeout=0.1)
    print(rmse)
    assert rmse['rmse'] < 500
def run_gapfilling_case(file_path):
    """
    The function runs an example of filling in gaps in a time series with
    air temperature. Real-data case.

    :param file_path: path to the file
    :return: pandas dataframe with columns 'date', 'with_gap', 'ridge',
        'composite', 'temperature'
    """
    # Load the dataframe
    full_path = os.path.join(str(fedot_project_root()), file_path)
    dataframe = pd.read_csv(full_path)
    dataframe['date'] = pd.to_datetime(dataframe['date'])

    # Filling in gaps based on an inverted ridge regression model
    ridge_pipeline = get_simple_pipeline()
    ridge_gapfiller = ModelGapFiller(gap_value=-100.0, pipeline=ridge_pipeline)
    with_gap_array = np.array(dataframe['with_gap'])
    without_gap_arr_ridge = ridge_gapfiller.forward_inverse_filling(with_gap_array)
    dataframe['ridge'] = without_gap_arr_ridge

    # Filling in gaps based on a pipeline of 5 models
    composite_pipeline = get_composite_pipeline()
    composite_gapfiller = ModelGapFiller(gap_value=-100.0, pipeline=composite_pipeline)
    without_gap_composite = composite_gapfiller.forward_filling(with_gap_array)
    dataframe['composite'] = without_gap_composite

    return dataframe
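def example_run_gapfilling_case():
    """ A minimal usage sketch for run_gapfilling_case. The csv path is
    hypothetical: any file with 'date' and 'with_gap' columns, where gaps
    are encoded as -100.0, matches the expected layout. """
    dataframe = run_gapfilling_case('cases/data/gapfilling/ts_temperature_gapfilling.csv')
    # The helper adds 'ridge' and 'composite' columns with the restored values
    print(dataframe[['date', 'ridge', 'composite']].head())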
def test_tpot_vs_fedot_example():
    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
    file_path_test = file_path_train

    auc = run_tpot_vs_fedot_example(file_path_train, file_path_test)
    assert auc > 0.5
def test_lagged_with_invalid_params_fit_correctly():
    """
    The function defines a pipeline with incorrect parameters in the lagged
    transformation. During the training of the pipeline, the 'window_size'
    parameter is corrected.
    """
    window_size = 600
    len_forecast = 50

    # The length of the time series is 500 elements
    project_root_path = str(fedot_project_root())
    file_path = os.path.join(project_root_path, 'test/data/short_time_series.csv')
    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(time_series)),
                            features=time_series,
                            target=time_series,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Get pipeline with lagged transformation in it
    pipeline = get_ts_pipeline(window_size)

    # Fit it
    pipeline.fit(train_input)

    is_pipeline_fitted = True
    assert is_pipeline_fitted
def get_cholesterol_data():
    file_path = join('cases', 'data', 'cholesterol', 'cholesterol.csv')
    full_path = join(str(fedot_project_root()), file_path)
    task = Task(TaskTypesEnum.regression)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)
    return train, test
def get_kc2_data():
    file_path = join('cases', 'data', 'kc2', 'kc2.csv')
    full_path = join(str(fedot_project_root()), file_path)
    task = Task(TaskTypesEnum.classification)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)
    return train, test
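def example_get_kc2_data():
    """ A minimal usage sketch for the dataset helpers above: both return a
    (train, test) pair of InputData objects ready for a fit/predict cycle.
    The single-node 'rf' pipeline is an arbitrary example model. """
    train, test = get_kc2_data()
    pipeline = Pipeline(PrimaryNode('rf'))
    pipeline.fit(train)
    prediction = pipeline.predict(test)
    return prediction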
def test_spam_detection_problem():
    """ Simple launch of spam detection case """
    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path, 'test/data/spam_detection.csv')

    # Classification task based on text data
    run_text_problem_from_saved_meta_file(file_path_train)
def test_pipeline_from_automl_example():
    project_root_path = str(fedot_project_root())

    with OperationTypesRepository().assign_repo('model', 'model_repository_with_automl.json') as _:
        file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
        file_path_test = file_path_train
        auc = run_pipeline_from_automl(file_path_train, file_path_test,
                                       max_run_time=timedelta(seconds=1))
    OperationTypesRepository.assign_repo('model', 'model_repository.json')

    assert auc > 0.5
def prepare_input_data(train_file_path, test_file_path):
    """ Function for preparing the data for training and validation

    :param train_file_path: path to the csv file for training
    :param test_file_path: path to the csv file for validation

    :return ssh_history: sea surface height history (target variable)
    :return ws_history: wind speed history (additional variable)
    :return ssh_obs: sea surface height observations for final validation
    """
    # Load train and test dataframes
    full_path_train = os.path.join(str(fedot_project_root()), train_file_path)
    full_path_test = os.path.join(str(fedot_project_root()), test_file_path)
    df_train = pd.read_csv(full_path_train)
    df_test = pd.read_csv(full_path_test)

    ws_history = np.ravel(np.array(df_train['wind_speed']))
    ssh_history = np.ravel(np.array(df_train['sea_height']))
    ssh_obs = np.ravel(np.array(df_test['sea_height']))

    return ssh_history, ws_history, ssh_obs
def test_multi_modal_pipeline():
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)

    train_num, _, train_img, _, train_text, _ = \
        prepare_multi_modal_data(path, task, images_size, with_split=False)

    # image
    ds_image = PrimaryNode('data_source_img')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table
    ds_table = PrimaryNode('data_source_table')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text
    ds_text = PrimaryNode('data_source_text')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node]))

    fit_data = MultiModalData({
        'data_source_img': train_img,
        'data_source_table': train_num,
        'data_source_text': train_text
    })

    pipeline.fit(fit_data)
    prediction = pipeline.predict(fit_data)

    assert prediction is not None
def test_data_from_json():
    # several features
    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)
    data = InputData.from_json_files(path, fields_to_use=['votes', 'year'],
                                     label='rating', task=Task(TaskTypesEnum.regression))
    assert data.features.shape[1] == 2  # check that there are two features
    assert len(data.target) == data.features.shape[0] == len(data.idx)

    # single feature
    data = InputData.from_json_files(path, fields_to_use=['votes'],
                                     label='rating', task=Task(TaskTypesEnum.regression))
    assert len(data.features.shape) == 1  # check that there is one feature
    assert len(data.target) == len(data.features) == len(data.idx)
def run_custom_example(timeout: datetime.timedelta = datetime.timedelta(minutes=0.2)):
    data = pd.read_csv(os.path.join(fedot_project_root(), 'examples', 'data', 'custom_encoded.csv'))
    nodes_types = ['V1', 'V2', 'V3', 'V4', 'V5',
                   'V6', 'V7', 'V8', 'V9', 'V10']
    rules = [has_no_self_cycled_nodes, has_no_cycle, _has_no_duplicates]

    initial = CustomGraphModel(nodes=[CustomGraphNode(nodes_from=None, content=node_type)
                                      for node_type in nodes_types])

    requirements = GPComposerRequirements(
        primary=nodes_types, secondary=nodes_types,
        max_arity=10, max_depth=10,
        pop_size=5, num_of_generations=5,
        crossover_prob=0.8, mutation_prob=0.9,
        timeout=timeout)

    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
        mutation_types=[custom_mutation],
        crossover_types=[CrossoverTypesEnum.none],
        regularization_type=RegularizationTypesEnum.none)

    graph_generation_params = GraphGenerationParams(
        adapter=DirectAdapter(base_graph_class=CustomGraphModel,
                              base_node_class=CustomGraphNode),
        rules_for_constraint=rules)

    optimizer = GPGraphOptimiser(
        graph_generation_params=graph_generation_params,
        metrics=[],
        parameters=optimiser_parameters,
        requirements=requirements,
        initial_graph=initial,
        log=default_log(logger_name='Bayesian', verbose_level=1))

    optimized_network = optimizer.optimise(partial(custom_metric, data=data))
    optimized_network.show()
def get_ts_data(n_steps=80, forecast_length=5):
    """ Prepare data from csv file with time series and take needed number of elements

    :param n_steps: number of elements in time series to take
    :param forecast_length: the length of forecast
    """
    project_root_path = str(fedot_project_root())
    file_path = os.path.join(project_root_path, 'test/data/simple_time_series.csv')
    df = pd.read_csv(file_path)

    time_series = np.array(df['sea_height'])[:n_steps]
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    data = InputData(idx=np.arange(0, len(time_series)),
                     features=time_series,
                     target=time_series,
                     task=task,
                     data_type=DataTypesEnum.ts)
    return train_test_data_setup(data)
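def example_get_ts_data():
    """ A minimal usage sketch for get_ts_data: fit a forecasting pipeline on
    the prepared split. get_ts_pipeline is the helper used in
    test_lagged_with_invalid_params_fit_correctly above; the window size of 10
    is an arbitrary example value. """
    train_data, test_data = get_ts_data(n_steps=80, forecast_length=5)
    pipeline = get_ts_pipeline(10)
    pipeline.fit(train_data)
    forecast = pipeline.predict(test_data)
    return forecast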
def create_multi_clf_examples_from_excel(file_path: str, return_df: bool = False):
    """ Return dataframe from excel file or path to the csv file """
    df = pd.read_excel(file_path, engine='openpyxl')
    train, test = split_data(df)
    file_dir_name = file_path.replace('.', '/').split('/')[-2]
    file_csv_name = f'{file_dir_name}.csv'
    directory_names = ['examples', 'data', file_dir_name]

    # Check whether the obtained directory exists
    ensure_directory_exists(directory_names)
    if return_df:
        # Need to return the dataframe and the path to the file in csv format
        path = os.path.join(directory_names[0], directory_names[1],
                            directory_names[2], file_csv_name)
        full_file_path = os.path.join(str(fedot_project_root()), path)
        save_file_to_csv(df, full_file_path)
        return df, full_file_path
    else:
        # Need to return only the paths to the files with train and test data
        full_train_file_path, full_test_file_path = get_split_data_paths(directory_names)
        save_file_to_csv(train, full_train_file_path)
        save_file_to_csv(test, full_test_file_path)
        return full_train_file_path, full_test_file_path
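def example_create_multi_clf_examples_from_excel():
    """ A minimal usage sketch for create_multi_clf_examples_from_excel. The
    .xlsx path is hypothetical; with return_df=False the helper writes the
    train and test csv files and returns their paths. """
    train_path, test_path = create_multi_clf_examples_from_excel('examples/data/example_dataset.xlsx')
    print(train_path, test_path)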
def test_river_levels_problem():
    # Initialise pipeline for river levels prediction
    node_encoder = PrimaryNode('one_hot_encoding')
    node_scaling = SecondaryNode('scaling', nodes_from=[node_encoder])
    node_ridge = SecondaryNode('ridge', nodes_from=[node_scaling])
    node_lasso = SecondaryNode('lasso', nodes_from=[node_scaling])
    node_final = SecondaryNode('rfr', nodes_from=[node_ridge, node_lasso])

    init_pipeline = Pipeline(node_final)

    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path, 'test/data/station_levels.csv')

    run_river_experiment(file_path=file_path_train,
                         pipeline=init_pipeline,
                         iterations=1,
                         tuner=PipelineTuner,
                         tuner_iterations=10)

    is_experiment_finished = True
    assert is_experiment_finished
def test_fedot_project_root():
    root_path = fedot_project_root()
    assert 'core' in os.listdir(os.path.join(root_path, 'fedot'))
    assert 'api' in os.listdir(os.path.join(root_path, 'fedot'))