def prepare_multi_modal_data(files_path, task: Task, images_size=(128, 128), with_split=True):
    path = os.path.join(str(fedot_project_root()), files_path)
    unpack_archived_data(path)

    # Tabular data: numeric fields from the JSON files with a binarised target
    data = InputData.from_json_files(path, fields_to_use=['votes', 'year'],
                                     label='rating', task=task)
    class_labels = np.asarray([0 if t <= 7 else 1 for t in data.target])
    data.target = class_labels

    ratio = 0.5

    # Image data
    img_files_path = f'{files_path}/*.jpeg'
    img_path = os.path.join(str(fedot_project_root()), img_files_path)
    data_img = InputData.from_image(images=img_path, labels=class_labels,
                                    task=task, target_size=images_size)

    # Text data
    data_text = InputData.from_json_files(path, fields_to_use=['plot'],
                                          label='rating', task=task,
                                          data_type=DataTypesEnum.text)
    data_text.target = class_labels

    if with_split:
        train_num, test_num = train_test_data_setup(data, shuffle_flag=False, split_ratio=ratio)
        train_img, test_img = train_test_data_setup(data_img, shuffle_flag=False, split_ratio=ratio)
        train_text, test_text = train_test_data_setup(data_text, shuffle_flag=False, split_ratio=ratio)
    else:
        train_num, test_num = data, data
        train_img, test_img = data_img, data_img
        train_text, test_text = data_text, data_text

    return train_num, test_num, train_img, test_img, train_text, test_text
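
# A minimal usage sketch for prepare_multi_modal_data (not from the original
# source): it assumes the returned per-modality splits are grouped into a
# FEDOT MultiModalData container keyed by data source names; the
# 'data_source_table'/'data_source_img'/'data_source_text' keys and the
# files_path value are illustrative assumptions.
def example_multi_modal_fit_data():
    task = Task(TaskTypesEnum.classification)
    train_num, _, train_img, _, train_text, _ = prepare_multi_modal_data(
        'examples/data/multimodal', task)  # hypothetical dataset location
    # Group the per-modality training sets under named data sources
    fit_data = MultiModalData({'data_source_table': train_num,
                               'data_source_img': train_img,
                               'data_source_text': train_text})
    return fit_data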
def test_pipeline_hierarchy_fit_correct(data_setup):
    data = data_setup
    train, _ = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[first])
    final = SecondaryNode(operation_type='logit', nodes_from=[second, third])

    pipeline = Pipeline()
    for node in [first, second, third, final]:
        pipeline.add_node(node)

    pipeline.unfit()
    train_predicted = pipeline.fit(input_data=train)

    assert pipeline.root_node.descriptive_id == (
        '((/n_logit_default_params;)/'
        'n_logit_default_params;;(/'
        'n_logit_default_params;)/'
        'n_logit_default_params;)/'
        'n_logit_default_params')

    assert pipeline.length == 4
    assert pipeline.depth == 3
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
def run_multi_output_case(path, vis=False):
    """ Launches the case of river level prediction on the Lena river
    as a multi-output regression task

    :param path: path to the file with the table
    :param vis: whether to visualise the pipeline and predictions
    """
    target_columns = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']

    data = InputData.from_csv(path, target_columns=target_columns,
                              columns_to_drop=['date'])
    train, test = train_test_data_setup(data)

    problem = 'regression'

    automl_model = Fedot(problem=problem)
    automl_model.fit(features=train)
    predicted_array = automl_model.predict(features=test)

    # Convert the output into a one-dimensional array
    forecast = np.ravel(predicted_array)

    mae_value = mean_absolute_error(np.ravel(test.target), forecast)
    print(f'MAE - {mae_value:.2f}')

    if vis:
        plot_predictions(predicted_array, test)
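
# Hedged usage sketch for run_multi_output_case: the CSV path below is a
# hypothetical example, not a path from the original source; any table with
# the '1_day'..'7_day' target columns and a 'date' column to drop should work.
if __name__ == '__main__':
    table_path = os.path.join(str(fedot_project_root()),
                              'cases/data/lena_levels/multi_sample.csv')  # hypothetical path
    run_multi_output_case(table_path, vis=True)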
def test_pipeline_fit_time_constraint(data_fixture, request):
    system = platform.system()
    if system == 'Linux':
        set_start_method("spawn", force=True)
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)
    test_pipeline_first = pipeline_first()
    time_constraint = datetime.timedelta(minutes=0.01)
    predicted_first = None
    computation_time_first = None
    process_start_time = time.time()

    # The first, very tight constraint must interrupt fitting with a TimeoutError
    try:
        predicted_first = test_pipeline_first.fit(input_data=train_data,
                                                  time_constraint=time_constraint)
    except Exception as ex:
        received_ex = ex
        computation_time_first = test_pipeline_first.computation_time
        assert type(received_ex) is TimeoutError
    comp_time_proc_with_first_constraint = (time.time() - process_start_time)

    # A looser constraint still times out but allows more computation time
    time_constraint = datetime.timedelta(minutes=0.05)
    process_start_time = time.time()
    try:
        test_pipeline_first.fit(input_data=train_data,
                                time_constraint=time_constraint)
    except Exception as ex:
        received_ex = ex
        assert type(received_ex) is TimeoutError
    comp_time_proc_with_second_constraint = (time.time() - process_start_time)

    # Without a constraint the pipeline fits to completion
    test_pipeline_second = pipeline_first()
    predicted_second = test_pipeline_second.fit(input_data=train_data)
    computation_time_second = test_pipeline_second.computation_time

    assert comp_time_proc_with_first_constraint < comp_time_proc_with_second_constraint
    assert computation_time_first is None
    assert predicted_first is None
    assert computation_time_second is not None
    assert predicted_second is not None
def data_setup():
    task = Task(TaskTypesEnum.classification)
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    np.random.shuffle(predictors)
    np.random.shuffle(response)
    response = response[:100]
    predictors = predictors[:100]

    input_data = InputData(idx=np.arange(0, len(predictors)),
                           features=predictors,
                           target=response,
                           task=task,
                           data_type=DataTypesEnum.table)
    train_data, test_data = train_test_data_setup(data=input_data)
    train_data_x = train_data.features
    test_data_x = test_data.features
    train_data_y = train_data.target
    test_data_y = test_data.target

    train_data = InputData(features=train_data_x, target=train_data_y,
                           idx=np.arange(0, len(train_data_y)),
                           task=task, data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data_x, target=test_data_y,
                          idx=np.arange(0, len(test_data_y)),
                          task=task, data_type=DataTypesEnum.table)
    return train_data, test_data
def test_classification_models_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)
    roc_threshold = 0.95
    logger = default_log('default_test_logger')

    with OperationTypesRepository() as repo:
        model_names, _ = repo.suitable_operation(task_type=TaskTypesEnum.classification,
                                                 data_type=data.data_type,
                                                 tags=['ml'])

        for model_name in model_names:
            logger.info(f"Test classification model: {model_name}.")
            model = Model(operation_type=model_name)

            fitted_operation, train_predicted = model.fit(data=train_data)
            test_pred = model.predict(fitted_operation=fitted_operation,
                                      data=test_data,
                                      is_fit_pipeline_stage=False)
            roc_on_test = get_roc_auc(valid_data=test_data,
                                      predicted_data=test_pred)
            if model_name not in ['bernb', 'multinb']:
                assert roc_on_test >= roc_threshold
            else:
                assert roc_on_test >= 0.5
def get_cholesterol_data():
    file_path = join('cases', 'data', 'cholesterol', 'cholesterol.csv')
    full_path = join(str(fedot_project_root()), file_path)
    task = Task(TaskTypesEnum.regression)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)
    return train, test
def run_text_problem_from_saved_meta_file(path):
    data = InputData.from_text_meta_file(meta_file_path=path)

    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)

    metric = execute_pipeline_for_text_problem(train_data, test_data)

    print(f'meta_file metric: {metric}')
def get_kc2_data():
    file_path = join('cases', 'data', 'kc2', 'kc2.csv')
    full_path = join(str(fedot_project_root()), file_path)
    task = Task(TaskTypesEnum.classification)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)
    return train, test
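
# Minimal sketch (not from the original source) of feeding the loaders above
# into a single-node pipeline; 'logit' is a standard FEDOT classification
# operation, and get_roc_auc mirrors the helper used in the tests below.
def example_kc2_baseline():
    train, test = get_kc2_data()
    pipeline = Pipeline(PrimaryNode('logit'))
    pipeline.fit(train)
    predicted = pipeline.predict(test)
    return get_roc_auc(valid_data=test, predicted_data=predicted)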
def test_evaluate_individuals():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty

    composer_requirements = GPComposerRequirements(primary=available_model_types,
                                                   secondary=available_model_types)

    builder = GPComposerBuilder(task=task). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function)
    composer = builder.build()

    pipelines_to_evaluate = [pipeline_first(), pipeline_second(),
                             pipeline_third(), pipeline_fourth()]

    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ratio_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric, composer.metrics,
                                        train_data, test_data)
    adapter = PipelineAdapter()
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=0.001)
    params = GraphGenerationParams(adapter=PipelineAdapter(),
                                   advisor=PipelineChangeAdvisor())

    # With a nearly zero timeout only the first individual is evaluated
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False, timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    # With a sufficient timeout the whole population is evaluated
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=5)
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False, timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
def get_dataset(task_type: str):
    if task_type == 'regression':
        data = get_synthetic_regression_data()
        train_data, test_data = train_test_data_setup(data)
        threshold = np.std(test_data.target) * 0.05
    elif task_type == 'classification':
        data = get_iris_data()
        train_data, test_data = train_test_data_setup(data, shuffle_flag=True)
        threshold = 0.95
    elif task_type == 'clustering':
        data = get_synthetic_input_data(n_samples=1000)
        train_data, test_data = train_test_data_setup(data)
        threshold = 0.5
    elif task_type == 'ts_forecasting':
        train_data, test_data = get_ts_data(forecast_length=5)
        threshold = np.std(test_data.target)
    else:
        raise ValueError('Incorrect type of machine learning task')
    return train_data, test_data, threshold
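
# Illustrative sketch for get_dataset (an assumption, not original code):
# iterates over the supported task labels and reports split sizes and the
# task-specific quality threshold.
def example_dataset_overview():
    for task_label in ['regression', 'classification', 'clustering', 'ts_forecasting']:
        train_data, test_data, threshold = get_dataset(task_label)
        print(f'{task_label}: train={len(train_data.idx)} '
              f'test={len(test_data.idx)} threshold={threshold:.3f}')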
def compose_pipeline(self, data: Union[InputData, MultiModalData],
                     is_visualise: bool = False,
                     is_tune: bool = False,
                     on_next_iteration_callback: Optional[Callable] = None) -> Union[Pipeline, List[Pipeline]]:
    """ Function for searching the optimal pipeline structure

    :param data: InputData for pipeline composing
    :param is_visualise: whether the results should be visualised
    :param is_tune: whether the pipeline should be tuned after composing TODO integrate new tuner
    :param on_next_iteration_callback: TODO add description
    :return best_pipeline: result of composing: a single pipeline for
        single-objective optimisation; for the multi-objective case, a list of
        pipelines ordered by descending primary metric (the first is the best)
    """
    self.optimiser.graph_generation_params.advisor.task = data.task

    if self.composer_requirements.max_pipeline_fit_time:
        set_multiprocess_start_method()

    if not self.optimiser:
        raise AttributeError('Optimiser for graph composition is not defined')

    if self.composer_requirements.cv_folds is not None:
        objective_function_for_pipeline = self._cv_validation_metric_build(data)
    else:
        self.log.info("Hold out validation for graph composing was applied.")
        split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
        train_data, test_data = train_test_data_setup(data, split_ratio)
        objective_function_for_pipeline = partial(self.composer_metric, self.metrics,
                                                  train_data, test_data)

    if self.cache_path is None:
        self.cache.clear()
    else:
        self.cache.clear(tmp_only=True)
        self.cache = OperationsCache(self.cache_path,
                                     clear_exiting=not self.use_existing_cache)

    best_pipeline = self.optimiser.optimise(objective_function_for_pipeline,
                                            on_next_iteration_callback=on_next_iteration_callback)

    self.log.info('GP composition finished')
    self.cache.clear()

    if is_tune:
        self.tune_pipeline(best_pipeline, data, self.composer_requirements.timeout)
    return best_pipeline
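
# Hedged sketch of driving compose_pipeline through the builder pattern shown
# in test_evaluate_individuals above; the function name and the use of default
# composer requirements are illustrative assumptions.
def example_composition(train_input: InputData):
    task = train_input.task
    available_operations, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)
    requirements = GPComposerRequirements(primary=available_operations,
                                          secondary=available_operations)
    composer = GPComposerBuilder(task=task). \
        with_requirements(requirements). \
        with_metrics(ClassificationMetricsEnum.ROCAUC_penalty).build()
    # compose_pipeline performs the hold-out or CV validation internally
    return composer.compose_pipeline(data=train_input)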
def test_regression_pipeline_fit_correct():
    data = get_synthetic_regression_data()

    pipeline = generate_pipeline()
    train_data, test_data = train_test_data_setup(data)

    pipeline.fit(input_data=train_data)
    _, rmse_on_test = get_rmse_value(pipeline, train_data, test_data)

    rmse_threshold = np.std(data.target) * 0.05
    assert rmse_on_test < rmse_threshold
def run_text_problem_from_meta_file():
    data_file_abspath = os.path.abspath(os.path.join('data', 'spam', 'spamham.csv'))

    data = InputData.from_text_meta_file(meta_file_path=data_file_abspath)

    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)

    metric = execute_pipeline_for_text_problem(train_data, test_data)

    print(f'meta_file metric: {metric}')
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling pipeline: fit it and get the scaled data
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    assert all(np.unique(train_predicted.predict) == [0, 1])
def test_output_mode_labels():
    data = get_iris_data()
    pipeline = pipeline_simple()
    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)

    pipeline.fit(input_data=train_data)
    results = pipeline.predict(input_data=test_data, output_mode='labels')
    results_probs = pipeline.predict(input_data=test_data)

    assert len(results.predict) == len(test_data.target)
    assert set(results.predict) == {0, 1, 2}

    assert not np.array_equal(results_probs.predict, results.predict)
def test_model_fit_and_predict_correctly():
    """Checks whether the model fits and predicts correctly on a synthetic dataset"""
    data = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)

    pipeline = generate_pipeline()
    train_data, test_data = train_test_data_setup(data)

    pipeline.fit(input_data=train_data)
    roc_auc_value_train, roc_auc_value_test = get_roc_auc_value(pipeline, train_data, test_data)
    train_auc_thr = get_auc_threshold(roc_auc_value_train)
    test_auc_thr = get_auc_threshold(roc_auc_value_test)

    assert train_auc_thr >= CORRECT_MODEL_AUC_THR
    assert test_auc_thr >= CORRECT_MODEL_AUC_THR
def test_multiclassification_pipeline_fit_correct():
    data = get_iris_data()
    pipeline = pipeline_simple()
    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)

    pipeline.fit(input_data=train_data)
    results = pipeline.predict(input_data=test_data)

    roc_auc_on_test = roc_auc(y_true=test_data.target,
                              y_score=results.predict,
                              multi_class='ovo',
                              average='macro')

    assert roc_auc_on_test > 0.95
def test_output_mode_full_probs():
    data = get_binary_classification_data()
    pipeline = pipeline_simple()
    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)

    pipeline.fit(input_data=train_data)
    results = pipeline.predict(input_data=test_data, output_mode='full_probs')
    results_default = pipeline.predict(input_data=test_data)
    results_probs = pipeline.predict(input_data=test_data, output_mode='probs')

    assert not np.array_equal(results_probs.predict, results.predict)
    assert np.array_equal(results_probs.predict, results_default.predict)

    assert results.predict.shape == (len(test_data.target), 2)
    assert results_probs.predict.shape == (len(test_data.target),)
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    data = data_setup
    train, test = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = PrimaryNode(operation_type='lda')
    third = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='xgboost',
                          nodes_from=[first, second, third])

    pipeline = Pipeline()
    for node in [first, second, third, final]:
        pipeline.add_node(node)

    first = deepcopy(first)
    second = deepcopy(second)
    third = deepcopy(third)

    final_shuffled = SecondaryNode(operation_type='xgboost',
                                   nodes_from=[third, first, second])

    pipeline_shuffled = Pipeline()
    # change order of nodes in list
    for node in [final_shuffled, third, first, second]:
        pipeline_shuffled.add_node(node)

    train_predicted = pipeline.fit(input_data=train)
    train_predicted_shuffled = pipeline_shuffled.fit(input_data=train)

    # train results should be invariant
    assert pipeline.root_node.descriptive_id == pipeline_shuffled.root_node.descriptive_id
    assert np.equal(train_predicted.predict, train_predicted_shuffled.predict).all()

    test_predicted = pipeline.predict(input_data=test)
    test_predicted_shuffled = pipeline_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict, test_predicted_shuffled.predict).all()

    # change parents order for the nodes of the fitted pipeline
    nodes_for_change = pipeline.nodes[3].nodes_from
    pipeline.nodes[3].nodes_from = [nodes_for_change[2], nodes_for_change[0], nodes_for_change[1]]
    pipeline.nodes[3].unfit()
    pipeline.fit(train)
    test_predicted_re_shuffled = pipeline.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict, test_predicted_re_shuffled.predict).all()
def multi_target_data_setup():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/multi_target_sample.csv'
    path = os.path.join(test_file_path, file)

    target_columns = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']
    task = Task(TaskTypesEnum.regression)
    data = InputData.from_csv(path, target_columns=target_columns,
                              columns_to_drop=['date'], task=task)
    train, test = train_test_data_setup(data)
    return train, test
def test_eval_strategy_logreg(data_setup):
    data_set = data_setup
    train, test = train_test_data_setup(data=data_set)
    test_skl_model = LogisticRegression(C=10., random_state=1,
                                        solver='liblinear',
                                        max_iter=10000, verbose=0)
    test_skl_model.fit(train.features, train.target)
    expected_result = test_skl_model.predict(test.features)

    test_model_node = PrimaryNode(operation_type='logit')
    test_model_node.fit(input_data=train)
    actual_result = test_model_node.predict(input_data=test)

    assert len(actual_result.predict) == len(expected_result)
def test_multi_times_analyze_analyze(analyze_method):
    # given
    pipeline, train_data, test_data, node_index, result_dir = given_data()
    test_data, valid_data = train_test_data_setup(test_data, split_ratio=0.5)

    # when
    analyze_result = MultiTimesAnalyze(pipeline=pipeline,
                                       train_data=train_data,
                                       test_data=test_data,
                                       valid_data=valid_data,
                                       case_name='test_case_name',
                                       path_to_save=result_dir).analyze()

    # then
    assert type(analyze_result) is float
    assert analyze_method.called
def test_pca_model_removes_redunant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redunant_features(n_samples=1000,
                                                         n_features=100,
                                                         n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling pipeline: fit it and get the scaled data
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    pca = DataOperation(operation_type='pca')
    _, train_predicted = pca.fit(data=scaled_data)
    transformed_features = train_predicted.predict

    assert transformed_features.shape[1] < data.features.shape[1]
def test_svc_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling pipeline: fit it and get the scaled data
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    svc = Model(operation_type='svc')
    _, train_predicted = svc.fit(data=scaled_data)

    roc_on_train = get_roc_auc(valid_data=train_data,
                               predicted_data=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_logger_manager_keeps_loggers_correctly():
    LogManager().clear_cache()

    pipeline = create_four_depth_pipeline()
    expected_number_of_loggers = 5

    file = os.path.join('../data', 'advanced_classification.csv')
    test_file_path = str(os.path.dirname(__file__))
    data = InputData.from_csv(os.path.join(test_file_path, file))
    train_data, _ = train_test_data_setup(data=data)

    pipeline.fit(train_data)

    actual_number_of_loggers = LogManager().debug['loggers_number']
    assert actual_number_of_loggers == expected_number_of_loggers
def test_pipeline_with_clusters_fit_correct():
    mean_roc_on_test = 0

    # mean ROC AUC is analysed because of stochastic clustering
    for _ in range(5):
        data = get_synthetic_input_data(n_samples=10000)

        pipeline = generate_pipeline()
        train_data, test_data = train_test_data_setup(data)

        pipeline.fit(input_data=train_data)
        _, roc_on_test = get_roc_auc_value(pipeline, train_data, test_data)
        mean_roc_on_test = np.mean([mean_roc_on_test, roc_on_test])

    roc_threshold = 0.5
    assert mean_roc_on_test > roc_threshold
def fit_predict_one_fold(data, pipeline):
    """ Simple strategy for model evaluation based on a single fold check

    :param data: InputData for validation
    :param pipeline: Pipeline to validate
    """
    # Train test split
    train_input, predict_input = train_test_data_setup(data)
    test_target = np.array(predict_input.target)

    pipeline.fit_from_scratch(train_input)
    predicted_output = pipeline.predict(predict_input)
    predictions = np.array(predicted_output.predict)

    return test_target, predictions
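
# Hedged usage sketch for fit_predict_one_fold (not original code): scores a
# pipeline on the hold-out fold with MAE, mirroring the metric used in
# run_multi_output_case above.
def example_one_fold_validation(data, pipeline):
    test_target, predictions = fit_predict_one_fold(data, pipeline)
    mae_value = mean_absolute_error(np.ravel(test_target), np.ravel(predictions))
    print(f'One-fold MAE: {mae_value:.3f}')
    return mae_value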
def data_setup():
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    np.random.shuffle(predictors)
    np.random.shuffle(response)
    response = response[:100]
    predictors = predictors[:100]

    # Wrap data into InputData
    input_data = InputData(features=predictors,
                           target=response,
                           idx=np.arange(0, len(predictors)),
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    # Train test split
    train_data, test_data = train_test_data_setup(input_data)
    return train_data, test_data
def test_model_predictions_on_train_test_random():
    """Checks that the model cannot predict correctly on random train and test
    datasets and that the ROC AUC scores are close to 0.5.
    Both train and test data have no relation between features and target."""
    data = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)
    data = get_random_target_data(data)

    train_data, test_data = train_test_data_setup(data)

    pipeline = generate_pipeline()
    pipeline.fit(input_data=train_data)
    roc_auc_value_train, roc_auc_value_test = get_roc_auc_value(pipeline, train_data, test_data)
    train_auc_thr = get_auc_threshold(roc_auc_value_train)
    test_auc_thr = get_auc_threshold(roc_auc_value_test)

    assert test_auc_thr <= CORRECT_MODEL_AUC_THR
    assert train_auc_thr <= CORRECT_MODEL_AUC_THR