def run_refinement_scoring_example(train_path, test_path, with_tuning=False):
    """ Compare a pipeline without refinement against a refinement
    (decomposition) pipeline on a classification task

    Both pipelines are fitted on the training sample and their ROC AUC is
    displayed for the test sample; optionally the same is repeated after tuning.

    :param train_path: path to the csv file with training sample
    :param test_path: path to the csv file with test sample
    :param with_tuning: when truthy, tune both pipelines on the training
        sample and display the metrics once more
    """
    clf_task = Task(TaskTypesEnum.classification)
    train_sample = InputData.from_csv(train_path, task=clf_task)
    test_sample = InputData.from_csv(test_path, task=clf_task)

    # Each entry: (pipeline, label before tuning, label after tuning)
    candidates = (
        (get_non_refinement_pipeline(),
         'Non decomposition pipeline',
         'Non decomposition pipeline after tuning'),
        (get_refinement_pipeline(),
         'With decomposition pipeline',
         'With decomposition pipeline after tuning'),
    )

    # Fit everything first, then report, so the phases stay separated
    for pipeline, _, _ in candidates:
        pipeline.fit(train_sample)
    for pipeline, label, _ in candidates:
        display_roc_auc(pipeline, test_sample, label)

    if not with_tuning:
        return

    for pipeline, _, _ in candidates:
        pipeline.fine_tune_all_nodes(loss_function=roc_auc,
                                     loss_params=None,
                                     input_data=train_sample,
                                     iterations=30)
    for pipeline, _, tuned_label in candidates:
        display_roc_auc(pipeline, test_sample, tuned_label)
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """ Fit an sklearn pipeline exported from TPOT and a FEDOT pipeline on the
    same data and print the ROC AUC of each on the test sample

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :return roc_auc_value: ROC AUC obtained by the FEDOT pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    expected = test_data.target

    # TPOT part. Original export note: average CV score on the training set was 0.93755
    tpot_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier(),
    )
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(tpot_pipeline.steps, 'random_state', 1)
    tpot_pipeline.fit(train_data.features, train_data.target)

    tpot_scores = tpot_pipeline.predict_proba(test_data.features)[:, 1]
    roc_auc_value = roc_auc(y_true=expected, y_score=tpot_scores)
    print(f'ROC AUC for TPOT: {roc_auc_value}')

    # FEDOT part: scaling -> bernb, then rf on top of both bernb and scaling
    scaling = PrimaryNode('scaling')
    bernb = SecondaryNode('bernb', nodes_from=[scaling])
    root = SecondaryNode('rf', nodes_from=[bernb, scaling])

    pipeline = Pipeline(root)
    pipeline.fit(train_data)
    fedot_output = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=expected, y_score=fedot_output.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
def test_target_data_from_csv_correct():
    """ Check both ways of passing target columns to "from_csv":
    a single column name and a list of column names
    """
    tests_dir = str(os.path.dirname(__file__))
    csv_path = os.path.join(tests_dir, '../../data/multi_target_sample.csv')
    regression_task = Task(TaskTypesEnum.regression)

    # Case 1: target given as one column name
    single_target = InputData.from_csv(csv_path,
                                       target_columns='1_day',
                                       columns_to_drop=['date'],
                                       task=regression_task)

    # Case 2: target given as a list of seven column names
    week_columns = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']
    multi_target = InputData.from_csv(csv_path,
                                      target_columns=week_columns,
                                      columns_to_drop=['date'],
                                      task=regression_task)

    assert single_target.target.shape == (499, 1)
    assert multi_target.target.shape == (499, 7)
def test_with_custom_target():
    """ Check that a csv with a custom target column and a redundant column
    yields the same features/target as the reference file only when the
    target column and the column to drop are specified explicitly
    """
    base_dir = str(os.path.dirname(__file__))
    reference_path = os.path.join(base_dir, '../../data/simple_classification.csv')
    custom_path = os.path.join(base_dir, '../../data/simple_classification_with_custom_target.csv')

    reference = InputData.from_csv(reference_path)
    expected_features = reference.features
    expected_target = reference.target

    # Default column handling: the result is expected to differ from the reference
    loaded_by_default = InputData.from_csv(custom_path, delimiter=';')
    assert not np.array_equal(expected_features, loaded_by_default.features)
    assert not np.array_equal(expected_target, loaded_by_default.target)

    # Explicit target and dropped column: the result is expected to match
    loaded_explicitly = InputData.from_csv(custom_path,
                                           delimiter=';',
                                           columns_to_drop=['redundant'],
                                           target_columns='custom_target')
    assert np.array_equal(expected_features, loaded_explicitly.features)
    assert np.array_equal(expected_target, loaded_explicitly.target)
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Fit a chain with a "tpot" node and print its ROC AUC on the test sample

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: time budget passed to the "tpot" node as
        'max_run_time_sec' (whole seconds)
    :return roc_auc_value: ROC AUC metric for the chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    # timedelta.seconds is only the within-day remainder (days are dropped),
    # so total_seconds() is used to honour budgets of one day or longer.
    node_tpot.model.params = {'max_run_time_sec': int(max_run_time.total_seconds())}

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def get_case_train_test_data():
    """ Read the scoring case csv files and return them as a (train, test) pair """
    train_csv, test_csv = get_scoring_case_data_paths()
    return InputData.from_csv(train_csv), InputData.from_csv(test_csv)
def run_autokeras(params: 'ExecutionParams'):
    """ Fit an AutoKeras structured-data estimator and predict on the test sample

    :param params: object providing train_file, test_file and task
    :return: tuple (test target, predictions)
    """
    train_csv = params.train_file
    test_csv = params.test_file
    ml_task = params.task

    settings = get_models_hyperparameters()['autokeras']
    trials_limit = settings['MAX_TRIAL']
    epochs_count = settings['EPOCH']

    train_data = InputData.from_csv(train_csv)
    test_data = InputData.from_csv(test_csv)

    # TODO Save model to file

    # Classification gets the classifier; any other task falls back to the regressor
    is_classification = ml_task == TaskTypesEnum.classification
    estimator_cls = ak.StructuredDataClassifier if is_classification else ak.StructuredDataRegressor

    model = estimator_cls(max_trials=trials_limit)
    model.fit(train_data.features, train_data.target, epochs=epochs_count)

    predicted = model.predict(test_data.features)
    return test_data.target, predicted
def run_h2o(params: 'ExecutionParams'):
    """ Fit an H2O model (unless one was already exported) and predict on the test sample

    The exported model is stored in CURRENT_PATH under a name built from the
    case label, the configured limits and the task name; if an entry with that
    name already exists there, fitting is skipped.

    :param params: object providing train_file, test_file, case_label and task
    :return: tuple (test target, predictions)
    """
    train_csv = params.train_file
    test_csv = params.test_file
    label = params.case_label
    ml_task = params.task

    h2o_settings = get_models_hyperparameters()['H2O']
    max_models = h2o_settings['MAX_MODELS']
    max_runtime_secs = h2o_settings['MAX_RUNTIME_SECS']

    model_name = f'{label}_m{max_models}_rs{max_runtime_secs}_{ml_task.name}'
    model_path = os.path.join(CURRENT_PATH, model_name)

    # TODO Regression
    already_exported = model_name in os.listdir(CURRENT_PATH)
    if not already_exported:
        train_data = InputData.from_csv(train_csv)
        # The configured limit is in seconds; fit_h2o receives it as rounded minutes
        fitted = fit_h2o(train_data, round(max_runtime_secs / 60))
        saved_as = h2o.save_model(model=fitted, path=CURRENT_PATH)

        # Move the saved model to the predictable name used for the lookup above
        os.renames(saved_as, model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    loaded_model = h2o.load_model(model_path)

    test_frame = InputData.from_csv(test_csv)
    true_target = test_frame.target

    predicted = predict_h2o(loaded_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
def run_pipeline_from_automl(train_file_path: str, test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model
        (passed to the node as whole seconds)

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # timedelta.seconds is only the within-day remainder (days are dropped),
    # so total_seconds() is used to honour budgets of one day or longer.
    node_tpot.operation.params = {'max_run_time_sec': int(max_run_time.total_seconds())}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])

    # Switch the 'model' repository to the AutoML one before building the pipeline
    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_xgboost(params: 'ExecutionParams'):
    """ Fit an XGBoost model for the given task and predict on the test sample

    :param params: object providing train_file, test_file and task
    :return: tuple (test target, predictions, predicted labels); for
        classification the predictions are the second column of predict_proba,
        for regression the labels are None
    :raises NotImplementedError: for any task other than classification or regression
    """
    train_csv = params.train_file
    test_csv = params.test_file
    ml_task = params.task

    train_data = InputData.from_csv(train_csv)
    test_data = InputData.from_csv(test_csv)

    if ml_task == TaskTypesEnum.classification:
        classifier = xgb.XGBClassifier(max_depth=2, learning_rate=1.0, objective='binary:logistic')
        classifier.fit(train_data.features, train_data.target)
        scores = classifier.predict_proba(test_data.features)[:, 1]
        labels = classifier.predict(test_data.features)
        return test_data.target, scores, labels

    if ml_task == TaskTypesEnum.regression:
        regressor = xgb.XGBRegressor(max_depth=3, learning_rate=0.3, n_estimators=300,
                                     objective='reg:squarederror')
        regressor.fit(train_data.features, train_data.target)
        values = regressor.predict(test_data.features)
        return test_data.target, values, None

    raise NotImplementedError()
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run chain with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model
        (passed to the node as whole seconds)

    :return roc_auc_value: ROC AUC metric for chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # timedelta.seconds is only the within-day remainder (days are dropped),
    # so total_seconds() is used to honour budgets of one day or longer.
    node_tpot.operation.params = {'max_run_time_sec': int(max_run_time.total_seconds())}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def get_scoring_data() -> Tuple[InputData, InputData]:
    """ Read the scoring train and test csv files located under the project root

    :return: tuple (train data, test data)
    """
    train_csv = f'{fedot_project_root()}/cases/data/scoring/scoring_train.csv'
    test_csv = f'{fedot_project_root()}/cases/data/scoring/scoring_test.csv'

    return InputData.from_csv(train_csv), InputData.from_csv(test_csv)
def get_scoring_data():
    """ Read the scoring train and test csv files as classification data

    :return: tuple (train data, test data)
    """
    root = str(project_root())
    train_csv = join(root, 'cases/data/scoring/scoring_train.csv')

    # a dataset for a final validation of the composed model
    test_csv = join(str(project_root()), 'cases/data/scoring/scoring_test.csv')

    clf_task = Task(TaskTypesEnum.classification)
    train_part = InputData.from_csv(train_csv, task=clf_task)
    test_part = InputData.from_csv(test_csv, task=clf_task)

    return train_part, test_part
def file_data_setup():
    """ Read the simple classification csv and replace its idx with the
    result of _to_numerical

    :return: the prepared InputData object
    """
    here = str(os.path.dirname(__file__))
    csv_path = os.path.join(here, '../../data/simple_classification.csv')

    data = InputData.from_csv(csv_path)
    data.idx = _to_numerical(categorical_ids=data.idx)
    return data
def apply_model_to_data(model: Chain, data_path: str):
    """ Apply a model to data taken from an excel file and attach the forecast

    :param model: model used for prediction
    :param data_path: path passed to create_multi_clf_examples_from_excel
    :return: the dataframe returned by that helper, extended with a 'forecast' column
    """
    frame, csv_path = create_multi_clf_examples_from_excel(data_path, return_df=True)

    # No target column is specified: the data is only used for prediction
    unlabeled = InputData.from_csv(csv_path, target_column=None)
    prediction = model.predict(unlabeled)

    frame['forecast'] = probs_to_labels(prediction.predict)
    return frame
def get_model(train_file_path: str, cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    """ Compose a classification chain with the GP composer and fit it

    :param train_file_path: path to the csv file with data for train
    :param cur_lead_time: value passed to the composer requirements as max_lead_time
    :return: the composed chain fitted on the training data
    """
    clf_task = Task(task_type=TaskTypesEnum.classification)
    train_dataset = InputData.from_csv(train_file_path, task=clf_task)

    # the search of the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    repository = ModelTypesRepository()
    candidate_models, _ = repository.suitable_model(task_type=clf_task.task_type,
                                                    tags=['simple'])

    metrics_repository = MetricsRepository()
    metric = metrics_repository.metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    requirements = GPComposerRequirements(primary=candidate_models,
                                          secondary=candidate_models,
                                          max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer, that allow to find
    # the optimal structure of the composite model
    builder = GPComposerBuilder(clf_task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(metric)
    composer = builder.build()

    # run the search of best suitable model
    composed_chain = composer.compose_chain(data=train_dataset, is_visualise=False)
    composed_chain.fit(input_data=train_dataset)

    return composed_chain
def run_multi_output_case(path, vis=False):
    """ Launch the multi-output regression case (river levels prediction on
    the Lena river) and print the MAE on the test part

    :param path: path to the file with table
    :param vis: is it needed to visualise pipeline and predictions
    """
    week_targets = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']

    full_data = InputData.from_csv(path,
                                   target_columns=week_targets,
                                   columns_to_drop=['date'])
    train_part, test_part = train_test_data_setup(full_data)

    fedot_model = Fedot(problem='regression')
    fedot_model.fit(features=train_part)
    predicted_array = fedot_model.predict(features=test_part)

    # Flatten both forecast and actual values before computing the metric
    flat_forecast = np.ravel(predicted_array)
    flat_actual = np.ravel(test_part.target)
    mae_value = mean_absolute_error(flat_actual, flat_forecast)
    print(f'MAE - {mae_value:.2f}')

    if vis:
        plot_predictions(predicted_array, test_part)
def get_kc2_data():
    """ Read the kc2 csv as classification data and split it

    :return: tuple (train, test) produced by train_test_data_setup
    """
    csv_path = join(str(project_root()), 'cases/data/kc2/kc2.csv')
    dataset = InputData.from_csv(csv_path, task=Task(TaskTypesEnum.classification))

    train_part, test_part = train_test_data_setup(dataset)
    return train_part, test_part
def create_fitted_chain() -> Chain:
    """ Build a chain with create_chain and fit it on the scoring case train data

    :return: the fitted chain
    """
    # Only the train path is needed here; the test path is ignored
    train_csv, _ = get_scoring_case_data_paths()
    training_sample = InputData.from_csv(train_csv)

    fitted = create_chain()
    fitted.fit(training_sample)
    return fitted
def get_cholesterol_data():
    """ Read the cholesterol csv as regression data and split it

    :return: tuple (train, test) produced by train_test_data_setup
    """
    csv_path = join(str(project_root()), 'cases/data/cholesterol/cholesterol.csv')
    dataset = InputData.from_csv(csv_path, task=Task(TaskTypesEnum.regression))

    train_part, test_part = train_test_data_setup(dataset)
    return train_part, test_part
def run_tpot(params: 'ExecutionParams'):
    """ Fit a TPOT model (unless a dumped one already exists) and predict on the test sample

    The fitted sklearn pipeline is dumped next to this file under a name built
    from the case label, generations, population size and task name; when a
    file with that name is already there, fitting is skipped and it is loaded.

    :param params: object providing train_file, test_file, case_label and task
    :return: tuple (test target, predictions, predicted labels)
    :raises NotImplementedError: for any task other than regression or classification
    """
    train_csv = params.train_file
    test_csv = params.test_file
    label = params.case_label
    ml_task = params.task

    tpot_settings = get_models_hyperparameters()['TPOT']
    generations = tpot_settings['GENERATIONS']
    population_size = tpot_settings['POPULATION_SIZE']

    # Common stem shared by the dumped model and the exported pipeline script
    model_stem = f'{label}_g{generations}_p{population_size}_{ml_task.name}'
    dump_name = f'{model_stem}.pkl'

    module_dir = str(os.path.dirname(__file__))
    dump_path = os.path.join(module_dir, dump_name)

    train_data = InputData.from_csv(train_csv, task=Task(ml_task))

    if dump_name not in os.listdir(module_dir):
        # TODO change hyperparameters to actual from variable
        tpot_model = fit_tpot(train_data, tpot_settings['MAX_RUNTIME_MINS'])

        tpot_model.export(output_file_name=f'{model_stem}_pipeline.py')

        # sklearn pipeline object
        joblib.dump(tpot_model.fitted_pipeline_, dump_path, compress=1)

    imported_model = joblib.load(dump_path)

    predict_data = InputData.from_csv(test_csv, task=Task(ml_task))
    true_target = predict_data.target

    if ml_task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
        # For regression the labels are the predictions themselves
        predicted_labels = predicted
    elif ml_task == TaskTypesEnum.classification:
        predicted, predicted_labels = predict_tpot_class(imported_model, predict_data)
    else:
        print('Incorrect type of ml task')
        raise NotImplementedError()

    print(f'BEST_model: {imported_model}')

    return true_target, predicted, predicted_labels
def create_fitted_pipeline() -> Pipeline:
    """ Build a pipeline with create_pipeline and fit it on the scoring case train data

    :return: the fitted pipeline
    """
    # Only the train path is needed here; the test path is ignored
    train_csv, _ = get_scoring_case_data_paths()
    training_sample = InputData.from_csv(train_csv)

    fitted = create_pipeline()
    fitted.fit(training_sample)
    return fitted
def test_string_features_from_csv():
    """ Check that features read from the csv with categorical columns
    are a float array containing only finite values
    """
    base_dir = str(os.path.dirname(__file__))
    csv_path = os.path.join(base_dir, '../../data/classification_with_categorical.csv')

    loaded = InputData.from_csv(csv_path)
    features = loaded.features

    assert features.dtype == float
    assert np.isfinite(features).all()
def test_fitted_pipeline_cache_correctness_after_export_and_import():
    """ Check that a fitted pipeline gives the same prediction after being
    saved and loaded back, and that the loaded pipeline reports itself as fitted
    """
    train_csv, test_csv = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_csv)
    test_data = InputData.from_csv(test_csv)

    original = create_classification_pipeline_with_preprocessing()
    original.fit(train_data)
    original.save('test_fitted_pipeline_cache_correctness_after_export_and_import')
    expected = original.predict(test_data)

    restored = Pipeline()
    saved_path = create_correct_path('test_fitted_pipeline_cache_correctness_after_export_and_import')
    restored.load(saved_path)
    actual = restored.predict(test_data)

    assert np.array_equal(expected.predict, actual.predict)
    assert restored.is_fitted
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               gp_optimiser_params: Optional[GPChainOptimiserParameters] = None,
                               pop_size=None, generations=None):
    """ Compose a chain for the credit scoring case with the GP composer,
    fit it and print its ROC AUC on the validation sample

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_lead_time: value passed to the composer requirements as max_lead_time
    :param gp_optimiser_params: optimiser parameters; when not given (or falsy),
        a default set is created inside the function
    :param pop_size: value passed to the composer requirements as pop_size
    :param generations: value passed to the composer requirements as num_of_generations
    :return: tuple (validation ROC AUC, composed chain, composer)
    """
    dataset_to_compose = InputData.from_csv(train_file_path)
    dataset_to_validate = InputData.from_csv(test_file_path)

    models_repository = ModelTypesRepository()
    candidate_models, _ = models_repository.suitable_model(task_type=TaskTypesEnum.classification)

    # the choice of the metric for the chain quality assessment during composition
    metric = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    # Fall back to default optimiser parameters when none are supplied
    optimiser_parameters = gp_optimiser_params or GPChainOptimiserParameters(
        selection_types=[SelectionTypesEnum.tournament],
        crossover_types=[CrossoverTypesEnum.subtree],
        mutation_types=[MutationTypesEnum.simple,
                        MutationTypesEnum.growth,
                        MutationTypesEnum.reduce],
        regularization_type=RegularizationTypesEnum.decremental)

    requirements = GPComposerRequirements(
        primary=candidate_models,
        secondary=candidate_models,
        max_arity=4,
        max_depth=3,
        pop_size=pop_size,
        num_of_generations=generations,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    composed_chain = composer.compose_chain(data=dataset_to_compose,
                                            initial_chain=None,
                                            composer_requirements=requirements,
                                            metrics=metric,
                                            optimiser_parameters=optimiser_parameters,
                                            is_visualise=False)
    composed_chain.fit(input_data=dataset_to_compose, verbose=True)

    validation_roc = calculate_validation_metric(composed_chain, dataset_to_validate)
    print(f'Composed ROC AUC is {round(validation_roc, 3)}')

    return validation_roc, composed_chain, composer
def validate_model_quality(model: Pipeline, data_path: str):
    """ Compute the multi-class ROC AUC of a model on data read from a csv file

    :param model: fitted model used for prediction
    :param data_path: path to the csv file with data for validation
    :return roc_auc_valid: macro-averaged one-vs-one ROC AUC rounded to 3 digits
    """
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    # The true labels must come from the dataset loaded above; the original
    # referenced 'test_data', a name that is not defined in this function.
    roc_auc_valid = round(roc_auc(y_true=dataset_to_validate.target,
                                  y_score=predicted_labels,
                                  multi_class='ovo',
                                  average='macro'), 3)
    return roc_auc_valid
def test_evaluate_individuals():
    """ Check evaluate_individuals under two timeouts: with a very small
    timeout one evaluated individual is expected to remain, with a five
    minute timeout all four individuals are expected to be evaluated
    """
    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    clf_task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=clf_task)
    candidate_operations, _ = OperationTypesRepository().suitable_operation(
        task_type=clf_task.task_type)

    requirements = GPComposerRequirements(primary=candidate_operations,
                                          secondary=candidate_operations)

    builder = GPComposerBuilder(task=clf_task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(ClassificationMetricsEnum.ROCAUC_penalty)
    composer = builder.build()

    pipelines_to_evaluate = [pipeline_first(), pipeline_second(),
                             pipeline_third(), pipeline_fourth()]

    split_ratio = sample_split_ratio_for_tasks[dataset_to_compose.task.task_type]
    train_data, test_data = train_test_data_setup(dataset_to_compose, split_ratio)

    objective = partial(composer.composer_metric, composer.metrics, train_data, test_data)
    adapter = PipelineAdapter()

    # First run: the timeout is tiny
    population = [Individual(adapter.adapt(candidate)) for candidate in pipelines_to_evaluate]
    short_timeout = datetime.timedelta(minutes=0.001)
    params = GraphGenerationParams(adapter=PipelineAdapter(), advisor=PipelineChangeAdvisor())
    with OptimisationTimer(timeout=short_timeout) as timer:
        evaluate_individuals(individuals_set=population,
                             objective_function=objective,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=timer)
    assert len(population) == 1
    assert population[0].fitness is not None

    # Second run: a fresh population and a generous timeout
    population = [Individual(adapter.adapt(candidate)) for candidate in pipelines_to_evaluate]
    long_timeout = datetime.timedelta(minutes=5)
    with OptimisationTimer(timeout=long_timeout) as timer:
        evaluate_individuals(individuals_set=population,
                             objective_function=objective,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=timer)
    assert len(population) == 4
    assert all(individual.fitness is not None for individual in population)
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """ Fit an sklearn pipeline exported from TPOT and a FEDOT chain on the
    same data and print the ROC AUC of each on the test sample

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :return roc_auc_value: ROC AUC obtained by the FEDOT chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    expected = test_data.target

    # TPOT part. Original export note: average CV score on the training set was 0.93755
    tpot_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier(),
    )
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(tpot_pipeline.steps, 'random_state', 1)
    tpot_pipeline.fit(train_data.features, train_data.target)

    tpot_scores = tpot_pipeline.predict_proba(test_data.features)[:, 1]
    roc_auc_value = roc_auc(y_true=expected, y_score=tpot_scores)
    print(roc_auc_value)

    # FEDOT part: 'rf' fed by 'direct_data_model' and 'bernb'
    chain = Chain()
    direct_node = PrimaryNode('direct_data_model')
    bernb_node = PrimaryNode('bernb')
    root_node = SecondaryNode('rf')
    for parent in (direct_node, bernb_node):
        root_node.nodes_from.append(parent)
    chain.add_node(root_node)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=expected, y_score=chain_output.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=32,
                                     is_visualise=False):
    """ Fit a simple linear chain and a composite chain for time series
    forecasting and print the validation RMSE of each

    :param train_file_path: path to the train csv, relative to the project root
    :param test_file_path: path to the validation csv, relative to the project root
    :param forecast_length: forecast length passed to the task parameters
    :param max_window_size: max window size passed to the task parameters
    :param is_visualise: forwarded to calculate_validation_metric
    :return: validation RMSE of the simple chain (the composite one is only printed)
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    train_csv = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(train_csv, task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    test_csv = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(test_csv, task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    # Baseline: a chain made of a single 'linear' node
    simple_chain = TsForecastingChain(PrimaryNode('linear'))
    simple_chain.fit(input_data=dataset_to_train, verbose=False)
    simple_prediction = simple_chain.predict(dataset_to_validate)
    rmse_on_valid_simple = calculate_validation_metric(simple_prediction,
                                                       dataset_to_validate,
                                                       f'full-simple_{forecast_length}',
                                                       is_visualise=is_visualise)
    print(f'RMSE simple: {rmse_on_valid_simple}')

    # Composite chain built by get_composite_chain
    composite_chain = get_composite_chain()
    composite_chain.fit(input_data=dataset_to_train, verbose=False)
    composite_prediction = composite_chain.predict(dataset_to_validate)
    rmse_on_valid_composite = calculate_validation_metric(composite_prediction,
                                                          dataset_to_validate,
                                                          f'full-lstm-only_{forecast_length}',
                                                          is_visualise=is_visualise)
    print(f'RMSE LSTM composite: {rmse_on_valid_composite}')

    return rmse_on_valid_simple
def test_data_model_type_classification_chain_fit():
    """ Check that the template of a fitted and saved classification chain
    holds as many operation templates as the chain has nodes
    """
    # Only the train path is needed here; the test path is ignored
    train_csv, _ = get_scoring_case_data_paths()
    training_sample = InputData.from_csv(train_csv)

    chain = create_classification_chain_with_preprocessing()
    chain.fit(training_sample)
    chain.save('data_model_classification')

    nodes_in_chain = len(chain.nodes)
    templates_in_chain = len(ChainTemplate(chain).operation_templates)

    assert templates_in_chain == nodes_in_chain